anomaly-pipeline 0.1.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2 @@
+ from .main import timeseries_anomaly_detection
+ from .helpers import help_info
@@ -0,0 +1,188 @@
+ import pandas as pd
+ import numpy as np
+ import sklearn
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.cluster import DBSCAN
+ from sklearn.neighbors import NearestNeighbors
+ from sklearn.ensemble import IsolationForest
+ from statsmodels.tsa.stattools import acf
+
+
+ def get_dynamic_lags(series: pd.Series) -> list:
+
+     n = len(series)
+
+     # Determine max lags (min of 50% of the data and a hard cap of 60)
+     nlags = min(int(n * 0.5), 60)
+
+     if nlags < 5:
+         return [1, 2, 3]
+
+     # Calculate the ACF and confidence intervals, then keep the 10 most significant lags
+     autocorrelations, confint = acf(series.dropna(), nlags=nlags, alpha=0.25, fft=True)
+     autocorr_values = autocorrelations[1:]
+     conf_limit = confint[1:, 1] - autocorr_values
+     is_significant = np.abs(autocorr_values) > conf_limit
+     significant_autocorr = autocorr_values[is_significant]
+     significant_lags_indices = np.where(is_significant)[0] + 1
+     ranked_indices = np.argsort(np.abs(significant_autocorr))[::-1]
+     top_lags_indices = ranked_indices[:10]
+     top_lags = significant_lags_indices[top_lags_indices].tolist()
+     base_lags = [1, 2, 3]
+     dynamic_lags = sorted(set(base_lags + top_lags))[:10]
+
+     return dynamic_lags
+
+
+ def find_optimal_epsilon(X_scaled: np.ndarray, k: int) -> float:
+     """
+     Finds an epsilon for DBSCAN by calculating the distance to the k-th nearest neighbor
+     for every point and taking a high percentile (the 95th) of those distances as the cutoff.
+     This serves as a programmatic proxy for the 'elbow' method inside a rolling window.
+     """
+     if len(X_scaled) < k:
+         return 1.0  # Fallback for very short windows
+
+     # Find the distance to the k-th (min_samples) neighbor for every point.
+     # n_neighbors is k + 1 because the first distance is 0 (each point to itself).
+     neigh = NearestNeighbors(n_neighbors=k + 1)
+     neigh.fit(X_scaled)
+
+     # distances matrix: [n_samples, k + 1]
+     distances, _ = neigh.kneighbors(X_scaled)
+
+     # The distance to the k-th neighbor (index k) is the radius a point needs
+     # in order to fall inside a core point's neighborhood.
+     k_distances = distances[:, k]
+
+     # The elbow is hard to find programmatically. A robust proxy for the density
+     # threshold is a high percentile (here the 95th) of the k-distances: epsilon is
+     # set so that 95% of the training points fall inside a cluster neighborhood.
+     optimal_eps = np.percentile(k_distances, 95)
+
+     # Ensure a minimum value if the data is extremely sparse
+     return max(optimal_eps, 0.1)
+
+
+ def detect_time_series_anomalies_dbscan(
+     group,
+     variable,
+     date_column,
+     eval_period,
+ ):
+
+     group[date_column] = pd.to_datetime(group[date_column])
+     group = group.copy().sort_values(date_column).reset_index(drop=True)
+
+     # --- Default DBSCAN parameters ---
+     # These usually need tuning, but this is a reasonable starting point:
+     DEFAULT_EPS = 0.5  # Neighborhood radius (critical parameter)
+
+     try:
+         test_anom = []
+
+         for t in range(eval_period - 1, -1, -1):
+
+             try:
+                 # Boundary between the rolling train region and the rolling forecast region
+                 cutoff_date = group[date_column].max() - pd.Timedelta(weeks=t)
+
+                 # Get the training set (strictly before the cutoff) to determine lags
+                 model_group = group.copy()
+                 train = model_group[model_group[date_column] < cutoff_date].copy()
+                 lags = get_dynamic_lags(train[variable])
+
+                 # Create lag features and rolling stats for the entire DataFrame
+                 rolling_stats_features = []
+                 for lag in lags:
+                     model_group[f'lag{lag}'] = model_group[variable].shift(lag)
+
+                 for w in [int(np.ceil(max(lags) / 4)), int(np.ceil(max(lags) / 2)), int(max(lags))]:
+                     if w >= 3:
+                         rolling_stats_features.extend([f'roll_mean_{w}', f'roll_std_{w}'])
+                         model_group[f'roll_mean_{w}'] = model_group[variable].shift(1).rolling(w).mean()
+                         model_group[f'roll_std_{w}'] = model_group[variable].shift(1).rolling(w).std()
+
+                 model_group['trend'] = group.index
+                 model_group = model_group.copy().dropna()
+
+                 # Split into train and test; the evaluated point is held out of training
+                 train = model_group[model_group[date_column] < cutoff_date].copy()
+                 test = model_group[model_group[date_column] == cutoff_date].copy()
+
+                 # Identify all model features (lags, rolling stats, trend, and the variable itself)
+                 features = [f'lag{i}' for i in lags] + rolling_stats_features + ['trend'] + [variable]
+
+                 # Fit the scaler ONLY on the training data to avoid data leakage
+                 scaler = StandardScaler()
+
+                 # Fit the scaler on the training features
+                 scaler.fit(train[features])
+
+                 # Transform both train and test sets
+                 train_scaled = scaler.transform(train[features])
+                 test_scaled = scaler.transform(test[features])
+
+                 # Determine min_samples from the dimension of the feature space
+                 min_samples = max(2 * len(features), 3)
+
+                 # Find the optimal epsilon
+                 calculated_eps = find_optimal_epsilon(train_scaled, k=min_samples)
+
+                 # --- DBSCAN MODEL ---
+                 dbscan_model = DBSCAN(
+                     eps=calculated_eps,
+                     min_samples=min_samples,
+                     n_jobs=-1
+                 )
+
+                 # Fit DBSCAN on the scaled training data
+                 dbscan_model.fit(train_scaled)
+
+                 # DBSCAN has no predict() for new points, so membership is approximated:
+                 # a test point is treated as noise if it would not fall inside an
+                 # eps-neighborhood containing at least min_samples training points.
+
+                 neigh = NearestNeighbors(n_neighbors=min_samples)
+                 neigh.fit(train_scaled)
+
+                 # Distance of the test point to its nearest neighbors in the train set
+                 distances, _ = neigh.kneighbors(test_scaled)
+
+                 # Anomaly check: if the distance to the min_samples-th neighbor exceeds eps, it is noise.
+                 # Use the distance to the k-th neighbor (index min_samples - 1).
+                 k_distance = distances[:, min_samples - 1]
+
+                 # Flag as anomaly if the k-distance is greater than the trained eps threshold
+                 test['dbscan_score'] = k_distance - calculated_eps
+                 test['is_DBSCAN_anomaly'] = k_distance > calculated_eps
+
+                 test = test[[variable, date_column, 'dbscan_score', 'is_DBSCAN_anomaly']]
+                 test_anom.append(test)
+
+             except Exception as e:
+                 print(f"Error in iteration {t}: {e}")
+                 continue
+
+         try:
+             test_anom = pd.concat(test_anom)
+             group = group.merge(test_anom[[variable, date_column, 'dbscan_score', 'is_DBSCAN_anomaly']], on=[variable, date_column], how='left')
+             # group["is_DBSCAN_anomaly"] = group["is_DBSCAN_anomaly"].fillna(False)
+         except Exception:
+             print("Error in DBSCAN process")
+             group['dbscan_score'] = np.nan
+             group["is_DBSCAN_anomaly"] = np.nan
+
+     except Exception as e:
+         # Fallback error handling: build a readable group identifier from any
+         # string-typed columns for the error message.
+         try:
+             group_id_cols = group.select_dtypes(include=['object', 'string']).columns.tolist()
+             group_id = " ".join(group[group_id_cols].reset_index(drop=True).iloc[0].astype(str).to_list())
+         except Exception:
+             group_id = "Unknown Group ID"
+         print(f'DBSCAN Anomaly Detection failed for {group_id}. Error: {e}')
+         group['dbscan_score'] = np.nan
+         group["is_DBSCAN_anomaly"] = np.nan
+
+     return group
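
For reference, a minimal self-contained sketch of the scoring idea used above: fit NearestNeighbors on scaled training data, take the 95th percentile of the training k-distances as eps, and flag a new point whose k-distance to the training set exceeds eps. The synthetic data, sizes, and column-free layout below are illustrative assumptions, not package code.

    # Illustrative k-distance scoring sketch on assumed synthetic data.
    import numpy as np
    from sklearn.preprocessing import StandardScaler
    from sklearn.neighbors import NearestNeighbors

    rng = np.random.default_rng(0)
    train = rng.normal(0.0, 1.0, size=(200, 3))           # dense "normal" history
    test = np.array([[0.2, -0.1, 0.3], [6.0, 6.0, 6.0]])  # one typical point, one outlier

    scaler = StandardScaler().fit(train)
    train_scaled = scaler.transform(train)
    test_scaled = scaler.transform(test)

    min_samples = 2 * train.shape[1]                      # 2 * n_features, as in the module
    neigh = NearestNeighbors(n_neighbors=min_samples + 1).fit(train_scaled)

    # eps proxy: 95th percentile of training k-distances (distance to the k-th neighbour)
    k_dist_train = neigh.kneighbors(train_scaled)[0][:, min_samples]
    eps = max(np.percentile(k_dist_train, 95), 0.1)

    # a test point whose k-distance exceeds eps would be left as noise by DBSCAN
    k_dist_test = neigh.kneighbors(test_scaled, n_neighbors=min_samples)[0][:, min_samples - 1]
    print(np.round(k_dist_test - eps, 3), k_dist_test > eps)   # score and anomaly flag
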
@@ -0,0 +1,71 @@
+ import pandas as pd
+ import numpy as np
+ from .Preprocessing import classify
+
+ def detect_outliers_iqr(group, variable, date_column, eval_period):
+     n = len(group)
+     if n < 10:
+         return pd.DataFrame(columns=group.columns)
+
+     group = group.copy()
+     # Explicitly ensure date_column is datetime right at the start
+     group[date_column] = pd.to_datetime(group[date_column])
+     train_size = n - eval_period
+
+     # --- 1. HANDLE TRAINING DATA (Initial Block) ---
+     # Calculate baseline IQR bounds from all data available before the eval period
+     initial_train = group[variable].iloc[:train_size]
+
+     q1 = initial_train.quantile(0.25)
+     q3 = initial_train.quantile(0.75)
+     iqr = q3 - q1
+
+     low = max(q1 - 1.5 * iqr, 0)
+     high = q3 + 1.5 * iqr
+
+     # Assign the initial bounds to the training rows
+     group.loc[group.index[:train_size], 'Q1'] = q1
+     group.loc[group.index[:train_size], 'Q3'] = q3
+     group.loc[group.index[:train_size], 'IQR'] = iqr
+     group.loc[group.index[:train_size], 'IQR_low'] = low
+     group.loc[group.index[:train_size], 'IQR_high'] = high
+     group.loc[group.index[:train_size], 'set'] = "TRAIN"
+     group.loc[group.index[:train_size], 'IQR_anomaly'] = group[variable].iloc[:train_size].apply(
+         lambda x: classify(x, low, high)
+     )
+     group.loc[group.index[:train_size], 'is_IQR_anomaly'] = (
+         (group[variable].iloc[:train_size] < low) |
+         (group[variable].iloc[:train_size] > high)
+     )
+
+     # --- 2. HANDLE EVALUATION DATA (Expanding Window) ---
+     # Iterate through the eval period, growing the training set one point at a time
+     for i in range(train_size, n):
+         # Data available up to (but not including) this point
+         current_train = group[variable].iloc[:i]
+
+         Q1 = current_train.quantile(0.25)
+         Q3 = current_train.quantile(0.75)
+         IQR = Q3 - Q1
+
+         lower_q = max(Q1 - 1.5 * IQR, 0)
+         upper_q = Q3 + 1.5 * IQR
+
+         # Test the current point i
+         current_val = group[variable].iloc[i]
+         group.iloc[i, group.columns.get_loc('Q1')] = Q1
+         group.iloc[i, group.columns.get_loc('Q3')] = Q3
+         group.iloc[i, group.columns.get_loc('IQR')] = IQR
+         group.iloc[i, group.columns.get_loc('IQR_low')] = lower_q
+         group.iloc[i, group.columns.get_loc('IQR_high')] = upper_q
+         group.iloc[i, group.columns.get_loc('set')] = "TEST"
+         group.iloc[i, group.columns.get_loc('IQR_anomaly')] = classify(current_val, lower_q, upper_q)
+         group.iloc[i, group.columns.get_loc('is_IQR_anomaly')] = (current_val < lower_q) or (current_val > upper_q)
+
+     # Cast the boolean column properly
+     group['is_IQR_anomaly'] = group['is_IQR_anomaly'].astype(bool)
+     # Final safety check: keep date_column as datetime
+     group[date_column] = pd.to_datetime(group[date_column])
+
+     return group
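
The evaluation loop above recomputes the Tukey fences from all history before each evaluated point. A toy, self-contained sketch of that expanding-window rule (series values and eval_period are assumptions):

    # Toy expanding-window IQR check; mirrors the logic above, not the package API.
    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(1)
    s = pd.Series(rng.normal(100, 5, 30))
    s.iloc[-1] = 170          # inject a spike in the evaluation window
    eval_period = 4

    for i in range(len(s) - eval_period, len(s)):
        history = s.iloc[:i]                       # everything before the point under test
        q1, q3 = history.quantile(0.25), history.quantile(0.75)
        iqr = q3 - q1
        low, high = max(q1 - 1.5 * iqr, 0), q3 + 1.5 * iqr
        flagged = s.iloc[i] < low or s.iloc[i] > high
        print(f"t={i}: value={s.iloc[i]:.1f} bounds=({low:.1f}, {high:.1f}) anomaly={flagged}")
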
@@ -0,0 +1,88 @@
+ import pandas as pd
+ import numpy as np
+ from .Preprocessing import classify
+
+
+ def detect_outliers_mad(group, variable, date_column, mad_threshold, mad_scale_factor, eval_period):
+     n = len(group)
+     if n < 10:
+         return pd.DataFrame(columns=group.columns)
+
+     group = group.copy()
+     # Explicitly ensure date_column is datetime right at the start
+     group[date_column] = pd.to_datetime(group[date_column])
+     train_size = n - eval_period
+
+     # Initialize columns to store the expanding-window metrics
+     group['Median'] = np.nan
+     group['MAD'] = np.nan
+     group['MAD_low'] = np.nan
+     group['MAD_high'] = np.nan
+     group['set'] = ""
+     group['is_MAD_anomaly'] = False
+
+     # --- 1. HANDLE TRAINING DATA (Initial Block) ---
+     initial_train = group[variable].iloc[:train_size]
+     median = initial_train.median()
+     mad = np.median(np.abs(initial_train - median))
+
+     if mad == 0:
+         lower_mad = median
+         upper_mad = median
+     else:
+         margin = mad_threshold * mad / mad_scale_factor
+         lower_mad = max(median - margin, 0)
+         upper_mad = median + margin
+
+     # Assign baseline values to the training block
+     train_idx = group.index[:train_size]
+     group.loc[train_idx, 'Median'] = median
+     group.loc[train_idx, 'MAD'] = mad
+     group.loc[train_idx, 'MAD_low'] = lower_mad
+     group.loc[train_idx, 'MAD_high'] = upper_mad
+     group.loc[train_idx, 'set'] = "TRAIN"
+     group.loc[train_idx, 'MAD_anomaly'] = group[variable].iloc[:train_size].apply(
+         lambda x: classify(x, lower_mad, upper_mad)
+     )
+     group.loc[train_idx, 'is_MAD_anomaly'] = (group[variable].iloc[:train_size] < lower_mad) | \
+                                              (group[variable].iloc[:train_size] > upper_mad)
+
+     # --- 2. HANDLE EVALUATION DATA (Expanding Window) ---
+     for i in range(train_size, n):
+         # Expanding window: use all data up to (but not including) the current point i
+         current_train = group[variable].iloc[:i]
+
+         curr_median = current_train.median()
+         curr_mad = np.median(np.abs(current_train - curr_median))
+
+         if curr_mad == 0:
+             lower_mad = curr_median
+             upper_mad = curr_median
+         else:
+             margin = mad_threshold * curr_mad / mad_scale_factor
+             lower_mad = max(curr_median - margin, 0)
+             upper_mad = curr_median + margin
+
+         # Test the current point i
+         current_val = group[variable].iloc[i]
+
+         group.iloc[i, group.columns.get_loc('Median')] = curr_median
+         group.iloc[i, group.columns.get_loc('MAD')] = curr_mad
+         group.iloc[i, group.columns.get_loc('MAD_low')] = lower_mad
+         group.iloc[i, group.columns.get_loc('MAD_high')] = upper_mad
+         group.iloc[i, group.columns.get_loc('set')] = "TEST"
+         group.iloc[i, group.columns.get_loc('MAD_anomaly')] = classify(current_val, lower_mad, upper_mad)
+         group.iloc[i, group.columns.get_loc('is_MAD_anomaly')] = (current_val < lower_mad) or (current_val > upper_mad)
+
+     group['is_MAD_anomaly'] = group['is_MAD_anomaly'].astype(bool)
+     # Final safety check: keep date_column as datetime
+     group[date_column] = pd.to_datetime(group[date_column])
+
+     return group
+
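
The `mad_threshold * mad / mad_scale_factor` margin above rescales the MAD into a standard-deviation equivalent when `mad_scale_factor` is about 0.6745 (the normal-distribution constant, since MAD ≈ 0.6745·σ for Gaussian data), so `mad_threshold` behaves like a robust z-score cutoff. A small illustrative sketch (data and parameter values assumed):

    # Robust-sigma intuition behind mad_threshold * mad / mad_scale_factor (toy data).
    import numpy as np

    rng = np.random.default_rng(2)
    x = rng.normal(50, 10, 5000)

    median = np.median(x)
    mad = np.median(np.abs(x - median))
    robust_sigma = mad / 0.6745            # MAD rescaled to a std-dev estimate (~10 here)

    mad_threshold = 2.5                    # plays the role of a z-score cutoff
    low = median - mad_threshold * robust_sigma
    high = median + mad_threshold * robust_sigma
    print(round(robust_sigma, 2), round(low, 1), round(high, 1))
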
@@ -0,0 +1,116 @@
+ import pandas as pd
+ import numpy as np
+
+ def classify(val, lower, upper):
+     if val < lower:
+         return 'low'
+     elif val > upper:
+         return 'high'
+     else:
+         return 'none'
+
+ def create_full_calendar_and_interpolate(
+     master_data,
+     group_columns,
+     variable,
+     date_column,
+     freq
+ ):
+     """
+     Creates a complete date range for each group, merges it with the master data,
+     marks missing rows, and fills missing values using linear interpolation.
+
+     Parameters
+     ----------
+     master_data : pd.DataFrame
+     group_columns : list
+         One or more columns that define a group.
+     variable : str
+         Column used to detect missing records.
+     date_column : str
+         Name of the date column (must be datetime-like).
+     freq : str
+         Frequency for calendar generation (e.g. 'W-MON' for weekly Mondays).
+     """
+
+     # Ensure datetime
+     master_data[date_column] = pd.to_datetime(master_data[date_column])
+
+     full_group_data = []
+
+     for group_key, group in master_data.groupby(group_columns):
+
+         # ---- Step 1: Create the full calendar for this group ----
+         min_date = group[date_column].min()
+         max_date = group[date_column].max()
+
+         full_dates = pd.date_range(start=min_date, end=max_date, freq=freq)
+
+         # Build the calendar DataFrame dynamically using group_columns
+         calendar_dict = {col: group_key[i] if isinstance(group_key, tuple) else group_key
+                          for i, col in enumerate(group_columns)}
+         calendar_dict[date_column] = full_dates
+
+         full_calendar = pd.DataFrame(calendar_dict)
+
+         # ---- Step 2: Join with the actual group data ----
+         merged = full_calendar.merge(
+             group,
+             on=group_columns + [date_column],
+             how="left"
+         )
+
+         # ---- Step 3: Mark rows that are missing from the source data ----
+         merged["is_missing_record"] = merged[variable].isna()
+
+         # ---- Step 4: Interpolate numeric columns ----
+         numeric_cols = merged.select_dtypes(include=[np.number]).columns
+
+         for col in numeric_cols:
+             merged[col] = merged[col].interpolate(method="linear", limit_direction="both")
+
+         full_group_data.append(merged)
+
+     final_df = pd.concat(full_group_data, ignore_index=True)
+     # print(f"Number of missing records: {final_df['is_missing_record'].sum()}")
+     return final_df
+
+
+ def print_anomaly_stats(df, group_columns):
+     # Calculate global stats
+     total_records = len(df)
+     # Ensure is_Anomaly is treated as boolean for counting
+     total_anomalies = df['is_Anomaly'].fillna(False).astype(bool).sum()
+     anomaly_rate = (total_anomalies / total_records) * 100
+
+     print("\n" + "=" * 45)
+     print(f"{'ANOMALY DETECTION SUMMARY':^45}")
+     print("=" * 45)
+     print(f"{'Total Records:':<25} {total_records:,}")
+     print(f"{'Total Anomalies:':<25} {total_anomalies:,}")
+     print(f"{'Overall Anomaly Rate:':<25} {anomaly_rate:.2f}%")
+     print("-" * 45)
+
+     print(f"Top 5 Groups by Anomaly Rate ({' > '.join(group_columns)}):")
+
+     # Group by the keys, then compute the anomaly rate (mean) and the absolute count (sum)
+     group_stats = df.groupby(group_columns)['is_Anomaly'].agg(['mean', 'sum']).sort_values(by='mean', ascending=False).head(5)
+
+     for label, row in group_stats.iterrows():
+         # Handle single vs. multiple group columns for clean printing
+         group_label = " | ".join(map(str, label)) if isinstance(label, tuple) else str(label)
+         rate_pct = row['mean'] * 100
+         count = int(row['sum'])
+
+         # Print the rate (%) and the absolute count in brackets for context
+         print(f" - {group_label:<25} : {rate_pct:>6.2f}% ({count:>3} anomalies)")
+
+     print("=" * 45 + "\n")
+
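
A toy run of the calendar-completion and interpolation steps above on a single assumed group (the 'store', 'week', and 'sales' column names are illustrative, not package defaults):

    # One missing week is inserted into the calendar, flagged, and linearly interpolated.
    import pandas as pd

    df = pd.DataFrame({
        "store": ["A"] * 4,
        "week": pd.to_datetime(["2024-01-01", "2024-01-08", "2024-01-22", "2024-01-29"]),
        "sales": [10.0, 12.0, 18.0, 20.0],          # 2024-01-15 is missing
    })

    full_weeks = pd.date_range(df["week"].min(), df["week"].max(), freq="W-MON")
    calendar = pd.DataFrame({"store": "A", "week": full_weeks})
    merged = calendar.merge(df, on=["store", "week"], how="left")
    merged["is_missing_record"] = merged["sales"].isna()
    merged["sales"] = merged["sales"].interpolate(method="linear", limit_direction="both")
    print(merged)
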
@@ -0,0 +1,70 @@
+ import pandas as pd
+ import numpy as np
+ from .Preprocessing import classify
+
+ def detect_outliers_sd(group, variable, date_column, eval_period):
+     n = len(group)
+     # Check the minimum-size requirement
+     if n < 10:
+         return pd.DataFrame(columns=group.columns)
+
+     group = group.copy()
+     # Explicitly ensure date_column is datetime right at the start
+     group[date_column] = pd.to_datetime(group[date_column])
+     train_size = n - eval_period
+
+     # --- 1. HANDLE TRAINING DATA (Initial Block) ---
+     # Calculate baseline SD bounds from all data available before the eval period
+     initial_train = group[variable].iloc[:train_size]
+
+     # SD-based bounds
+     mean = initial_train.mean()
+     std = initial_train.std()
+
+     lower_2sd = max(mean - 2 * std, 0)
+     upper_2sd = mean + 2 * std
+
+     # Assign the initial bounds to the training rows
+     group.loc[group.index[:train_size], "Mean"] = mean
+     group.loc[group.index[:train_size], 'SD'] = std
+     group.loc[group.index[:train_size], 'SD2_low'] = lower_2sd
+     group.loc[group.index[:train_size], 'SD2_high'] = upper_2sd
+     group.loc[group.index[:train_size], 'set'] = "TRAIN"
+     group.loc[group.index[:train_size], 'SD_anomaly'] = group[variable].iloc[:train_size].apply(
+         lambda x: classify(x, lower_2sd, upper_2sd)
+     )
+     group.loc[group.index[:train_size], 'is_SD_anomaly'] = (
+         (group[variable].iloc[:train_size] < lower_2sd) |
+         (group[variable].iloc[:train_size] > upper_2sd)
+     )
+
+     # --- 2. HANDLE EVALUATION DATA (Expanding Window) ---
+     # Iterate through the eval period, growing the training set one point at a time
+     for i in range(train_size, n):
+         # Data available up to (but not including) this point
+         current_train = group[variable].iloc[:i]
+
+         curr_mean = current_train.mean()
+         curr_std = current_train.std()
+
+         lower_2sd = max(curr_mean - 2 * curr_std, 0)
+         upper_2sd = curr_mean + 2 * curr_std
+
+         # Test the current point i
+         current_val = group[variable].iloc[i]
+         group.iloc[i, group.columns.get_loc("Mean")] = curr_mean
+         group.iloc[i, group.columns.get_loc('SD')] = curr_std
+         group.iloc[i, group.columns.get_loc('SD2_low')] = lower_2sd
+         group.iloc[i, group.columns.get_loc('SD2_high')] = upper_2sd
+         group.iloc[i, group.columns.get_loc('set')] = "TEST"
+         group.iloc[i, group.columns.get_loc('SD_anomaly')] = classify(current_val, lower_2sd, upper_2sd)
+         group.iloc[i, group.columns.get_loc('is_SD_anomaly')] = (current_val < lower_2sd) or (current_val > upper_2sd)
+
+     # Cast the boolean column properly
+     group['is_SD_anomaly'] = group['is_SD_anomaly'].astype(bool)
+     # Final safety check: keep date_column as datetime
+     group[date_column] = pd.to_datetime(group[date_column])
+
+     return group
+
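
A short sketch of the two-sigma labelling pattern used above, with an inline copy of the classify helper (the values and bounds are illustrative assumptions):

    # Two-sigma bounds plus low/high/none labelling on a toy history.
    import numpy as np

    def classify(val, lower, upper):
        return "low" if val < lower else "high" if val > upper else "none"

    history = np.array([98.0, 101.0, 99.5, 102.0, 100.5, 97.5, 103.0, 100.0])
    mean, std = history.mean(), history.std(ddof=1)     # pandas .std() also uses ddof=1
    low, high = max(mean - 2 * std, 0), mean + 2 * std

    for val in (100.0, 91.0, 111.0):
        print(val, classify(val, low, high))
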
@@ -0,0 +1 @@
+ from .help_info import help_info
@@ -0,0 +1,112 @@
+ import pandas as pd
+ import numpy as np
+ from sklearn.ensemble import IsolationForest
+ from .Preprocessing import classify
+
+
+ def remove_outliers_iqr_and_sd(group, variable, contamination=0.03, random_state=42):
+     if len(group) < 10:
+         # Return an empty DataFrame to exclude this group entirely
+         print(f"Group has only {len(group)} records; dropping it from the analysis")
+         return pd.DataFrame(columns=group.columns)
+
+     # Quantile-based bounds
+     min_value = group[variable].min()
+     max_value = group[variable].max()
+     Q1 = group[variable].quantile(0.25)
+     Q3 = group[variable].quantile(0.75)
+     median = group[variable].quantile(0.5)
+     IQR = Q3 - Q1
+     low_percentile = group[variable].quantile(0.05)
+     high_percentile = group[variable].quantile(0.95)
+     lower_q = max(Q1 - 1.5 * IQR, 0)
+     upper_q = Q3 + 1.5 * IQR
+
+     group["MIN_value"] = min_value
+     group["MAX_value"] = max_value
+     group["Percentile_low"] = low_percentile
+     group["Percentile_high"] = high_percentile
+
+     # SD-based bounds
+     mean = group[variable].mean()
+     std = group[variable].std()
+
+     lower_1sd = max(mean - 1 * std, 0)
+     upper_1sd = mean + 1 * std
+     group["Mean"] = mean
+     group["SD"] = std
+     group['SD1_low'] = lower_1sd
+     group['SD1_high'] = upper_1sd
+
+     lower_2sd = max(mean - 2 * std, 0)
+     upper_2sd = mean + 2 * std
+     group['SD2_low'] = lower_2sd
+     group['SD2_high'] = upper_2sd
+
+     lower_3sd = max(mean - 3 * std, 0)
+     upper_3sd = mean + 3 * std
+     group['SD3_low'] = lower_3sd
+     group['SD3_high'] = upper_3sd
+
+     # MAD-based bounds
+     abs_dev = np.abs(group[variable] - median)
+     mad = np.median(abs_dev)
+     threshold_v1 = 2.5
+     threshold_v2 = 2.5
+     scale_factor = 0.6745
+
+     if mad == 0:
+         lower_mad_v1 = median
+         upper_mad_v1 = median
+         lower_mad_v2 = median
+         upper_mad_v2 = median
+     else:
+         margin_v1 = threshold_v1 * mad / scale_factor
+         lower_mad_v1 = max(median - margin_v1, 0)
+         upper_mad_v1 = median + margin_v1
+         margin_v2 = threshold_v2 * mad / scale_factor
+         lower_mad_v2 = max(median - margin_v2, 0)
+         upper_mad_v2 = median + margin_v2
+
+     group["Median"] = median
+     group['MAD'] = mad
+     # group['MAD2.5_low'] = lower_mad_v1
+     # group['MAD2.5_high'] = upper_mad_v1
+     group['MAD_low'] = lower_mad_v2
+     group['MAD_high'] = upper_mad_v2
+
+     group["Q1"] = Q1
+     group["Q3"] = Q3
+     group["IQR"] = IQR
+     group['IQR_low'] = lower_q
+     group['IQR_high'] = upper_q
+
+     """
+     # ---- Isolation Forest (disabled) ----
+     iso = IsolationForest(contamination=contamination, random_state=random_state)
+     preds = iso.fit_predict(group[[variable]])
+     scores = iso.decision_function(group[[variable]])
+
+     group["IsolationForest_score"] = scores
+     """
+
+     # Anomaly category labels ('low' / 'high' / 'none')
+     group['Percentile_anomaly'] = group[variable].apply(lambda val: classify(val, low_percentile, high_percentile))
+     group['SD_anomaly'] = group[variable].apply(lambda val: classify(val, lower_2sd, upper_2sd))
+     group['MAD_anomaly'] = group[variable].apply(lambda val: classify(val, lower_mad_v2, upper_mad_v2))
+     group['IQR_anomaly'] = group[variable].apply(lambda val: classify(val, lower_q, upper_q))
+
+     # Boolean anomaly flags
+     group['is_Percentile_anomaly'] = (group[variable] < low_percentile) | (group[variable] > high_percentile)
+     group['is_SD_anomaly'] = (group[variable] < lower_2sd) | (group[variable] > upper_2sd)
+     group['is_MAD_anomaly'] = (group[variable] < lower_mad_v2) | (group[variable] > upper_mad_v2)
+     group['is_IQR_anomaly'] = (group[variable] < lower_q) | (group[variable] > upper_q)
+     # group["is_IsolationForest_anomaly"] = preds == -1
+
+     return group
+
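
Per-group flagging functions like the one above are typically applied with a groupby-apply. A hedged sketch of that pattern using a simplified two-sigma flag (the frame, column names, and the simplified flag function are illustrative assumptions, not the package's public API):

    # Applying a per-group flagging function via groupby-apply (assumed toy data).
    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(3)
    df = pd.DataFrame({
        "store": np.repeat(["A", "B"], 30),
        "sales": np.concatenate([rng.normal(100, 5, 30), rng.normal(40, 3, 30)]),
    })
    df.loc[5, "sales"] = 160     # one obvious spike in store A

    def flag_sd(group, variable="sales"):
        mean, std = group[variable].mean(), group[variable].std()
        group = group.copy()
        group["is_SD_anomaly"] = (group[variable] < mean - 2 * std) | (group[variable] > mean + 2 * std)
        return group

    flagged = df.groupby("store", group_keys=False).apply(flag_sd)
    print(flagged.groupby("store")["is_SD_anomaly"].sum())
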