cherrypick-ml 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cherrypick/__init__.py +4 -0
- cherrypick/anomaly.py +120 -0
- cherrypick/explain.py +178 -0
- cherrypick/orchestrator.py +797 -0
- cherrypick/preprocessing.py +197 -0
- cherrypick/splits.py +56 -0
- cherrypick_ml-0.1.0.dist-info/METADATA +117 -0
- cherrypick_ml-0.1.0.dist-info/RECORD +11 -0
- cherrypick_ml-0.1.0.dist-info/WHEEL +5 -0
- cherrypick_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
- cherrypick_ml-0.1.0.dist-info/top_level.txt +1 -0
cherrypick/__init__.py
ADDED
cherrypick/anomaly.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
from scipy.stats import zscore
|
|
4
|
+
from sklearn.ensemble import IsolationForest
|
|
5
|
+
from sklearn.neighbors import LocalOutlierFactor
|
|
6
|
+
from typing import Literal
|
|
7
|
+
|
|
8
|
+
import warnings as war
|
|
9
|
+
war.filterwarnings('ignore')
|
|
10
|
+
|
|
11
|
+
class OutlierPruner:
    """
    OutlierPruner provides statistical and ML-based methods
    for detecting and removing outliers from a dataset.

    Parameters
    ----------
    method : {'iqr', 'zscore', 'mod_zscore', 'isoforest', 'lof'}
        Method used for outlier detection.

        - `'iqr'` - Interquartile Range method
        - `'zscore'` - Standard Z-score normalization
        - `'mod_zscore'` - Modified Z-score:

          `modified_Zscore = 0.6745 * (X - median) / MAD`

          *Where,*
          **median** = *median of the sample data*
          **MAD** = *median absolute deviation*
          **X** = *sample data points(Xi)*
        - `'isoforest'` - Isolation Forest, an ensemble-based anomaly detection method
        - `'lof'` - Local Outlier Factor, detects outliers using local density

    df : pandas.DataFrame
        Input dataset on which outlier pruning will be applied.

    col : str
        Column name used for outlier detection in statistical methods.

    Notes
    -----
    - Statistical methods require a specific column (``col``).
    - ML-based methods (Isolation Forest, Local Outlier Factor) operate on all
      numerical features and ignore ``col``.
    - Modified Z-score is robust to extreme values as it uses the median instead of mean.
    """

    def __init__(self, method: Literal['iqr', 'zscore', 'mod_zscore', 'isoforest', 'lof'],
                 df: pd.DataFrame, col: str):
        self.df = df
        self.col = col
        self.method = method

    def __iqr(self) -> pd.DataFrame:
        """Keep rows inside the Tukey fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR]."""
        q1 = self.df[self.col].quantile(0.25)
        q3 = self.df[self.col].quantile(0.75)
        iqr = q3 - q1

        lower_fence = q1 - 1.5 * iqr
        upper_fence = q3 + 1.5 * iqr

        return self.df[(self.df[self.col] >= lower_fence) & (self.df[self.col] <= upper_fence)]

    def __zscore(self) -> pd.DataFrame:
        """Keep rows whose standard z-score magnitude is strictly below 3."""
        z = zscore(self.df[self.col])
        return self.df[np.abs(z) < 3]

    def __isoforest(self) -> pd.DataFrame:
        """Keep rows labelled as inliers (+1) by an Isolation Forest fit on all numeric columns."""
        isolate = IsolationForest(contamination=0.3, n_jobs=-1, random_state=42)

        X = self.df.select_dtypes(include=np.number)
        labels = isolate.fit_predict(X)  # -1 marks outliers

        return self.df.iloc[labels != -1]

    def __lof(self) -> pd.DataFrame:
        """Keep rows labelled as inliers (+1) by Local Outlier Factor on all numeric columns."""
        lof = LocalOutlierFactor(n_jobs=-1, n_neighbors=20, algorithm='kd_tree')
        X = self.df.select_dtypes(include=np.number)
        labels = lof.fit_predict(X)  # -1 marks outliers

        return self.df.iloc[labels != -1]

    def __modded_zscore(self) -> pd.DataFrame:
        """Keep rows whose modified z-score magnitude is strictly below 3."""
        median = np.median(self.df[self.col])
        mad = np.median(np.abs(self.df[self.col] - median))

        # If MAD == 0 every value equals the median: return the original
        # DataFrame instead of a garbage value and avoid division by zero.
        if mad == 0:
            return self.df

        mod_zscore = 0.6745 * (self.df[self.col] - median) / mad
        return self.df[mod_zscore.abs() < 3]

    def remove_outlier(self) -> pd.DataFrame:
        '''
        Calling this function will transform dataset with configuration provided to **OutlierPruner**.

        Returns
        -------
        pandas.DataFrame
            The input DataFrame with detected outlier rows removed.

        Raises
        ------
        ValueError
            If ``method`` is not one of the supported options.
        '''
        method_config = {
            "iqr": self.__iqr,
            "zscore": self.__zscore,
            "mod_zscore": self.__modded_zscore,
            "isoforest": self.__isoforest,
            "lof": self.__lof,
        }

        # Resolve the method *before* calling it: a KeyError raised inside a
        # detector (e.g. a missing column) must not be misreported as an
        # unknown method name, and detector errors must keep their real type.
        try:
            detector = method_config[self.method]
        except KeyError:
            raise ValueError(f"Provide an appropriate method : {self.method}") from None

        return detector()
|
cherrypick/explain.py
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
import shap
|
|
2
|
+
import numpy as np
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import matplotlib.pyplot as plt
|
|
5
|
+
from sklearn import tree
|
|
6
|
+
from cherrypick.orchestrator import Orchestrator
|
|
7
|
+
from typing import Literal
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def explainer(model, data, impact_type: Literal['pos', 'neg', 'all'] = 'all'):
    """
    Compute SHAP-based feature importance and return sorted impact values.

    This function uses SHAP's TreeExplainer to calculate feature contributions
    for a given model and dataset. It aggregates SHAP values across samples
    and (if applicable) across multiple classes, returning feature importance
    based on absolute SHAP magnitudes.

    Parameters
    ----------
    model : object
        A trained tree-based model compatible with shap.TreeExplainer
        (e.g., XGBoost, LightGBM, RandomForest).

    data : pandas.DataFrame
        Input dataset for which SHAP values are to be computed.
        Must contain only feature columns (no target column).

    impact_type : {'pos', 'neg', 'all'}, default='all'
        Type of feature impact to return:
        - '**pos**' - Returns features with positive contribution.
        - '**neg**' - Returns features with negative contribution.
        - '**all**' - Returns all features with overall importance
          (absolute SHAP values).

    Returns
    -------
    result : pandas.DataFrame
        A sorted DataFrame containing feature importance:
        - For 'all' → columns: ['Features', 'Overall_Impact']
        - For 'pos' → columns: ['Features', 'Positive_Impact']
        - For 'neg' → columns: ['Features', 'Negative_Impact']

    shap_values : shap.Explanation
        Raw SHAP explanation object containing per-sample contributions.

    Notes
    -----
    - For multi-class models, SHAP values are averaged across classes.
    - Feature importance is computed using mean absolute SHAP values.
    - The function also stores SHAP values globally in `_shap_val` so that
      `summary_plot` / `bar_plot` can reuse them.

    Raises
    ------
    ValueError
        If `impact_type` is not one of {'pos', 'neg', 'all'}, or the SHAP
        value array has fewer than 2 dimensions.

    Example
    -------
    >>> result, shap_vals = explainer(model, X_test, impact_type='all')
    >>> print(result.head())
    """
    # Validate the option first: previously an invalid impact_type fell into a
    # branch that conflated it with a dimension error, and the dedicated
    # impact_type check below the loop was unreachable dead code.
    if impact_type not in ('pos', 'neg', 'all'):
        raise ValueError("Invalid Impact type : must be neg, pos or all")

    explain = shap.TreeExplainer(model=model)
    shap_values = explain(X=data)

    # Cache the explanation for the module's plotting helpers.
    global _shap_val
    _shap_val = shap_values

    vals = _shap_val.values

    # 'all' ranks by magnitude; 'pos'/'neg' keep the sign so features can be
    # split by direction of contribution.
    signed = vals if impact_type in ('pos', 'neg') else np.abs(vals)

    if vals.ndim >= 3:
        # Multi-class output: average over samples (axis 0) and classes (axis 2).
        vals = signed.mean(axis=(0, 2))
    elif vals.ndim == 2:
        vals = signed.mean(axis=0)
    else:
        raise ValueError("Invalid dimensions of shap values")

    features = []
    all_values = []
    neg_values = []
    neg_feature = []
    pos_values = []
    pos_feature = []

    for feature, value in zip(data.columns, vals):
        features.append(feature)
        all_values.append(value)

        if value < 0:
            neg_values.append(value)
            neg_feature.append(feature)
        else:
            pos_values.append(value)
            pos_feature.append(feature)

    if impact_type == 'neg':
        result = pd.DataFrame({
            "Features": neg_feature,
            "Negative_Impact": neg_values
        }).sort_values(by="Negative_Impact", ascending=False)
    elif impact_type == 'pos':
        result = pd.DataFrame({
            "Features": pos_feature,
            "Positive_Impact": pos_values
        }).sort_values(by="Positive_Impact", ascending=False)
    else:
        result = pd.DataFrame({
            "Features": features,
            "Overall_Impact": all_values
        }).sort_values(by="Overall_Impact", ascending=False)

    return result, shap_values
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def summary_plot(data):
    '''
    Render a SHAP summary plot of feature contributions for all classes,
    using the explanation cached by the most recent `explainer` call.
    '''
    shap.summary_plot(_shap_val, data)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def bar_plot(n_classes):
    '''
    Bar plot analysis of feature contribution, one figure per class,
    using the explanation cached by the most recent `explainer` call.
    '''
    for class_id in range(n_classes):
        # Title first so it attaches to the figure shap.plots.bar draws into.
        plt.title(f"For class_id {class_id}")
        shap.plots.bar(_shap_val[..., class_id])
        plt.tight_layout()
        plt.show()
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
# def force_plot(shap_values):
|
|
155
|
+
# pass
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def tree_plot(model, feature_names, size: tuple):
    '''
    Visualise a fitted decision tree with filled (class-coloured) nodes.

    Parameters
    ----------
    model : object
        A fitted sklearn decision-tree estimator accepted by ``tree.plot_tree``.
    feature_names : list of str
        Feature names used to label the split nodes.
    size : tuple
        Figure size ``(width, height)`` in inches, passed to ``plt.figure``.
    '''
    plt.figure(figsize=size)
    tree.plot_tree(model, filled=True, feature_names=feature_names, class_names=True)
    plt.tight_layout()
    plt.show()
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
|