cherrypick-ml 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cherrypick/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .orchestrator import Orchestrator
2
+ from . import explain, preprocessing, anomaly, splits
3
+
4
+ __all__ = ['Orchestrator']
cherrypick/anomaly.py ADDED
@@ -0,0 +1,120 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from scipy.stats import zscore
4
+ from sklearn.ensemble import IsolationForest
5
+ from sklearn.neighbors import LocalOutlierFactor
6
+ from typing import Literal
7
+
8
+ import warnings as war
9
+ war.filterwarnings('ignore')
10
+
11
class OutlierPruner:
    """
    Detect and remove outliers from a dataset using statistical or
    ML-based methods.

    Parameters
    ----------
    method : {'iqr', 'zscore', 'mod_zscore', 'isoforest', 'lof'}
        Method used for outlier detection.

        - ``'iqr'`` - Interquartile Range (Tukey's fences)
        - ``'zscore'`` - Standard Z-score
        - ``'mod_zscore'`` - Modified Z-score:

          ``modified_Zscore = 0.6745 * (X - median) / MAD``

          where **median** is the sample median, **MAD** the median
          absolute deviation, and **X** the sample data points.
        - ``'isoforest'`` - Isolation Forest, an ensemble-based anomaly
          detection method
        - ``'lof'`` - Local Outlier Factor, detects outliers using local
          density

    df : pandas.DataFrame
        Input dataset on which outlier pruning will be applied.

    col : str
        Column name used for outlier detection in the statistical methods
        (``'iqr'``, ``'zscore'``, ``'mod_zscore'``).

    Notes
    -----
    - Statistical methods require a specific column (``col``).
    - ML-based methods (Isolation Forest, Local Outlier Factor) operate on
      all numerical features and ignore ``col``.
    - Modified Z-score is robust to extreme values as it uses the median
      instead of the mean.
    """

    def __init__(self, method: Literal['iqr', 'zscore', 'mod_zscore', 'isoforest', 'lof'],
                 df: pd.DataFrame, col: str):
        self.df = df
        self.col = col
        self.method = method

    def __iqr(self):
        # Tukey's fences: keep rows within [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
        Q1 = self.df[self.col].quantile(0.25)
        Q3 = self.df[self.col].quantile(0.75)
        IQR = Q3 - Q1

        lower_fence = Q1 - 1.5 * IQR
        upper_fence = Q3 + 1.5 * IQR

        return self.df[(self.df[self.col] >= lower_fence) & (self.df[self.col] <= upper_fence)]

    def __zscore(self):
        # Keep rows with |z| < 3 (scipy default: population std, ddof=0).
        z = zscore(self.df[self.col])
        return self.df[np.abs(z) < 3]

    def __isoforest(self):
        # contamination=0.3 assumes up to 30% of rows are anomalous;
        # random_state fixed for reproducibility.
        isolate = IsolationForest(contamination=0.3, n_jobs=-1, random_state=42)

        X = self.df.select_dtypes(include=np.number)
        labels_ = isolate.fit_predict(X)

        # fit_predict returns -1 for outliers, 1 for inliers.
        return self.df.iloc[labels_ != -1]

    def __lof(self):
        lof = LocalOutlierFactor(n_jobs=-1, n_neighbors=20, algorithm='kd_tree')
        X = self.df.select_dtypes(include=np.number)
        labels = lof.fit_predict(X)

        # fit_predict returns -1 for outliers, 1 for inliers.
        return self.df.iloc[labels != -1]

    def __modded_zscore(self):
        median = np.median(self.df[self.col])
        mad = np.median(np.abs(self.df[self.col] - median))

        # If MAD == 0 (e.g. a constant majority), the score is undefined;
        # return the original DataFrame instead of dividing by zero.
        if mad == 0:
            return self.df

        mod_zscore = 0.6745 * (self.df[self.col] - median) / mad
        return self.df[mod_zscore.abs() < 3]

    def remove_outlier(self):
        """
        Transform the dataset with the configuration provided to
        **OutlierPruner** and return the pruned DataFrame.

        Raises
        ------
        ValueError
            If ``method`` is not one of the supported names, or if the
            selected method fails (the original exception is chained).
        """
        METHOD_CONFIG = {
            "iqr": self.__iqr,
            "zscore": self.__zscore,
            "mod_zscore": self.__modded_zscore,
            "isoforest": self.__isoforest,
            "lof": self.__lof,
        }

        # Validate the method *before* calling it, so a KeyError raised
        # inside a handler (e.g. a missing column) is not misreported as
        # an unknown method.
        handler = METHOD_CONFIG.get(self.method)
        if handler is None:
            raise ValueError(f"Provide an appropriate method : {self.method}")

        try:
            return handler()
        except Exception as err:
            # Preserve the historical contract that all failures surface
            # as ValueError, but keep the original cause chained.
            raise ValueError(err) from err
cherrypick/explain.py ADDED
@@ -0,0 +1,178 @@
1
+ import shap
2
+ import numpy as np
3
+ import pandas as pd
4
+ import matplotlib.pyplot as plt
5
+ from sklearn import tree
6
+ from cherrypick.orchestrator import Orchestrator
7
+ from typing import Literal
8
+
9
+
10
def explainer(model, data, impact_type: Literal['pos', 'neg', 'all'] = 'all'):
    """
    Compute SHAP-based feature importance and return sorted impact values.

    This function uses SHAP's TreeExplainer to calculate feature
    contributions for a given model and dataset. It aggregates SHAP values
    across samples and (if applicable) across multiple classes, returning
    feature importance based on SHAP magnitudes.

    Parameters
    ----------
    model : object
        A trained tree-based model compatible with shap.TreeExplainer
        (e.g., XGBoost, LightGBM, RandomForest).

    data : pandas.DataFrame
        Input dataset for which SHAP values are to be computed.
        Must contain only feature columns (no target column).

    impact_type : {'pos', 'neg', 'all'}, default='all'
        Type of feature impact to return:
        - '**pos**' - Returns features with non-negative contribution.
        - '**neg**' - Returns features with negative contribution.
        - '**all**' - Returns all features with overall importance
          (mean absolute SHAP values).

    Returns
    -------
    result : pandas.DataFrame
        A sorted DataFrame containing feature importance:
        - For 'all' -> columns: ['Features', 'Overall_Impact']
        - For 'pos' -> columns: ['Features', 'Positive_Impact']
        - For 'neg' -> columns: ['Features', 'Negative_Impact']

    shap_values : shap.Explanation
        Raw SHAP explanation object containing per-sample contributions.

    Notes
    -----
    - For multi-class models, SHAP values are averaged across classes.
    - The function also stores SHAP values globally in `_shap_val` for the
      plotting helpers in this module.

    Raises
    ------
    ValueError
        If `impact_type` is not one of {'pos', 'neg', 'all'}, or if the
        SHAP value array has fewer than 2 dimensions.

    Example
    -------
    >>> result, shap_vals = explainer(model, X_test, impact_type='all')
    >>> print(result.head())
    """
    # Validate the option up front, so a bad impact_type is reported as
    # such instead of falling into the dimension check below.
    if impact_type not in ('pos', 'neg', 'all'):
        raise ValueError("Invalid Impact type : must be neg, pos or all")

    explain = shap.TreeExplainer(model=model)
    shap_values = explain(X=data)

    # Cached for summary_plot / bar_plot, which read this module global.
    global _shap_val
    _shap_val = shap_values

    vals = shap_values.values

    # (n_samples, n_features[, n_classes]) -> one value per feature.
    if vals.ndim >= 3:
        axes = (0, 2)
    elif vals.ndim == 2:
        axes = (0,)
    else:
        raise ValueError("Invalid dimensions of shap values")

    if impact_type == 'all':
        # Overall importance uses magnitudes, ignoring sign.
        vals = np.abs(vals).mean(axis=axes)
    else:
        # Signed mean, so positive/negative contributions can be split.
        vals = vals.mean(axis=axes)

    features = list(data.columns)
    all_values = list(vals)

    if impact_type == 'neg':
        pairs = [(f, v) for f, v in zip(features, all_values) if v < 0]
        result = pd.DataFrame(pairs, columns=["Features", "Negative_Impact"]
                              ).sort_values(by="Negative_Impact", ascending=False)

    elif impact_type == 'pos':
        # Zero contributions are grouped with the positive side, matching
        # the historical behavior of this function.
        pairs = [(f, v) for f, v in zip(features, all_values) if v >= 0]
        result = pd.DataFrame(pairs, columns=["Features", "Positive_Impact"]
                              ).sort_values(by="Positive_Impact", ascending=False)

    else:  # 'all'
        result = pd.DataFrame({
            "Features": features,
            "Overall_Impact": all_values
        }).sort_values(by="Overall_Impact", ascending=False)

    return result, shap_values
133
+
134
+
135
def summary_plot(data):
    """
    Render a SHAP summary plot of feature contributions for all classes.

    Uses the SHAP values cached in the module-level ``_shap_val`` by a
    prior call to ``explainer``; ``data`` supplies the feature values.
    """
    shap.summary_plot(_shap_val, data)
141
+
142
+
143
def bar_plot(n_classes):
    """
    Draw one SHAP bar plot of feature contributions per class.

    Uses the SHAP values cached in the module-level ``_shap_val`` by a
    prior call to ``explainer``; ``n_classes`` selects how many class
    slices to plot.
    """
    for idx in range(n_classes):
        plt.title(f"For class_id {idx}")
        shap.plots.bar(_shap_val[..., idx])
        plt.tight_layout()
        plt.show()
152
+
153
+
154
+ # def force_plot(shap_values):
155
+ # pass
156
+
157
+
158
def tree_plot(model, feature_names, size: tuple):
    """
    Visualize a fitted decision tree with sklearn's ``plot_tree``.

    Parameters
    ----------
    model : fitted tree estimator to draw.
    feature_names : labels for the feature axes in the plot.
    size : tuple
        Figure size in inches, passed to ``plt.figure(figsize=...)``.
    """
    plt.figure(figsize=size)
    tree.plot_tree(model, feature_names=feature_names, class_names=True, filled=True)
    plt.tight_layout()
    plt.show()
163
+
164
+
165
+
166
+
167
+
168
+
169
+
170
+
171
+
172
+
173
+
174
+
175
+
176
+
177
+
178
+