explainiverse 0.1.1a1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- explainiverse/__init__.py +45 -1
- explainiverse/adapters/__init__.py +9 -0
- explainiverse/adapters/base_adapter.py +25 -25
- explainiverse/adapters/sklearn_adapter.py +32 -32
- explainiverse/core/__init__.py +22 -0
- explainiverse/core/explainer.py +31 -31
- explainiverse/core/explanation.py +24 -24
- explainiverse/core/registry.py +545 -0
- explainiverse/engine/__init__.py +8 -0
- explainiverse/engine/suite.py +142 -142
- explainiverse/evaluation/__init__.py +8 -0
- explainiverse/evaluation/metrics.py +232 -232
- explainiverse/explainers/__init__.py +38 -0
- explainiverse/explainers/attribution/__init__.py +9 -0
- explainiverse/explainers/attribution/lime_wrapper.py +90 -63
- explainiverse/explainers/attribution/shap_wrapper.py +89 -66
- explainiverse/explainers/counterfactual/__init__.py +8 -0
- explainiverse/explainers/counterfactual/dice_wrapper.py +302 -0
- explainiverse/explainers/global_explainers/__init__.py +23 -0
- explainiverse/explainers/global_explainers/ale.py +191 -0
- explainiverse/explainers/global_explainers/partial_dependence.py +192 -0
- explainiverse/explainers/global_explainers/permutation_importance.py +123 -0
- explainiverse/explainers/global_explainers/sage.py +164 -0
- explainiverse/explainers/rule_based/__init__.py +8 -0
- explainiverse/explainers/rule_based/anchors_wrapper.py +350 -0
- explainiverse-0.2.0.dist-info/METADATA +264 -0
- explainiverse-0.2.0.dist-info/RECORD +29 -0
- explainiverse-0.1.1a1.dist-info/METADATA +0 -128
- explainiverse-0.1.1a1.dist-info/RECORD +0 -19
- {explainiverse-0.1.1a1.dist-info → explainiverse-0.2.0.dist-info}/LICENSE +0 -0
- {explainiverse-0.1.1a1.dist-info → explainiverse-0.2.0.dist-info}/WHEEL +0 -0
@@ -1,233 +1,233 @@
-import numpy as np
-from explainiverse.core.explanation import Explanation
-from sklearn.metrics import accuracy_score
-import copy
-
-
-def compute_aopc(
-    model,
-    instance: np.ndarray,
-    explanation: Explanation,
-    num_steps: int = 10,
-    baseline_value: float = 0.0
-) -> float:
-    """
-    Computes Area Over the Perturbation Curve (AOPC) by iteratively removing top features.
-
-    Args:
-        model: wrapped model with .predict() method
-        instance: input sample (1D array)
-        explanation: Explanation object
-        num_steps: number of top features to remove
-        baseline_value: value to replace removed features with (e.g., 0, mean)
-
-    Returns:
-        AOPC score (higher means explanation is more faithful)
-    """
-    base_pred = model.predict(instance.reshape(1, -1))[0]
-    attributions = explanation.explanation_data.get("feature_attributions", {})
-
-    if not attributions:
-        raise ValueError("No feature attributions found in explanation.")
-
-    # Sort features by abs importance
-    sorted_features = sorted(
-        attributions.items(),
-        key=lambda x: abs(x[1]),
-        reverse=True
-    )
-
-    # Try to map feature names to indices
-    feature_indices = []
-    for i, (fname, _) in enumerate(sorted_features):
-        try:
-            idx = explanation.feature_names.index(fname)
-        except Exception:
-            idx = i  # fallback: assume order
-        feature_indices.append(idx)
-
-    deltas = []
-    modified = instance.copy()
-
-    for i in range(min(num_steps, len(feature_indices))):
-        idx = feature_indices[i]
-        modified[idx] = baseline_value
-        new_pred = model.predict(modified.reshape(1, -1))[0]
-        delta = abs(base_pred - new_pred)
-        deltas.append(delta)
-
-    return np.mean(deltas)
-
-
-def compute_batch_aopc(
-    model,
-    X: np.ndarray,
-    explanations: dict,
-    num_steps: int = 10,
-    baseline_value: float = 0.0
-) -> dict:
-    """
-    Compute average AOPC for multiple explainers over a batch of instances.
-
-    Args:
-        model: wrapped model
-        X: 2D input array
-        explanations: dict of {explainer_name: list of Explanation objects}
-        num_steps: number of top features to remove
-        baseline_value: value to replace features with
-
-    Returns:
-        Dict of {explainer_name: mean AOPC score}
-    """
-    results = {}
-
-    for explainer_name, expl_list in explanations.items():
-        scores = []
-        for i, exp in enumerate(expl_list):
-            instance = X[i]
-            score = compute_aopc(model, instance, exp, num_steps, baseline_value)
-            scores.append(score)
-        results[explainer_name] = np.mean(scores)
-
-    return results
-
-
-def compute_roar(
-    model_class,
-    X_train: np.ndarray,
-    y_train: np.ndarray,
-    X_test: np.ndarray,
-    y_test: np.ndarray,
-    explanations: list,
-    top_k: int = 3,
-    baseline_value: float = 0.0,
-    model_kwargs: dict = None
-) -> float:
-    """
-    Compute ROAR (Remove And Retrain) using top-k important features from explanations.
-
-    Args:
-        model_class: uninstantiated model class (e.g. LogisticRegression)
-        X_train: full training data
-        y_train: training labels
-        X_test: test features
-        y_test: test labels
-        explanations: list of Explanation objects (one per train instance)
-        top_k: number of top features to remove
-        baseline_value: what to set removed features to
-        model_kwargs: optional kwargs to pass to model_class
-
-    Returns:
-        Accuracy drop (baseline_acc - retrained_acc)
-    """
-    model_kwargs = model_kwargs or {}
-
-    # Baseline model
-    baseline_model = model_class(**model_kwargs)
-    baseline_model.fit(X_train, y_train)
-    baseline_preds = baseline_model.predict(X_test)
-    baseline_acc = accuracy_score(y_test, baseline_preds)
-
-    # Compute top-k feature indices from attributions (use mode)
-    feature_counts = {}
-    for exp in explanations:
-        for fname, val in sorted(exp.explanation_data["feature_attributions"].items(), key=lambda x: abs(x[1]), reverse=True)[:top_k]:
-            try:
-                idx = exp.feature_names.index(fname)
-                feature_counts[idx] = feature_counts.get(idx, 0) + 1
-            except:
-                continue
-
-    top_features = sorted(feature_counts.items(), key=lambda x: x[1], reverse=True)[:top_k]
-    top_feature_indices = [idx for idx, _ in top_features]
-
-    # Remove top-k from training and test data
-    X_train_mod = copy.deepcopy(X_train)
-    X_test_mod = copy.deepcopy(X_test)
-
-    # Prepare feature-wise baselines
-    # Compute or assign feature-wise baseline values
-    if not isinstance(
-        baseline_value,
-        (str, float, int, np.number, np.ndarray)
-    ) and not callable(baseline_value):
-        raise ValueError(f"Invalid baseline_value type: {type(baseline_value)}")
-
-    if isinstance(baseline_value, str):
-        if baseline_value == "mean":
-            feature_baseline = np.mean(X_train, axis=0)
-        elif baseline_value == "median":
-            feature_baseline = np.median(X_train, axis=0)
-        else:
-            raise ValueError(f"Unsupported string baseline: {baseline_value}")
-    elif callable(baseline_value):
-        feature_baseline = baseline_value(X_train)
-    elif isinstance(baseline_value, np.ndarray):
-        if baseline_value.shape != (X_train.shape[1],):
-            raise ValueError("baseline_value ndarray must match number of features")
-        feature_baseline = baseline_value
-    elif isinstance(baseline_value, (float, int, np.number)):
-        feature_baseline = np.full(X_train.shape[1], baseline_value)
-    else:
-        raise ValueError(f"Invalid baseline_value type: {type(baseline_value)}")
-
-    for idx in top_feature_indices:
-        X_train_mod[:, idx] = feature_baseline[idx]
-        X_test_mod[:, idx] = feature_baseline[idx]
-        # X_train_mod[:, idx] = baseline_value
-        # X_test_mod[:, idx] = baseline_value
-
-    # Retrain and evaluate
-    retrained_model = model_class(**model_kwargs)
-    retrained_model.fit(X_train_mod, y_train)
-    retrained_preds = retrained_model.predict(X_test_mod)
-    retrained_acc = accuracy_score(y_test, retrained_preds)
-
-    return baseline_acc - retrained_acc
-
-
-def compute_roar_curve(
-    model_class,
-    X_train,
-    y_train,
-    X_test,
-    y_test,
-    explanations,
-    max_k=5,
-    baseline_value="mean",
-    model_kwargs=None
-) -> dict:
-    """
-    Compute ROAR accuracy drops across a range of top-k features removed.
-
-    Args:
-        model_class: model type (e.g. LogisticRegression)
-        X_train, y_train, X_test, y_test: full dataset
-        explanations: list of Explanation objects
-        max_k: maximum top-k to try
-        baseline_value: string, scalar, ndarray, or callable
-        model_kwargs: passed to model class
-
-    Returns:
-        Dict of {k: accuracy drop} for k in 1..max_k
-    """
-    from copy import deepcopy
-
-    model_kwargs = model_kwargs or {}
-    curve = {}
-
-    for k in range(1, max_k + 1):
-        acc_drop = compute_roar(
-            model_class=model_class,
-            X_train=deepcopy(X_train),
-            y_train=deepcopy(y_train),
-            X_test=deepcopy(X_test),
-            y_test=deepcopy(y_test),
-            explanations=deepcopy(explanations),
-            top_k=k,
-            baseline_value=baseline_value,
-            model_kwargs=deepcopy(model_kwargs)
-        )
-        curve[k] = acc_drop
-
+import numpy as np
+from explainiverse.core.explanation import Explanation
+from sklearn.metrics import accuracy_score
+import copy
+
+
+def compute_aopc(
+    model,
+    instance: np.ndarray,
+    explanation: Explanation,
+    num_steps: int = 10,
+    baseline_value: float = 0.0
+) -> float:
+    """
+    Computes Area Over the Perturbation Curve (AOPC) by iteratively removing top features.
+
+    Args:
+        model: wrapped model with .predict() method
+        instance: input sample (1D array)
+        explanation: Explanation object
+        num_steps: number of top features to remove
+        baseline_value: value to replace removed features with (e.g., 0, mean)
+
+    Returns:
+        AOPC score (higher means explanation is more faithful)
+    """
+    base_pred = model.predict(instance.reshape(1, -1))[0]
+    attributions = explanation.explanation_data.get("feature_attributions", {})
+
+    if not attributions:
+        raise ValueError("No feature attributions found in explanation.")
+
+    # Sort features by abs importance
+    sorted_features = sorted(
+        attributions.items(),
+        key=lambda x: abs(x[1]),
+        reverse=True
+    )
+
+    # Try to map feature names to indices
+    feature_indices = []
+    for i, (fname, _) in enumerate(sorted_features):
+        try:
+            idx = explanation.feature_names.index(fname)
+        except Exception:
+            idx = i  # fallback: assume order
+        feature_indices.append(idx)
+
+    deltas = []
+    modified = instance.copy()
+
+    for i in range(min(num_steps, len(feature_indices))):
+        idx = feature_indices[i]
+        modified[idx] = baseline_value
+        new_pred = model.predict(modified.reshape(1, -1))[0]
+        delta = abs(base_pred - new_pred)
+        deltas.append(delta)
+
+    return np.mean(deltas)
+
+
+def compute_batch_aopc(
+    model,
+    X: np.ndarray,
+    explanations: dict,
+    num_steps: int = 10,
+    baseline_value: float = 0.0
+) -> dict:
+    """
+    Compute average AOPC for multiple explainers over a batch of instances.
+
+    Args:
+        model: wrapped model
+        X: 2D input array
+        explanations: dict of {explainer_name: list of Explanation objects}
+        num_steps: number of top features to remove
+        baseline_value: value to replace features with
+
+    Returns:
+        Dict of {explainer_name: mean AOPC score}
+    """
+    results = {}
+
+    for explainer_name, expl_list in explanations.items():
+        scores = []
+        for i, exp in enumerate(expl_list):
+            instance = X[i]
+            score = compute_aopc(model, instance, exp, num_steps, baseline_value)
+            scores.append(score)
+        results[explainer_name] = np.mean(scores)
+
+    return results
+
+
+def compute_roar(
+    model_class,
+    X_train: np.ndarray,
+    y_train: np.ndarray,
+    X_test: np.ndarray,
+    y_test: np.ndarray,
+    explanations: list,
+    top_k: int = 3,
+    baseline_value: float = 0.0,
+    model_kwargs: dict = None
+) -> float:
+    """
+    Compute ROAR (Remove And Retrain) using top-k important features from explanations.
+
+    Args:
+        model_class: uninstantiated model class (e.g. LogisticRegression)
+        X_train: full training data
+        y_train: training labels
+        X_test: test features
+        y_test: test labels
+        explanations: list of Explanation objects (one per train instance)
+        top_k: number of top features to remove
+        baseline_value: what to set removed features to
+        model_kwargs: optional kwargs to pass to model_class
+
+    Returns:
+        Accuracy drop (baseline_acc - retrained_acc)
+    """
+    model_kwargs = model_kwargs or {}
+
+    # Baseline model
+    baseline_model = model_class(**model_kwargs)
+    baseline_model.fit(X_train, y_train)
+    baseline_preds = baseline_model.predict(X_test)
+    baseline_acc = accuracy_score(y_test, baseline_preds)
+
+    # Compute top-k feature indices from attributions (use mode)
+    feature_counts = {}
+    for exp in explanations:
+        for fname, val in sorted(exp.explanation_data["feature_attributions"].items(), key=lambda x: abs(x[1]), reverse=True)[:top_k]:
+            try:
+                idx = exp.feature_names.index(fname)
+                feature_counts[idx] = feature_counts.get(idx, 0) + 1
+            except:
+                continue
+
+    top_features = sorted(feature_counts.items(), key=lambda x: x[1], reverse=True)[:top_k]
+    top_feature_indices = [idx for idx, _ in top_features]
+
+    # Remove top-k from training and test data
+    X_train_mod = copy.deepcopy(X_train)
+    X_test_mod = copy.deepcopy(X_test)
+
+    # Prepare feature-wise baselines
+    # Compute or assign feature-wise baseline values
+    if not isinstance(
+        baseline_value,
+        (str, float, int, np.number, np.ndarray)
+    ) and not callable(baseline_value):
+        raise ValueError(f"Invalid baseline_value type: {type(baseline_value)}")
+
+    if isinstance(baseline_value, str):
+        if baseline_value == "mean":
+            feature_baseline = np.mean(X_train, axis=0)
+        elif baseline_value == "median":
+            feature_baseline = np.median(X_train, axis=0)
+        else:
+            raise ValueError(f"Unsupported string baseline: {baseline_value}")
+    elif callable(baseline_value):
+        feature_baseline = baseline_value(X_train)
+    elif isinstance(baseline_value, np.ndarray):
+        if baseline_value.shape != (X_train.shape[1],):
+            raise ValueError("baseline_value ndarray must match number of features")
+        feature_baseline = baseline_value
+    elif isinstance(baseline_value, (float, int, np.number)):
+        feature_baseline = np.full(X_train.shape[1], baseline_value)
+    else:
+        raise ValueError(f"Invalid baseline_value type: {type(baseline_value)}")
+
+    for idx in top_feature_indices:
+        X_train_mod[:, idx] = feature_baseline[idx]
+        X_test_mod[:, idx] = feature_baseline[idx]
+        # X_train_mod[:, idx] = baseline_value
+        # X_test_mod[:, idx] = baseline_value
+
+    # Retrain and evaluate
+    retrained_model = model_class(**model_kwargs)
+    retrained_model.fit(X_train_mod, y_train)
+    retrained_preds = retrained_model.predict(X_test_mod)
+    retrained_acc = accuracy_score(y_test, retrained_preds)
+
+    return baseline_acc - retrained_acc
+
+
+def compute_roar_curve(
+    model_class,
+    X_train,
+    y_train,
+    X_test,
+    y_test,
+    explanations,
+    max_k=5,
+    baseline_value="mean",
+    model_kwargs=None
+) -> dict:
+    """
+    Compute ROAR accuracy drops across a range of top-k features removed.
+
+    Args:
+        model_class: model type (e.g. LogisticRegression)
+        X_train, y_train, X_test, y_test: full dataset
+        explanations: list of Explanation objects
+        max_k: maximum top-k to try
+        baseline_value: string, scalar, ndarray, or callable
+        model_kwargs: passed to model class
+
+    Returns:
+        Dict of {k: accuracy drop} for k in 1..max_k
+    """
+    from copy import deepcopy
+
+    model_kwargs = model_kwargs or {}
+    curve = {}
+
+    for k in range(1, max_k + 1):
+        acc_drop = compute_roar(
+            model_class=model_class,
+            X_train=deepcopy(X_train),
+            y_train=deepcopy(y_train),
+            X_test=deepcopy(X_test),
+            y_test=deepcopy(y_test),
+            explanations=deepcopy(explanations),
+            top_k=k,
+            baseline_value=baseline_value,
+            model_kwargs=deepcopy(model_kwargs)
+        )
+        curve[k] = acc_drop
+
     return curve
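
For orientation, a minimal usage sketch of the evaluation API above. The `adapter` wrapper, the train/test arrays, and the `explanations` list are placeholder assumptions (produced elsewhere with the package's adapters and explainers); the two calls follow the signatures defined in metrics.py.

    # Hedged sketch: `adapter`, X_train/X_test, y_train/y_test, and
    # `explanations` (a list of Explanation objects, one per instance)
    # are assumed to exist already.
    from sklearn.linear_model import LogisticRegression
    from explainiverse.evaluation.metrics import compute_aopc, compute_roar_curve

    # Per-instance faithfulness: mean prediction change while zeroing out
    # the top-10 attributed features (higher = more faithful).
    aopc = compute_aopc(adapter, X_test[0], explanations[0],
                        num_steps=10, baseline_value=0.0)

    # Remove-And-Retrain curve for k = 1..5, replacing removed features
    # with their training means and retraining a fresh model each time.
    roar = compute_roar_curve(
        model_class=LogisticRegression,
        X_train=X_train, y_train=y_train,
        X_test=X_test, y_test=y_test,
        explanations=explanations,
        max_k=5,
        baseline_value="mean",
    )
    print(aopc, roar)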
@@ -0,0 +1,38 @@
+# src/explainiverse/explainers/__init__.py
+"""
+Explainiverse Explainers - comprehensive XAI method implementations.
+
+Local Explainers (instance-level):
+- LIME: Local Interpretable Model-agnostic Explanations
+- SHAP: SHapley Additive exPlanations
+- Anchors: High-precision rule-based explanations
+- Counterfactual: Diverse counterfactual explanations
+
+Global Explainers (model-level):
+- Permutation Importance: Feature importance via permutation
+- Partial Dependence: Marginal feature effects (PDP)
+- ALE: Accumulated Local Effects (unbiased for correlated features)
+- SAGE: Shapley Additive Global importancE
+"""
+
+from explainiverse.explainers.attribution.lime_wrapper import LimeExplainer
+from explainiverse.explainers.attribution.shap_wrapper import ShapExplainer
+from explainiverse.explainers.rule_based.anchors_wrapper import AnchorsExplainer
+from explainiverse.explainers.counterfactual.dice_wrapper import CounterfactualExplainer
+from explainiverse.explainers.global_explainers.permutation_importance import PermutationImportanceExplainer
+from explainiverse.explainers.global_explainers.partial_dependence import PartialDependenceExplainer
+from explainiverse.explainers.global_explainers.ale import ALEExplainer
+from explainiverse.explainers.global_explainers.sage import SAGEExplainer
+
+__all__ = [
+    # Local explainers
+    "LimeExplainer",
+    "ShapExplainer",
+    "AnchorsExplainer",
+    "CounterfactualExplainer",
+    # Global explainers
+    "PermutationImportanceExplainer",
+    "PartialDependenceExplainer",
+    "ALEExplainer",
+    "SAGEExplainer",
+]
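
With these re-exports, all eight wrappers share a single import surface, confirmed by the `__all__` above:

    from explainiverse.explainers import (
        LimeExplainer, ShapExplainer, AnchorsExplainer, CounterfactualExplainer,
        PermutationImportanceExplainer, PartialDependenceExplainer,
        ALEExplainer, SAGEExplainer,
    )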
@@ -0,0 +1,9 @@
+# src/explainiverse/explainers/attribution/__init__.py
+"""
+Attribution-based explainers - feature importance explanations.
+"""
+
+from explainiverse.explainers.attribution.lime_wrapper import LimeExplainer
+from explainiverse.explainers.attribution.shap_wrapper import ShapExplainer
+
+__all__ = ["LimeExplainer", "ShapExplainer"]
@@ -1,63 +1,90 @@
-# src/explainiverse/explainers/attribution/lime_wrapper.py
-[the remaining 62 removed lines were not rendered in this diff view]
+# src/explainiverse/explainers/attribution/lime_wrapper.py
+"""
+LIME Explainer - Local Interpretable Model-agnostic Explanations.
+
+LIME explains individual predictions by fitting a simple interpretable
+model (linear regression) to perturbed samples around the instance.
+
+Reference:
+    Ribeiro, M.T., Singh, S., & Guestrin, C. (2016). "Why Should I Trust You?":
+    Explaining the Predictions of Any Classifier. KDD 2016.
+"""
+
+import numpy as np
+from lime.lime_tabular import LimeTabularExplainer
+
+from explainiverse.core.explainer import BaseExplainer
+from explainiverse.core.explanation import Explanation
+
+
+class LimeExplainer(BaseExplainer):
+    """
+    LIME explainer for local, model-agnostic explanations.
+
+    LIME (Local Interpretable Model-agnostic Explanations) explains individual
+    predictions by approximating the model locally with an interpretable model.
+    It generates perturbed samples around the instance and fits a weighted
+    linear model to understand feature contributions.
+
+    This implementation wraps the official LIME library for tabular data.
+
+    Attributes:
+        model: Model adapter with .predict() method
+        feature_names: List of feature names
+        class_names: List of class names
+        mode: 'classification' or 'regression'
+        explainer: The underlying LimeTabularExplainer
+    """
+
+    def __init__(self, model, training_data, feature_names, class_names, mode="classification"):
+        """
+        Initialize the LIME explainer.
+
+        Args:
+            model: A model adapter (implements .predict()).
+            training_data: The data used to initialize LIME (2D np.ndarray).
+                Used to compute statistics for perturbation generation.
+            feature_names: List of feature names.
+            class_names: List of class names.
+            mode: 'classification' or 'regression'.
+        """
+        super().__init__(model)
+        self.feature_names = list(feature_names)
+        self.class_names = list(class_names)
+        self.mode = mode
+
+        self.explainer = LimeTabularExplainer(
+            training_data=training_data,
+            feature_names=feature_names,
+            class_names=class_names,
+            mode=mode
+        )
+
+    def explain(self, instance, num_features=5, top_labels=1):
+        """
+        Generate a local explanation for the given instance.
+
+        Args:
+            instance: 1D numpy array (single row) to explain
+            num_features: Number of top features to include in explanation
+            top_labels: Number of top predicted labels to explain
+
+        Returns:
+            Explanation object with feature attributions
+        """
+        lime_exp = self.explainer.explain_instance(
+            data_row=instance,
+            predict_fn=self.model.predict,
+            num_features=num_features,
+            top_labels=top_labels
+        )
+
+        label_index = lime_exp.top_labels[0]
+        label_name = self.class_names[label_index]
+        attributions = dict(lime_exp.as_list(label=label_index))
+
+        return Explanation(
+            explainer_name="LIME",
+            target_class=label_name,
+            explanation_data={"feature_attributions": attributions}
+        )