explainiverse-0.5.0-py3-none-any.whl → explainiverse-0.7.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- explainiverse/__init__.py +1 -1
- explainiverse/core/registry.py +36 -0
- explainiverse/explainers/gradient/__init__.py +14 -0
- explainiverse/explainers/gradient/saliency.py +293 -0
- explainiverse/explainers/gradient/tcav.py +865 -0
- {explainiverse-0.5.0.dist-info → explainiverse-0.7.0.dist-info}/METADATA +97 -11
- {explainiverse-0.5.0.dist-info → explainiverse-0.7.0.dist-info}/RECORD +9 -7
- {explainiverse-0.5.0.dist-info → explainiverse-0.7.0.dist-info}/LICENSE +0 -0
- {explainiverse-0.5.0.dist-info → explainiverse-0.7.0.dist-info}/WHEEL +0 -0
explainiverse/__init__.py
CHANGED
explainiverse/core/registry.py
CHANGED
@@ -373,6 +373,8 @@ def _create_default_registry() -> ExplainerRegistry:
     from explainiverse.explainers.gradient.gradcam import GradCAMExplainer
     from explainiverse.explainers.gradient.deeplift import DeepLIFTExplainer, DeepLIFTShapExplainer
     from explainiverse.explainers.gradient.smoothgrad import SmoothGradExplainer
+    from explainiverse.explainers.gradient.saliency import SaliencyExplainer
+    from explainiverse.explainers.gradient.tcav import TCAVExplainer
     from explainiverse.explainers.example_based.protodash import ProtoDashExplainer

     registry = ExplainerRegistry()
@@ -551,6 +553,40 @@ def _create_default_registry() -> ExplainerRegistry:
         )
     )

+    # Register Saliency Maps (for neural networks)
+    registry.register(
+        name="saliency",
+        explainer_class=SaliencyExplainer,
+        meta=ExplainerMeta(
+            scope="local",
+            model_types=["neural"],
+            data_types=["tabular", "image"],
+            task_types=["classification", "regression"],
+            description="Saliency Maps - gradient-based feature attribution (requires PyTorch)",
+            paper_reference="Simonyan et al., 2014 - 'Deep Inside Convolutional Networks' (ICLR Workshop)",
+            complexity="O(forward_pass + backward_pass)",
+            requires_training_data=False,
+            supports_batching=True
+        )
+    )
+
+    # Register TCAV (Concept-based explanations for neural networks)
+    registry.register(
+        name="tcav",
+        explainer_class=TCAVExplainer,
+        meta=ExplainerMeta(
+            scope="local",
+            model_types=["neural"],
+            data_types=["tabular", "image"],
+            task_types=["classification"],
+            description="TCAV - Testing with Concept Activation Vectors for concept-based explanations (requires PyTorch)",
+            paper_reference="Kim et al., 2018 - 'Interpretability Beyond Feature Attribution: Quantitative Testing with Concept Activation Vectors' (ICML)",
+            complexity="O(n_concepts * n_test_inputs * forward_pass)",
+            requires_training_data=True,
+            supports_batching=True
+        )
+    )
+
     # =========================================================================
     # Global Explainers (model-level)
     # =========================================================================
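Both entries follow the registration pattern already used throughout _create_default_registry(). As a rough sketch of what this metadata schema looks like from a caller's side, assuming ExplainerRegistry and ExplainerMeta are importable from explainiverse.core.registry (an assumption; only the module's internals are shown above) and using a hypothetical MyGradientExplainer class:

from explainiverse.core.registry import ExplainerRegistry, ExplainerMeta  # assumed import path

registry = ExplainerRegistry()
registry.register(
    name="my_gradient_explainer",          # hypothetical third-party explainer
    explainer_class=MyGradientExplainer,   # placeholder class, not part of explainiverse
    meta=ExplainerMeta(
        scope="local",
        model_types=["neural"],
        data_types=["tabular"],
        task_types=["classification"],
        description="Example gradient explainer (requires PyTorch)",
        paper_reference="N/A",
        complexity="O(forward_pass + backward_pass)",
        requires_training_data=False,
        supports_batching=True
    )
)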
explainiverse/explainers/gradient/__init__.py
CHANGED

@@ -4,12 +4,23 @@ Gradient-based explainers for neural networks.

 These explainers require models that support gradient computation,
 typically via the PyTorchAdapter.
+
+Explainers:
+    - IntegratedGradientsExplainer: Axiomatic attributions via path integration
+    - GradCAMExplainer: Visual explanations for CNNs
+    - DeepLIFTExplainer: Reference-based attribution
+    - DeepLIFTShapExplainer: DeepLIFT + SHAP combination
+    - SmoothGradExplainer: Noise-averaged gradients
+    - SaliencyExplainer: Basic gradient attribution
+    - TCAVExplainer: Concept-based explanations (TCAV)
 """

 from explainiverse.explainers.gradient.integrated_gradients import IntegratedGradientsExplainer
 from explainiverse.explainers.gradient.gradcam import GradCAMExplainer
 from explainiverse.explainers.gradient.deeplift import DeepLIFTExplainer, DeepLIFTShapExplainer
 from explainiverse.explainers.gradient.smoothgrad import SmoothGradExplainer
+from explainiverse.explainers.gradient.saliency import SaliencyExplainer
+from explainiverse.explainers.gradient.tcav import TCAVExplainer, ConceptActivationVector

 __all__ = [
     "IntegratedGradientsExplainer",
@@ -17,4 +28,7 @@ __all__ = [
     "DeepLIFTExplainer",
     "DeepLIFTShapExplainer",
     "SmoothGradExplainer",
+    "SaliencyExplainer",
+    "TCAVExplainer",
+    "ConceptActivationVector",
 ]
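With these exports in place, the new explainers become importable straight from the gradient subpackage. A short usage sketch, mirroring the docstring example in saliency.py below; model, feature_names, and instance are placeholders the caller must supply:

from explainiverse.explainers.gradient import (
    SaliencyExplainer,
    TCAVExplainer,
    ConceptActivationVector,
)
from explainiverse.adapters import PyTorchAdapter

adapter = PyTorchAdapter(model, task="classification")
explainer = SaliencyExplainer(model=adapter, feature_names=feature_names)
explanation = explainer.explain(instance)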
explainiverse/explainers/gradient/saliency.py
ADDED

@@ -0,0 +1,293 @@
+# src/explainiverse/explainers/gradient/saliency.py
+"""
+Saliency Maps - Gradient-Based Feature Attribution.
+
+Saliency Maps compute feature attributions using the gradient of the output
+with respect to the input. This is one of the simplest and fastest gradient-based
+attribution methods, requiring only a single forward and backward pass.
+
+Key Properties:
+    - Simple: Just compute the gradient of output w.r.t. input
+    - Fast: Single forward + backward pass
+    - Foundation: Base method that other gradient methods build upon
+    - Variants: Absolute saliency, signed saliency, input × gradient
+
+Variants:
+    - Saliency (absolute): |∂f(x)/∂x| - magnitude of sensitivity
+    - Saliency (signed): ∂f(x)/∂x - direction and magnitude
+    - Input × Gradient: x ⊙ ∂f(x)/∂x - scaled by input values
+
+Reference:
+    Simonyan, K., Vedaldi, A., & Zisserman, A. (2014).
+    Deep Inside Convolutional Networks: Visualising Image Classification
+    Models and Saliency Maps.
+    ICLR Workshop 2014.
+    https://arxiv.org/abs/1312.6034
+
+Example:
+    from explainiverse.explainers.gradient import SaliencyExplainer
+    from explainiverse.adapters import PyTorchAdapter
+
+    adapter = PyTorchAdapter(model, task="classification")
+
+    explainer = SaliencyExplainer(
+        model=adapter,
+        feature_names=feature_names
+    )
+
+    explanation = explainer.explain(instance)
+"""
+
+import numpy as np
+from typing import List, Optional
+
+from explainiverse.core.explainer import BaseExplainer
+from explainiverse.core.explanation import Explanation
+
+
+class SaliencyExplainer(BaseExplainer):
+    """
+    Saliency Maps explainer for neural networks.
+
+    Computes attributions using the gradient of the model output with respect
+    to the input features. This is the simplest gradient-based attribution
+    method and serves as the foundation for more sophisticated techniques.
+
+    Algorithm:
+        Saliency(x) = ∂f(x)/∂x          (signed)
+        Saliency(x) = |∂f(x)/∂x|        (absolute, default)
+        InputTimesGradient(x) = x ⊙ ∂f(x)/∂x
+
+    Attributes:
+        model: Model adapter with predict_with_gradients() method
+        feature_names: List of feature names
+        class_names: List of class names (for classification)
+        absolute_value: Whether to take absolute value of gradients
+
+    Example:
+        >>> explainer = SaliencyExplainer(adapter, feature_names)
+        >>> explanation = explainer.explain(instance)
+        >>> print(explanation.explanation_data["feature_attributions"])
+    """
+
+    def __init__(
+        self,
+        model,
+        feature_names: List[str],
+        class_names: Optional[List[str]] = None,
+        absolute_value: bool = True
+    ):
+        """
+        Initialize the Saliency explainer.
+
+        Args:
+            model: A model adapter with predict_with_gradients() method.
+                Use PyTorchAdapter for PyTorch models.
+            feature_names: List of input feature names.
+            class_names: List of class names (for classification tasks).
+            absolute_value: If True (default), return absolute value of
+                gradients. Set to False for signed saliency.
+
+        Raises:
+            TypeError: If model doesn't have predict_with_gradients method.
+        """
+        super().__init__(model)
+
+        # Validate model has gradient capability
+        if not hasattr(model, 'predict_with_gradients'):
+            raise TypeError(
+                "Model adapter must have predict_with_gradients() method. "
+                "Use PyTorchAdapter for PyTorch models."
+            )
+
+        self.feature_names = list(feature_names)
+        self.class_names = list(class_names) if class_names else None
+        self.absolute_value = absolute_value
+
+    def _compute_saliency(
+        self,
+        instance: np.ndarray,
+        target_class: Optional[int] = None,
+        method: str = "saliency"
+    ) -> np.ndarray:
+        """
+        Compute saliency attributions for a single instance.
+
+        Args:
+            instance: Input instance (1D array).
+            target_class: Target class for gradient computation.
+            method: Attribution method:
+                - "saliency": Raw gradient (default)
+                - "input_times_gradient": Gradient multiplied by input
+
+        Returns:
+            Array of attribution scores for each input feature.
+        """
+        instance = instance.flatten().astype(np.float32)
+
+        # Compute gradient
+        _, gradients = self.model.predict_with_gradients(
+            instance.reshape(1, -1),
+            target_class=target_class
+        )
+        gradients = gradients.flatten()
+
+        # Apply method
+        if method == "saliency":
+            attributions = gradients
+        elif method == "input_times_gradient":
+            attributions = instance * gradients
+        else:
+            raise ValueError(
+                f"Unknown method: '{method}'. "
+                f"Use 'saliency' or 'input_times_gradient'."
+            )
+
+        # Apply absolute value if configured
+        if self.absolute_value and method == "saliency":
+            attributions = np.abs(attributions)
+
+        return attributions
+
+    def explain(
+        self,
+        instance: np.ndarray,
+        target_class: Optional[int] = None,
+        method: str = "saliency"
+    ) -> Explanation:
+        """
+        Generate Saliency explanation for an instance.
+
+        Args:
+            instance: 1D numpy array of input features.
+            target_class: For classification, which class to explain.
+                If None, uses the predicted class.
+            method: Attribution method:
+                - "saliency": Gradient-based saliency (default)
+                - "input_times_gradient": Gradient × input
+
+        Returns:
+            Explanation object with feature attributions.
+
+        Example:
+            >>> explanation = explainer.explain(instance)
+            >>> print(explanation.explanation_data["feature_attributions"])
+        """
+        instance = np.array(instance).flatten().astype(np.float32)
+
+        # Determine target class if not specified
+        if target_class is None and self.class_names:
+            predictions = self.model.predict(instance.reshape(1, -1))
+            target_class = int(np.argmax(predictions))
+
+        # Compute saliency
+        attributions = self._compute_saliency(instance, target_class, method)
+
+        # Build attributions dict
+        attributions_dict = {
+            fname: float(attributions[i])
+            for i, fname in enumerate(self.feature_names)
+        }
+
+        # Determine explainer name based on method
+        if method == "saliency":
+            explainer_name = "Saliency"
+        elif method == "input_times_gradient":
+            explainer_name = "InputTimesGradient"
+        else:
+            explainer_name = f"Saliency_{method}"
+
+        # Determine class name
+        if self.class_names and target_class is not None:
+            label_name = self.class_names[target_class]
+        else:
+            label_name = f"class_{target_class}" if target_class is not None else "output"
+
+        explanation_data = {
+            "feature_attributions": attributions_dict,
+            "attributions_raw": attributions.tolist(),
+            "method": method,
+            "absolute_value": self.absolute_value if method == "saliency" else False
+        }
+
+        return Explanation(
+            explainer_name=explainer_name,
+            target_class=label_name,
+            explanation_data=explanation_data
+        )
+
+    def explain_batch(
+        self,
+        X: np.ndarray,
+        target_class: Optional[int] = None,
+        method: str = "saliency"
+    ) -> List[Explanation]:
+        """
+        Generate explanations for multiple instances.
+
+        Args:
+            X: 2D numpy array of instances (n_samples, n_features),
+                or 1D array for single instance.
+            target_class: Target class for all instances. If None,
+                uses predicted class for each instance.
+            method: Attribution method (see explain()).
+
+        Returns:
+            List of Explanation objects.
+
+        Example:
+            >>> explanations = explainer.explain_batch(X_test[:10])
+            >>> for exp in explanations:
+            ...     print(exp.target_class)
+        """
+        X = np.array(X)
+        if X.ndim == 1:
+            X = X.reshape(1, -1)
+
+        return [
+            self.explain(X[i], target_class=target_class, method=method)
+            for i in range(X.shape[0])
+        ]
+
+    def compute_all_variants(
+        self,
+        instance: np.ndarray,
+        target_class: Optional[int] = None
+    ) -> dict:
+        """
+        Compute all saliency variants for comparison.
+
+        Useful for analyzing which variant provides the best explanation
+        for a given instance or model architecture.
+
+        Args:
+            instance: Input instance.
+            target_class: Target class for gradient computation.
+
+        Returns:
+            Dictionary containing:
+                - saliency_absolute: |∂f/∂x|
+                - saliency_signed: ∂f/∂x
+                - input_times_gradient: x ⊙ ∂f/∂x
+        """
+        instance = np.array(instance).flatten().astype(np.float32)
+
+        # Determine target class
+        if target_class is None and self.class_names:
+            predictions = self.model.predict(instance.reshape(1, -1))
+            target_class = int(np.argmax(predictions))
+
+        # Compute gradient (only once)
+        _, gradients = self.model.predict_with_gradients(
+            instance.reshape(1, -1),
+            target_class=target_class
+        )
+        gradients = gradients.flatten()
+
+        return {
+            "saliency_absolute": np.abs(gradients).tolist(),
+            "saliency_signed": gradients.tolist(),
+            "input_times_gradient": (instance * gradients).tolist(),
+            "feature_names": self.feature_names,
+            "target_class": target_class
+        }