explainiverse 0.5.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
explainiverse/__init__.py CHANGED
@@ -33,7 +33,7 @@ from explainiverse.adapters.sklearn_adapter import SklearnAdapter
  from explainiverse.adapters import TORCH_AVAILABLE
  from explainiverse.engine.suite import ExplanationSuite
 
- __version__ = "0.5.0"
+ __version__ = "0.7.0"
 
  __all__ = [
      # Core
@@ -373,6 +373,8 @@ def _create_default_registry() -> ExplainerRegistry:
      from explainiverse.explainers.gradient.gradcam import GradCAMExplainer
      from explainiverse.explainers.gradient.deeplift import DeepLIFTExplainer, DeepLIFTShapExplainer
      from explainiverse.explainers.gradient.smoothgrad import SmoothGradExplainer
+     from explainiverse.explainers.gradient.saliency import SaliencyExplainer
+     from explainiverse.explainers.gradient.tcav import TCAVExplainer
      from explainiverse.explainers.example_based.protodash import ProtoDashExplainer
 
      registry = ExplainerRegistry()
@@ -551,6 +553,40 @@ def _create_default_registry() -> ExplainerRegistry:
          )
      )
 
+     # Register Saliency Maps (for neural networks)
+     registry.register(
+         name="saliency",
+         explainer_class=SaliencyExplainer,
+         meta=ExplainerMeta(
+             scope="local",
+             model_types=["neural"],
+             data_types=["tabular", "image"],
+             task_types=["classification", "regression"],
+             description="Saliency Maps - gradient-based feature attribution (requires PyTorch)",
+             paper_reference="Simonyan et al., 2014 - 'Deep Inside Convolutional Networks' (ICLR Workshop)",
+             complexity="O(forward_pass + backward_pass)",
+             requires_training_data=False,
+             supports_batching=True
+         )
+     )
+
+     # Register TCAV (Concept-based explanations for neural networks)
+     registry.register(
+         name="tcav",
+         explainer_class=TCAVExplainer,
+         meta=ExplainerMeta(
+             scope="local",
+             model_types=["neural"],
+             data_types=["tabular", "image"],
+             task_types=["classification"],
+             description="TCAV - Testing with Concept Activation Vectors for concept-based explanations (requires PyTorch)",
+             paper_reference="Kim et al., 2018 - 'Interpretability Beyond Feature Attribution: Quantitative Testing with Concept Activation Vectors' (ICML)",
+             complexity="O(n_concepts * n_test_inputs * forward_pass)",
+             requires_training_data=True,
+             supports_batching=True
+         )
+     )
+
      # =========================================================================
      # Global Explainers (model-level)
      # =========================================================================
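The two registrations above extend the default registry using the same registry.register(...) / ExplainerMeta(...) pattern as the existing entries. As a rough sketch of that pattern for a downstream explainer: the explainiverse.core.registry import path is assumed (this hunk shows only how ExplainerRegistry and ExplainerMeta are used), and DummyExplainer is a hypothetical placeholder, not part of the package.

# Sketch of the registration pattern shown above. The explainiverse.core.registry
# import path is assumed, and DummyExplainer is a hypothetical placeholder class.
from explainiverse.core.explainer import BaseExplainer
from explainiverse.core.explanation import Explanation
from explainiverse.core.registry import ExplainerRegistry, ExplainerMeta  # assumed path


class DummyExplainer(BaseExplainer):
    """Hypothetical explainer, used only to illustrate registration."""

    def __init__(self, model, feature_names):
        super().__init__(model)
        self.feature_names = list(feature_names)

    def explain(self, instance):
        # A real explainer would compute scores; this one attributes 0.0 everywhere.
        return Explanation(
            explainer_name="Dummy",
            target_class="output",
            explanation_data={"feature_attributions": {f: 0.0 for f in self.feature_names}},
        )


registry = ExplainerRegistry()
registry.register(
    name="dummy",
    explainer_class=DummyExplainer,
    meta=ExplainerMeta(
        scope="local",
        model_types=["neural"],
        data_types=["tabular"],
        task_types=["classification"],
        description="Hypothetical explainer illustrating the registration pattern",
        paper_reference="n/a",
        complexity="O(1)",
        requires_training_data=False,
        supports_batching=False,
    )
)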
explainiverse/explainers/gradient/__init__.py CHANGED
@@ -4,12 +4,23 @@ Gradient-based explainers for neural networks.
 
  These explainers require models that support gradient computation,
  typically via the PyTorchAdapter.
+
+ Explainers:
+     - IntegratedGradientsExplainer: Axiomatic attributions via path integration
+     - GradCAMExplainer: Visual explanations for CNNs
+     - DeepLIFTExplainer: Reference-based attribution
+     - DeepLIFTShapExplainer: DeepLIFT + SHAP combination
+     - SmoothGradExplainer: Noise-averaged gradients
+     - SaliencyExplainer: Basic gradient attribution
+     - TCAVExplainer: Concept-based explanations (TCAV)
  """
 
  from explainiverse.explainers.gradient.integrated_gradients import IntegratedGradientsExplainer
  from explainiverse.explainers.gradient.gradcam import GradCAMExplainer
  from explainiverse.explainers.gradient.deeplift import DeepLIFTExplainer, DeepLIFTShapExplainer
  from explainiverse.explainers.gradient.smoothgrad import SmoothGradExplainer
+ from explainiverse.explainers.gradient.saliency import SaliencyExplainer
+ from explainiverse.explainers.gradient.tcav import TCAVExplainer, ConceptActivationVector
 
  __all__ = [
      "IntegratedGradientsExplainer",
@@ -17,4 +28,7 @@ __all__ = [
      "DeepLIFTExplainer",
      "DeepLIFTShapExplainer",
      "SmoothGradExplainer",
+     "SaliencyExplainer",
+     "TCAVExplainer",
+     "ConceptActivationVector",
  ]
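With these re-exports in place, the new classes are importable directly from explainiverse.explainers.gradient. A minimal usage sketch based on the adapter and constructor calls shown in the saliency module added below; the toy model, feature names, class names, and instance are placeholders, and it assumes PyTorchAdapter accepts a plain torch.nn module for tabular input.

import numpy as np
import torch.nn as nn

from explainiverse.adapters import PyTorchAdapter
from explainiverse.explainers.gradient import SaliencyExplainer

# Placeholder model and data, for illustration only.
model = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 2))
feature_names = ["f0", "f1", "f2", "f3"]
instance = np.random.rand(4).astype(np.float32)

# Wrap the model so the explainer can request gradients (as in the module docstring).
adapter = PyTorchAdapter(model, task="classification")
explainer = SaliencyExplainer(
    model=adapter,
    feature_names=feature_names,
    class_names=["class_0", "class_1"],
)

explanation = explainer.explain(instance)
print(explanation.explanation_data["feature_attributions"])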
explainiverse/explainers/gradient/saliency.py ADDED
@@ -0,0 +1,293 @@
+ # src/explainiverse/explainers/gradient/saliency.py
+ """
+ Saliency Maps - Gradient-Based Feature Attribution.
+
+ Saliency Maps compute feature attributions using the gradient of the output
+ with respect to the input. This is one of the simplest and fastest gradient-based
+ attribution methods, requiring only a single forward and backward pass.
+
+ Key Properties:
+     - Simple: Just compute the gradient of output w.r.t. input
+     - Fast: Single forward + backward pass
+     - Foundation: Base method that other gradient methods build upon
+     - Variants: Absolute saliency, signed saliency, input × gradient
+
+ Variants:
+     - Saliency (absolute): |∂f(x)/∂x| - magnitude of sensitivity
+     - Saliency (signed): ∂f(x)/∂x - direction and magnitude
+     - Input × Gradient: x ⊙ ∂f(x)/∂x - scaled by input values
+
+ Reference:
+     Simonyan, K., Vedaldi, A., & Zisserman, A. (2014).
+     Deep Inside Convolutional Networks: Visualising Image Classification
+     Models and Saliency Maps.
+     ICLR Workshop 2014.
+     https://arxiv.org/abs/1312.6034
+
+ Example:
+     from explainiverse.explainers.gradient import SaliencyExplainer
+     from explainiverse.adapters import PyTorchAdapter
+
+     adapter = PyTorchAdapter(model, task="classification")
+
+     explainer = SaliencyExplainer(
+         model=adapter,
+         feature_names=feature_names
+     )
+
+     explanation = explainer.explain(instance)
+ """
+
+ import numpy as np
+ from typing import List, Optional
+
+ from explainiverse.core.explainer import BaseExplainer
+ from explainiverse.core.explanation import Explanation
+
+
+ class SaliencyExplainer(BaseExplainer):
+     """
+     Saliency Maps explainer for neural networks.
+
+     Computes attributions using the gradient of the model output with respect
+     to the input features. This is the simplest gradient-based attribution
+     method and serves as the foundation for more sophisticated techniques.
+
+     Algorithm:
+         Saliency(x) = ∂f(x)/∂x (signed)
+         Saliency(x) = |∂f(x)/∂x| (absolute, default)
+         InputTimesGradient(x) = x ⊙ ∂f(x)/∂x
+
+     Attributes:
+         model: Model adapter with predict_with_gradients() method
+         feature_names: List of feature names
+         class_names: List of class names (for classification)
+         absolute_value: Whether to take absolute value of gradients
+
+     Example:
+         >>> explainer = SaliencyExplainer(adapter, feature_names)
+         >>> explanation = explainer.explain(instance)
+         >>> print(explanation.explanation_data["feature_attributions"])
+     """
+
+     def __init__(
+         self,
+         model,
+         feature_names: List[str],
+         class_names: Optional[List[str]] = None,
+         absolute_value: bool = True
+     ):
+         """
+         Initialize the Saliency explainer.
+
+         Args:
+             model: A model adapter with predict_with_gradients() method.
+                 Use PyTorchAdapter for PyTorch models.
+             feature_names: List of input feature names.
+             class_names: List of class names (for classification tasks).
+             absolute_value: If True (default), return absolute value of
+                 gradients. Set to False for signed saliency.
+
+         Raises:
+             TypeError: If model doesn't have predict_with_gradients method.
+         """
+         super().__init__(model)
+
+         # Validate model has gradient capability
+         if not hasattr(model, 'predict_with_gradients'):
+             raise TypeError(
+                 "Model adapter must have predict_with_gradients() method. "
+                 "Use PyTorchAdapter for PyTorch models."
+             )
+
+         self.feature_names = list(feature_names)
+         self.class_names = list(class_names) if class_names else None
+         self.absolute_value = absolute_value
+
+     def _compute_saliency(
+         self,
+         instance: np.ndarray,
+         target_class: Optional[int] = None,
+         method: str = "saliency"
+     ) -> np.ndarray:
+         """
+         Compute saliency attributions for a single instance.
+
+         Args:
+             instance: Input instance (1D array).
+             target_class: Target class for gradient computation.
+             method: Attribution method:
+                 - "saliency": Raw gradient (default)
+                 - "input_times_gradient": Gradient multiplied by input
+
+         Returns:
+             Array of attribution scores for each input feature.
+         """
+         instance = instance.flatten().astype(np.float32)
+
+         # Compute gradient
+         _, gradients = self.model.predict_with_gradients(
+             instance.reshape(1, -1),
+             target_class=target_class
+         )
+         gradients = gradients.flatten()
+
+         # Apply method
+         if method == "saliency":
+             attributions = gradients
+         elif method == "input_times_gradient":
+             attributions = instance * gradients
+         else:
+             raise ValueError(
+                 f"Unknown method: '{method}'. "
+                 f"Use 'saliency' or 'input_times_gradient'."
+             )
+
+         # Apply absolute value if configured
+         if self.absolute_value and method == "saliency":
+             attributions = np.abs(attributions)
+
+         return attributions
+
+     def explain(
+         self,
+         instance: np.ndarray,
+         target_class: Optional[int] = None,
+         method: str = "saliency"
+     ) -> Explanation:
+         """
+         Generate Saliency explanation for an instance.
+
+         Args:
+             instance: 1D numpy array of input features.
+             target_class: For classification, which class to explain.
+                 If None, uses the predicted class.
+             method: Attribution method:
+                 - "saliency": Gradient-based saliency (default)
+                 - "input_times_gradient": Gradient × input
+
+         Returns:
+             Explanation object with feature attributions.
+
+         Example:
+             >>> explanation = explainer.explain(instance)
+             >>> print(explanation.explanation_data["feature_attributions"])
+         """
+         instance = np.array(instance).flatten().astype(np.float32)
+
+         # Determine target class if not specified
+         if target_class is None and self.class_names:
+             predictions = self.model.predict(instance.reshape(1, -1))
+             target_class = int(np.argmax(predictions))
+
+         # Compute saliency
+         attributions = self._compute_saliency(instance, target_class, method)
+
+         # Build attributions dict
+         attributions_dict = {
+             fname: float(attributions[i])
+             for i, fname in enumerate(self.feature_names)
+         }
+
+         # Determine explainer name based on method
+         if method == "saliency":
+             explainer_name = "Saliency"
+         elif method == "input_times_gradient":
+             explainer_name = "InputTimesGradient"
+         else:
+             explainer_name = f"Saliency_{method}"
+
+         # Determine class name
+         if self.class_names and target_class is not None:
+             label_name = self.class_names[target_class]
+         else:
+             label_name = f"class_{target_class}" if target_class is not None else "output"
+
+         explanation_data = {
+             "feature_attributions": attributions_dict,
+             "attributions_raw": attributions.tolist(),
+             "method": method,
+             "absolute_value": self.absolute_value if method == "saliency" else False
+         }
+
+         return Explanation(
+             explainer_name=explainer_name,
+             target_class=label_name,
+             explanation_data=explanation_data
+         )
+
+     def explain_batch(
+         self,
+         X: np.ndarray,
+         target_class: Optional[int] = None,
+         method: str = "saliency"
+     ) -> List[Explanation]:
+         """
+         Generate explanations for multiple instances.
+
+         Args:
+             X: 2D numpy array of instances (n_samples, n_features),
+                 or 1D array for single instance.
+             target_class: Target class for all instances. If None,
+                 uses predicted class for each instance.
+             method: Attribution method (see explain()).
+
+         Returns:
+             List of Explanation objects.
+
+         Example:
+             >>> explanations = explainer.explain_batch(X_test[:10])
+             >>> for exp in explanations:
+             ...     print(exp.target_class)
+         """
+         X = np.array(X)
+         if X.ndim == 1:
+             X = X.reshape(1, -1)
+
+         return [
+             self.explain(X[i], target_class=target_class, method=method)
+             for i in range(X.shape[0])
+         ]
+
+     def compute_all_variants(
+         self,
+         instance: np.ndarray,
+         target_class: Optional[int] = None
+     ) -> dict:
+         """
+         Compute all saliency variants for comparison.
+
+         Useful for analyzing which variant provides the best explanation
+         for a given instance or model architecture.
+
+         Args:
+             instance: Input instance.
+             target_class: Target class for gradient computation.
+
+         Returns:
+             Dictionary containing:
+                 - saliency_absolute: |∂f/∂x|
+                 - saliency_signed: ∂f/∂x
+                 - input_times_gradient: x ⊙ ∂f/∂x
+         """
+         instance = np.array(instance).flatten().astype(np.float32)
+
+         # Determine target class
+         if target_class is None and self.class_names:
+             predictions = self.model.predict(instance.reshape(1, -1))
+             target_class = int(np.argmax(predictions))
+
+         # Compute gradient (only once)
+         _, gradients = self.model.predict_with_gradients(
+             instance.reshape(1, -1),
+             target_class=target_class
+         )
+         gradients = gradients.flatten()
+
+         return {
+             "saliency_absolute": np.abs(gradients).tolist(),
+             "saliency_signed": gradients.tolist(),
+             "input_times_gradient": (instance * gradients).tolist(),
+             "feature_names": self.feature_names,
+             "target_class": target_class
+         }
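Building on the docstrings above, a rough sketch of the batch and variant-comparison APIs (explain_batch and compute_all_variants). The toy model, feature names, class names, and data are placeholders, and the sketch assumes PyTorchAdapter accepts a plain torch.nn module for tabular input, as in the module docstring's example.

import numpy as np
import torch.nn as nn

from explainiverse.adapters import PyTorchAdapter
from explainiverse.explainers.gradient import SaliencyExplainer

# Placeholder model and data, for illustration only.
model = nn.Sequential(nn.Linear(3, 6), nn.ReLU(), nn.Linear(6, 2))
X_test = np.random.rand(5, 3).astype(np.float32)

adapter = PyTorchAdapter(model, task="classification")
explainer = SaliencyExplainer(
    model=adapter,
    feature_names=["age", "dose", "weight"],   # placeholder feature names
    class_names=["negative", "positive"],
    absolute_value=False,                      # signed gradients instead of |gradients|
)

# One explanation per row, using the input-times-gradient variant.
explanations = explainer.explain_batch(X_test, method="input_times_gradient")
for exp in explanations:
    print(exp.target_class, exp.explanation_data["feature_attributions"])

# All three variants from a single gradient computation, keyed as documented
# in compute_all_variants().
variants = explainer.compute_all_variants(X_test[0])
print(variants["saliency_absolute"])
print(variants["saliency_signed"])
print(variants["input_times_gradient"])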