explainiverse 0.8.1.tar.gz → 0.8.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. {explainiverse-0.8.1 → explainiverse-0.8.3}/PKG-INFO +1 -1
  2. {explainiverse-0.8.1 → explainiverse-0.8.3}/pyproject.toml +1 -1
  3. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/__init__.py +1 -1
  4. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/evaluation/__init__.py +20 -3
  5. explainiverse-0.8.3/src/explainiverse/evaluation/faithfulness_extended.py +585 -0
  6. {explainiverse-0.8.1 → explainiverse-0.8.3}/LICENSE +0 -0
  7. {explainiverse-0.8.1 → explainiverse-0.8.3}/README.md +0 -0
  8. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/adapters/__init__.py +0 -0
  9. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/adapters/base_adapter.py +0 -0
  10. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/adapters/pytorch_adapter.py +0 -0
  11. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/adapters/sklearn_adapter.py +0 -0
  12. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/core/__init__.py +0 -0
  13. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/core/explainer.py +0 -0
  14. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/core/explanation.py +0 -0
  15. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/core/registry.py +0 -0
  16. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/engine/__init__.py +0 -0
  17. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/engine/suite.py +0 -0
  18. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/evaluation/_utils.py +0 -0
  19. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/evaluation/faithfulness.py +0 -0
  20. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/evaluation/metrics.py +0 -0
  21. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/evaluation/stability.py +0 -0
  22. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/explainers/__init__.py +0 -0
  23. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/explainers/attribution/__init__.py +0 -0
  24. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/explainers/attribution/lime_wrapper.py +0 -0
  25. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/explainers/attribution/shap_wrapper.py +0 -0
  26. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/explainers/attribution/treeshap_wrapper.py +0 -0
  27. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/explainers/counterfactual/__init__.py +0 -0
  28. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/explainers/counterfactual/dice_wrapper.py +0 -0
  29. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/explainers/example_based/__init__.py +0 -0
  30. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/explainers/example_based/protodash.py +0 -0
  31. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/explainers/global_explainers/__init__.py +0 -0
  32. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/explainers/global_explainers/ale.py +0 -0
  33. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/explainers/global_explainers/partial_dependence.py +0 -0
  34. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/explainers/global_explainers/permutation_importance.py +0 -0
  35. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/explainers/global_explainers/sage.py +0 -0
  36. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/explainers/gradient/__init__.py +0 -0
  37. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/explainers/gradient/deeplift.py +0 -0
  38. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/explainers/gradient/gradcam.py +0 -0
  39. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/explainers/gradient/integrated_gradients.py +0 -0
  40. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/explainers/gradient/lrp.py +0 -0
  41. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/explainers/gradient/saliency.py +0 -0
  42. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/explainers/gradient/smoothgrad.py +0 -0
  43. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/explainers/gradient/tcav.py +0 -0
  44. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/explainers/rule_based/__init__.py +0 -0
  45. {explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/explainers/rule_based/anchors_wrapper.py +0 -0

{explainiverse-0.8.1 → explainiverse-0.8.3}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: explainiverse
-Version: 0.8.1
+Version: 0.8.3
 Summary: Unified, extensible explainability framework supporting 18 XAI methods including LIME, SHAP, LRP, TCAV, GradCAM, and more
 Home-page: https://github.com/jemsbhai/explainiverse
 License: MIT

{explainiverse-0.8.1 → explainiverse-0.8.3}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "explainiverse"
-version = "0.8.1"
+version = "0.8.3"
 description = "Unified, extensible explainability framework supporting 18 XAI methods including LIME, SHAP, LRP, TCAV, GradCAM, and more"
 authors = ["Muntaser Syed <jemsbhai@gmail.com>"]
 license = "MIT"

{explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/__init__.py
@@ -34,7 +34,7 @@ from explainiverse.adapters.sklearn_adapter import SklearnAdapter
 from explainiverse.adapters import TORCH_AVAILABLE
 from explainiverse.engine.suite import ExplanationSuite
 
-__version__ = "0.8.1"
+__version__ = "0.8.3"
 
 __all__ = [
     # Core

{explainiverse-0.8.1 → explainiverse-0.8.3}/src/explainiverse/evaluation/__init__.py
@@ -3,9 +3,10 @@
 Evaluation metrics for explanation quality.
 
 Includes:
-- Faithfulness metrics (PGI, PGU, Comprehensiveness, Sufficiency)
+- Faithfulness metrics (PGI, PGU, Comprehensiveness, Sufficiency, Faithfulness Estimate)
 - Stability metrics (RIS, ROS, Lipschitz)
 - Perturbation metrics (AOPC, ROAR)
+- Extended faithfulness metrics (Phase 1 expansion)
 """
 
 from explainiverse.evaluation.metrics import (
@@ -35,13 +36,22 @@ from explainiverse.evaluation.stability import (
     compare_explainer_stability,
 )
 
+from explainiverse.evaluation.faithfulness_extended import (
+    compute_faithfulness_estimate,
+    compute_batch_faithfulness_estimate,
+    compute_monotonicity,
+    compute_batch_monotonicity,
+    compute_monotonicity_nguyen,
+    compute_batch_monotonicity_nguyen,
+)
+
 __all__ = [
     # Perturbation metrics (existing)
     "compute_aopc",
     "compute_batch_aopc",
     "compute_roar",
     "compute_roar_curve",
-    # Faithfulness metrics (new)
+    # Faithfulness metrics (core)
     "compute_pgi",
     "compute_pgu",
     "compute_faithfulness_score",
@@ -50,11 +60,18 @@ __all__ = [
     "compute_faithfulness_correlation",
     "compare_explainer_faithfulness",
     "compute_batch_faithfulness",
-    # Stability metrics (new)
+    # Stability metrics
     "compute_ris",
     "compute_ros",
     "compute_lipschitz_estimate",
     "compute_stability_metrics",
     "compute_batch_stability",
     "compare_explainer_stability",
+    # Extended faithfulness metrics (Phase 1)
+    "compute_faithfulness_estimate",
+    "compute_batch_faithfulness_estimate",
+    "compute_monotonicity",
+    "compute_batch_monotonicity",
+    "compute_monotonicity_nguyen",
+    "compute_batch_monotonicity_nguyen",
 ]
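
The net effect of the export changes above is that the six Phase 1 functions become importable directly from explainiverse.evaluation. A minimal import sketch (it assumes explainiverse 0.8.3 is installed from PyPI; the names are taken verbatim from the __all__ additions above):

from explainiverse.evaluation import (
    compute_faithfulness_estimate,
    compute_batch_faithfulness_estimate,
    compute_monotonicity,
    compute_batch_monotonicity,
    compute_monotonicity_nguyen,
    compute_batch_monotonicity_nguyen,
)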

explainiverse-0.8.3/src/explainiverse/evaluation/faithfulness_extended.py (new file)
@@ -0,0 +1,585 @@
+# src/explainiverse/evaluation/faithfulness_extended.py
+"""
+Extended faithfulness evaluation metrics.
+
+Phase 1 metrics for exceeding OpenXAI/Quantus:
+- Faithfulness Estimate (Alvarez-Melis et al., 2018)
+- Monotonicity (Arya et al., 2019)
+- Monotonicity-Nguyen (Nguyen et al., 2020)
+- Pixel Flipping (Bach et al., 2015)
+- Region Perturbation (Samek et al., 2015)
+- Selectivity (Montavon et al., 2018)
+- Sensitivity-n (Ancona et al., 2018)
+- IROF (Rieger & Hansen, 2020)
+- Infidelity (Yeh et al., 2019)
+- ROAD (Rong et al., 2022)
+- Insertion AUC (Petsiuk et al., 2018)
+- Deletion AUC (Petsiuk et al., 2018)
+"""
+import numpy as np
+import re
+from typing import Union, Callable, List, Dict, Optional, Tuple
+from scipy import stats
+
+from explainiverse.core.explanation import Explanation
+from explainiverse.evaluation._utils import (
+    get_sorted_feature_indices,
+    compute_baseline_values,
+    apply_feature_mask,
+    resolve_k,
+    get_prediction_value,
+    compute_prediction_change,
+)
+
+
+def _extract_attribution_array(
+    explanation: Explanation,
+    n_features: int
+) -> np.ndarray:
+    """
+    Extract attribution values as a numpy array in feature index order.
+
+    Args:
+        explanation: Explanation object with feature_attributions
+        n_features: Expected number of features
+
+    Returns:
+        1D numpy array of attribution values ordered by feature index
+    """
+    attributions = explanation.explanation_data.get("feature_attributions", {})
+    feature_names = getattr(explanation, 'feature_names', None)
+
+    if not attributions:
+        raise ValueError("No feature attributions found in explanation.")
+
+    # Build attribution array in feature order
+    attr_array = np.zeros(n_features)
+
+    if feature_names is not None:
+        for fname, value in attributions.items():
+            # Try to find the index for this feature name
+            for i, fn in enumerate(feature_names):
+                if fn == fname or fn in fname or fname in fn:
+                    attr_array[i] = value
+                    break
+            else:
+                # Try extracting index from name pattern
+                for pattern in [r'feature[_\s]*(\d+)', r'feat[_\s]*(\d+)', r'^f(\d+)', r'^x(\d+)']:
+                    match = re.search(pattern, fname, re.IGNORECASE)
+                    if match:
+                        idx = int(match.group(1))
+                        if 0 <= idx < n_features:
+                            attr_array[idx] = value
+                        break
+    else:
+        # No feature names - try to extract indices from keys
+        for fname, value in attributions.items():
+            for pattern in [r'feature[_\s]*(\d+)', r'feat[_\s]*(\d+)', r'^f(\d+)', r'^x(\d+)']:
+                match = re.search(pattern, fname, re.IGNORECASE)
+                if match:
+                    idx = int(match.group(1))
+                    if 0 <= idx < n_features:
+                        attr_array[idx] = value
+                    break
+
+    return attr_array
+
+
+# =============================================================================
+# Metric 1: Faithfulness Estimate (Alvarez-Melis & Jaakkola, 2018)
+# =============================================================================
+
+def compute_faithfulness_estimate(
+    model,
+    instance: np.ndarray,
+    explanation: Explanation,
+    baseline: Union[str, float, np.ndarray, Callable] = "mean",
+    background_data: np.ndarray = None,
+    subset_size: int = None,
+    n_subsets: int = 100,
+    seed: int = None,
+) -> float:
+    """
+    Compute Faithfulness Estimate (Alvarez-Melis & Jaakkola, 2018).
+
+    Measures the correlation between feature attributions and the actual
+    impact on predictions when individual features are perturbed. For each
+    feature, computes the prediction change when that feature is replaced
+    with baseline, then correlates these changes with attribution magnitudes.
+
+    Higher correlation indicates the explanation correctly identifies
+    which features actually matter for the prediction.
+
+    Args:
+        model: Model adapter with predict/predict_proba method
+        instance: Input instance (1D array)
+        explanation: Explanation object with feature_attributions
+        baseline: Baseline for feature replacement ("mean", "median", scalar, array, callable)
+        background_data: Reference data for computing baseline (required for "mean"/"median")
+        subset_size: Size of random subsets to perturb (default: 1 for single-feature)
+        n_subsets: Number of random subsets to evaluate (used when subset_size > 1)
+        seed: Random seed for reproducibility
+
+    Returns:
+        Faithfulness estimate score (Pearson correlation, -1 to 1, higher is better)
+
+    References:
+        Alvarez-Melis, D., & Jaakkola, T. S. (2018). Towards Robust Interpretability
+        with Self-Explaining Neural Networks. NeurIPS.
+    """
+    if seed is not None:
+        np.random.seed(seed)
+
+    instance = np.asarray(instance).flatten()
+    n_features = len(instance)
+
+    # Get baseline values
+    baseline_values = compute_baseline_values(
+        baseline, background_data, n_features
+    )
+
+    # Extract attributions as array
+    attr_array = _extract_attribution_array(explanation, n_features)
+
+    # Default subset_size is 1 (single-feature perturbation)
+    if subset_size is None:
+        subset_size = 1
+
+    if subset_size == 1:
+        # Single-feature perturbation: evaluate each feature individually
+        prediction_changes = []
+        attribution_values = []
+
+        for i in range(n_features):
+            # Skip features with zero attribution (they won't affect correlation)
+            if abs(attr_array[i]) < 1e-10:
+                continue
+
+            # Perturb single feature
+            perturbed = apply_feature_mask(instance, [i], baseline_values)
+
+            # Compute prediction change
+            change = compute_prediction_change(model, instance, perturbed, metric="absolute")
+
+            prediction_changes.append(change)
+            attribution_values.append(abs(attr_array[i]))
+
+        if len(prediction_changes) < 2:
+            return 0.0  # Not enough data points for correlation
+
+        # Compute Pearson correlation
+        corr, _ = stats.pearsonr(attribution_values, prediction_changes)
+
+        return float(corr) if not np.isnan(corr) else 0.0
+
+    else:
+        # Random subset perturbation
+        prediction_changes = []
+        attribution_sums = []
+
+        for _ in range(n_subsets):
+            # Sample random subset of features
+            subset_indices = np.random.choice(
+                n_features, size=min(subset_size, n_features), replace=False
+            )
+
+            # Perturb subset
+            perturbed = apply_feature_mask(instance, subset_indices.tolist(), baseline_values)
+
+            # Compute prediction change
+            change = compute_prediction_change(model, instance, perturbed, metric="absolute")
+
+            # Sum of attributions in subset
+            attr_sum = np.sum(np.abs(attr_array[subset_indices]))
+
+            prediction_changes.append(change)
+            attribution_sums.append(attr_sum)
+
+        if len(prediction_changes) < 2:
+            return 0.0
+
+        # Compute Pearson correlation
+        corr, _ = stats.pearsonr(attribution_sums, prediction_changes)
+
+        return float(corr) if not np.isnan(corr) else 0.0
+
+
+def compute_batch_faithfulness_estimate(
+    model,
+    X: np.ndarray,
+    explanations: List[Explanation],
+    baseline: Union[str, float, np.ndarray, Callable] = "mean",
+    max_samples: int = None,
+    seed: int = None,
+) -> Dict[str, float]:
+    """
+    Compute average Faithfulness Estimate over a batch of instances.
+
+    Args:
+        model: Model adapter
+        X: Input data (2D array)
+        explanations: List of Explanation objects (one per instance)
+        baseline: Baseline for feature replacement
+        max_samples: Maximum number of samples to evaluate
+        seed: Random seed
+
+    Returns:
+        Dictionary with mean, std, min, max, and count of valid scores
+    """
+    n_samples = len(explanations)
+    if max_samples:
+        n_samples = min(n_samples, max_samples)
+
+    scores = []
+
+    for i in range(n_samples):
+        try:
+            score = compute_faithfulness_estimate(
+                model, X[i], explanations[i],
+                baseline=baseline, background_data=X,
+                seed=seed
+            )
+            if not np.isnan(score):
+                scores.append(score)
+        except Exception:
+            continue
+
+    if not scores:
+        return {"mean": 0.0, "std": 0.0, "min": 0.0, "max": 0.0, "n_samples": 0}
+
+    return {
+        "mean": float(np.mean(scores)),
+        "std": float(np.std(scores)),
+        "min": float(np.min(scores)),
+        "max": float(np.max(scores)),
+        "n_samples": len(scores),
+    }
+
+
+# =============================================================================
+# Metric 3: Monotonicity-Nguyen (Nguyen et al., 2020)
+# =============================================================================
+
+def compute_monotonicity_nguyen(
+    model,
+    instance: np.ndarray,
+    explanation: Explanation,
+    baseline: Union[str, float, np.ndarray, Callable] = "mean",
+    background_data: np.ndarray = None,
+    target_class: int = None,
+    use_absolute: bool = True,
+) -> float:
+    """
+    Compute Monotonicity Correlation (Nguyen et al., 2020).
+
+    Measures the Spearman rank correlation between attribution magnitudes
+    and the prediction changes when each feature is individually removed
+    (replaced with baseline). A faithful explanation should show that
+    features with higher attributions cause larger prediction changes
+    when removed.
+
+    Unlike Arya's Monotonicity (sequential feature addition), this metric
+    evaluates each feature independently and uses rank correlation to
+    measure agreement between attributed importance and actual impact.
+
+    Args:
+        model: Model adapter with predict/predict_proba method
+        instance: Input instance (1D array)
+        explanation: Explanation object with feature_attributions
+        baseline: Baseline for feature removal ("mean", "median", scalar, array, callable)
+        background_data: Reference data for computing baseline (required for "mean"/"median")
+        target_class: Target class index for probability (default: predicted class)
+        use_absolute: If True, use absolute attribution values (default: True)
+
+    Returns:
+        Monotonicity correlation score (Spearman rho, -1 to 1, higher is better)
+
+    References:
+        Nguyen, A. P., & Martinez, M. R. (2020). Quantitative Evaluation of
+        Machine Learning Explanations: A Human-Grounded Benchmark.
+        arXiv:2010.07455.
+    """
+    instance = np.asarray(instance).flatten()
+    n_features = len(instance)
+
+    # Get baseline values
+    baseline_values = compute_baseline_values(
+        baseline, background_data, n_features
+    )
+
+    # Extract attributions as array
+    attr_array = _extract_attribution_array(explanation, n_features)
+
+    # Determine target class
+    if target_class is None:
+        pred = get_prediction_value(model, instance.reshape(1, -1))
+        if isinstance(pred, np.ndarray) and pred.ndim > 0:
+            target_class = int(np.argmax(pred))
+        else:
+            target_class = 0
+
+    # Get original prediction for the target class
+    original_pred = get_prediction_value(model, instance.reshape(1, -1))
+    if isinstance(original_pred, np.ndarray) and original_pred.ndim > 0 and len(original_pred) > target_class:
+        original_value = original_pred[target_class]
+    else:
+        original_value = float(original_pred)
+
+    # Compute prediction change for each feature when removed
+    prediction_changes = []
+    attribution_values = []
+
+    for i in range(n_features):
+        # Create perturbed instance with feature i replaced by baseline
+        perturbed = instance.copy()
+        perturbed[i] = baseline_values[i]
+
+        # Get prediction for perturbed instance
+        perturbed_pred = get_prediction_value(model, perturbed.reshape(1, -1))
+        if isinstance(perturbed_pred, np.ndarray) and perturbed_pred.ndim > 0 and len(perturbed_pred) > target_class:
+            perturbed_value = perturbed_pred[target_class]
+        else:
+            perturbed_value = float(perturbed_pred)
+
+        # Prediction change (drop in confidence when feature is removed)
+        # Positive change means removing the feature decreased prediction
+        change = original_value - perturbed_value
+        prediction_changes.append(abs(change))
+
+        # Attribution value
+        if use_absolute:
+            attribution_values.append(abs(attr_array[i]))
+        else:
+            attribution_values.append(attr_array[i])
+
+    prediction_changes = np.array(prediction_changes)
+    attribution_values = np.array(attribution_values)
+
+    # Handle edge cases
+    if len(prediction_changes) < 2:
+        return 0.0
+
+    # Check for constant arrays (would cause division by zero in correlation)
+    if np.std(prediction_changes) < 1e-10 or np.std(attribution_values) < 1e-10:
+        # If both are constant, consider it perfect correlation
+        if np.std(prediction_changes) < 1e-10 and np.std(attribution_values) < 1e-10:
+            return 1.0
+        # If only one is constant, correlation is undefined
+        return 0.0
+
+    # Compute Spearman rank correlation
+    corr, _ = stats.spearmanr(attribution_values, prediction_changes)
+
+    return float(corr) if not np.isnan(corr) else 0.0
+
+
+def compute_batch_monotonicity_nguyen(
+    model,
+    X: np.ndarray,
+    explanations: List[Explanation],
+    baseline: Union[str, float, np.ndarray, Callable] = "mean",
+    max_samples: int = None,
+    use_absolute: bool = True,
+) -> Dict[str, float]:
+    """
+    Compute average Monotonicity-Nguyen over a batch of instances.
+
+    Args:
+        model: Model adapter
+        X: Input data (2D array)
+        explanations: List of Explanation objects (one per instance)
+        baseline: Baseline for feature removal
+        max_samples: Maximum number of samples to evaluate
+        use_absolute: If True, use absolute attribution values
+
+    Returns:
+        Dictionary with mean, std, min, max, and count of valid scores
+    """
+    n_samples = len(explanations)
+    if max_samples:
+        n_samples = min(n_samples, max_samples)
+
+    scores = []
+
+    for i in range(n_samples):
+        try:
+            score = compute_monotonicity_nguyen(
+                model, X[i], explanations[i],
+                baseline=baseline, background_data=X,
+                use_absolute=use_absolute
+            )
+            if not np.isnan(score):
+                scores.append(score)
+        except Exception:
+            continue
+
+    if not scores:
+        return {"mean": 0.0, "std": 0.0, "min": 0.0, "max": 0.0, "n_samples": 0}
+
+    return {
+        "mean": float(np.mean(scores)),
+        "std": float(np.std(scores)),
+        "min": float(np.min(scores)),
+        "max": float(np.max(scores)),
+        "n_samples": len(scores),
+    }
+
+
+# =============================================================================
+# Metric 2: Monotonicity (Arya et al., 2019)
+# =============================================================================
+
+def compute_monotonicity(
+    model,
+    instance: np.ndarray,
+    explanation: Explanation,
+    baseline: Union[str, float, np.ndarray, Callable] = "mean",
+    background_data: np.ndarray = None,
+    target_class: int = None,
+    use_absolute: bool = True,
+    tolerance: float = 1e-6,
+) -> float:
+    """
+    Compute Monotonicity (Arya et al., 2019).
+
+    Measures whether sequentially adding features in order of their attributed
+    importance monotonically increases the model's prediction confidence.
+    Starting from a baseline (all features masked), features are revealed
+    one-by-one in descending order of attribution. A faithful explanation
+    should show monotonically increasing predictions.
+
+    Args:
+        model: Model adapter with predict/predict_proba method
+        instance: Input instance (1D array)
+        explanation: Explanation object with feature_attributions
+        baseline: Baseline for masked features ("mean", "median", scalar, array, callable)
+        background_data: Reference data for computing baseline (required for "mean"/"median")
+        target_class: Target class index for probability (default: predicted class)
+        use_absolute: If True, sort features by absolute attribution value
+        tolerance: Small value for numerical stability in monotonicity check
+
+    Returns:
+        Monotonicity score (0 to 1, higher is better)
+        1.0 means perfectly monotonic increase
+
+    References:
+        Arya, V., et al. (2019). One Explanation Does Not Fit All: A Toolkit and
+        Taxonomy of AI Explainability Techniques. arXiv:1909.03012.
+    """
+    instance = np.asarray(instance).flatten()
+    n_features = len(instance)
+
+    # Get baseline values
+    baseline_values = compute_baseline_values(
+        baseline, background_data, n_features
+    )
+
+    # Extract attributions as array
+    attr_array = _extract_attribution_array(explanation, n_features)
+
+    # Sort features by attribution (descending - most important first)
+    if use_absolute:
+        sorted_indices = np.argsort(-np.abs(attr_array))
+    else:
+        sorted_indices = np.argsort(-attr_array)
+
+    # Determine target class
+    if target_class is None:
+        # Use predicted class
+        pred = get_prediction_value(model, instance.reshape(1, -1))
+        if isinstance(pred, np.ndarray) and pred.ndim > 0:
+            target_class = int(np.argmax(pred))
+        else:
+            target_class = 0
+
+    # Start from baseline (all features masked)
+    current = baseline_values.copy()
+
+    # Track predictions as features are revealed
+    predictions = []
+
+    # Get initial prediction (baseline state)
+    pred = get_prediction_value(model, current.reshape(1, -1))
+    if isinstance(pred, np.ndarray) and pred.ndim > 0 and len(pred) > target_class:
+        predictions.append(pred[target_class])
+    else:
+        predictions.append(float(pred))
+
+    # Add features one by one
+    revealed_features = []
+    for idx in sorted_indices:
+        # Reveal this feature (set to original value)
+        revealed_features.append(idx)
+        current[idx] = instance[idx]
+
+        # Get prediction
+        pred = get_prediction_value(model, current.reshape(1, -1))
+        if isinstance(pred, np.ndarray) and pred.ndim > 0 and len(pred) > target_class:
+            predictions.append(pred[target_class])
+        else:
+            predictions.append(float(pred))
+
+    # Count monotonic increases
+    # A step is monotonic if: pred[i+1] >= pred[i] - tolerance
+    n_steps = len(predictions) - 1
+    if n_steps == 0:
+        return 1.0
+
+    monotonic_steps = 0
+    for i in range(n_steps):
+        if predictions[i + 1] >= predictions[i] - tolerance:
+            monotonic_steps += 1
+
+    return float(monotonic_steps) / float(n_steps)
+
+
+def compute_batch_monotonicity(
+    model,
+    X: np.ndarray,
+    explanations: List[Explanation],
+    baseline: Union[str, float, np.ndarray, Callable] = "mean",
+    max_samples: int = None,
+    use_absolute: bool = True,
+) -> Dict[str, float]:
+    """
+    Compute average Monotonicity over a batch of instances.
+
+    Args:
+        model: Model adapter
+        X: Input data (2D array)
+        explanations: List of Explanation objects (one per instance)
+        baseline: Baseline for masked features
+        max_samples: Maximum number of samples to evaluate
+        use_absolute: If True, sort features by absolute attribution value
+
+    Returns:
+        Dictionary with mean, std, min, max, and count of valid scores
+    """
+    n_samples = len(explanations)
+    if max_samples:
+        n_samples = min(n_samples, max_samples)
+
+    scores = []
+
+    for i in range(n_samples):
+        try:
+            score = compute_monotonicity(
+                model, X[i], explanations[i],
+                baseline=baseline, background_data=X,
+                use_absolute=use_absolute
+            )
+            if not np.isnan(score):
+                scores.append(score)
+        except Exception:
+            continue
+
+    if not scores:
+        return {"mean": 0.0, "std": 0.0, "min": 0.0, "max": 0.0, "n_samples": 0}
+
+    return {
+        "mean": float(np.mean(scores)),
+        "std": float(np.std(scores)),
+        "min": float(np.min(scores)),
+        "max": float(np.max(scores)),
+        "n_samples": len(scores),
+    }
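
The three metrics added in faithfulness_extended.py share one calling convention: a model adapter, a 1D instance, an Explanation carrying feature_attributions, a baseline strategy, and background data. A minimal usage sketch under stated assumptions — the SklearnAdapter constructor is not shown in this diff and is assumed to wrap a fitted estimator, and the stand-in explanation object below is hypothetical, mirroring only the two attributes the new metrics actually read (explanation_data["feature_attributions"] and feature_names):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from explainiverse.adapters.sklearn_adapter import SklearnAdapter
from explainiverse.evaluation import (
    compute_faithfulness_estimate,
    compute_monotonicity,
    compute_monotonicity_nguyen,
)

# Toy data and model
X, y = make_classification(n_samples=200, n_features=5, random_state=0)
clf = RandomForestClassifier(random_state=0).fit(X, y)
model = SklearnAdapter(clf)  # constructor signature assumed, not shown in this diff

# Hypothetical stand-in for an Explanation produced by one of the package's explainers;
# only the fields read by _extract_attribution_array are populated, with made-up values.
class FakeExplanation:
    feature_names = ["feature_0", "feature_1", "feature_2", "feature_3", "feature_4"]
    explanation_data = {
        "feature_attributions": {
            "feature_0": 0.40, "feature_1": 0.10, "feature_2": -0.30,
            "feature_3": 0.05, "feature_4": 0.02,
        }
    }

expl = FakeExplanation()
x0 = X[0]

# Pearson correlation between attribution magnitude and single-feature perturbation impact
fe = compute_faithfulness_estimate(model, x0, expl, baseline="mean", background_data=X, seed=0)

# Fraction of monotonic steps as features are revealed in attribution order (Arya et al.)
mono = compute_monotonicity(model, x0, expl, baseline="mean", background_data=X)

# Spearman correlation between attributions and single-feature removal impact (Nguyen et al.)
mono_n = compute_monotonicity_nguyen(model, x0, expl, baseline="mean", background_data=X)

print(f"faithfulness_estimate={fe:.3f}  monotonicity={mono:.3f}  monotonicity_nguyen={mono_n:.3f}")

Each per-instance function returns a scalar float; the compute_batch_* variants shown in the diff aggregate over a list of explanations and return a summary dictionary with mean, std, min, max, and n_samples.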