explainiverse 0.8.3.tar.gz → 0.8.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. {explainiverse-0.8.3 → explainiverse-0.8.5}/PKG-INFO +18 -3
  2. {explainiverse-0.8.3 → explainiverse-0.8.5}/README.md +17 -2
  3. {explainiverse-0.8.3 → explainiverse-0.8.5}/pyproject.toml +1 -1
  4. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/__init__.py +1 -1
  5. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/evaluation/__init__.py +8 -0
  6. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/evaluation/faithfulness_extended.py +390 -0
  7. {explainiverse-0.8.3 → explainiverse-0.8.5}/LICENSE +0 -0
  8. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/adapters/__init__.py +0 -0
  9. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/adapters/base_adapter.py +0 -0
  10. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/adapters/pytorch_adapter.py +0 -0
  11. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/adapters/sklearn_adapter.py +0 -0
  12. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/core/__init__.py +0 -0
  13. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/core/explainer.py +0 -0
  14. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/core/explanation.py +0 -0
  15. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/core/registry.py +0 -0
  16. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/engine/__init__.py +0 -0
  17. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/engine/suite.py +0 -0
  18. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/evaluation/_utils.py +0 -0
  19. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/evaluation/faithfulness.py +0 -0
  20. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/evaluation/metrics.py +0 -0
  21. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/evaluation/stability.py +0 -0
  22. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/explainers/__init__.py +0 -0
  23. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/explainers/attribution/__init__.py +0 -0
  24. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/explainers/attribution/lime_wrapper.py +0 -0
  25. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/explainers/attribution/shap_wrapper.py +0 -0
  26. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/explainers/attribution/treeshap_wrapper.py +0 -0
  27. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/explainers/counterfactual/__init__.py +0 -0
  28. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/explainers/counterfactual/dice_wrapper.py +0 -0
  29. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/explainers/example_based/__init__.py +0 -0
  30. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/explainers/example_based/protodash.py +0 -0
  31. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/explainers/global_explainers/__init__.py +0 -0
  32. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/explainers/global_explainers/ale.py +0 -0
  33. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/explainers/global_explainers/partial_dependence.py +0 -0
  34. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/explainers/global_explainers/permutation_importance.py +0 -0
  35. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/explainers/global_explainers/sage.py +0 -0
  36. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/explainers/gradient/__init__.py +0 -0
  37. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/explainers/gradient/deeplift.py +0 -0
  38. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/explainers/gradient/gradcam.py +0 -0
  39. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/explainers/gradient/integrated_gradients.py +0 -0
  40. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/explainers/gradient/lrp.py +0 -0
  41. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/explainers/gradient/saliency.py +0 -0
  42. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/explainers/gradient/smoothgrad.py +0 -0
  43. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/explainers/gradient/tcav.py +0 -0
  44. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/explainers/rule_based/__init__.py +0 -0
  45. {explainiverse-0.8.3 → explainiverse-0.8.5}/src/explainiverse/explainers/rule_based/anchors_wrapper.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: explainiverse
- Version: 0.8.3
+ Version: 0.8.5
  Summary: Unified, extensible explainability framework supporting 18 XAI methods including LIME, SHAP, LRP, TCAV, GradCAM, and more
  Home-page: https://github.com/jemsbhai/explainiverse
  License: MIT
@@ -44,7 +44,7 @@ Description-Content-Type: text/markdown
  | Feature | Description |
  |---------|-------------|
  | **18 Explainers** | LIME, KernelSHAP, TreeSHAP, Integrated Gradients, DeepLIFT, DeepSHAP, SmoothGrad, Saliency Maps, GradCAM/GradCAM++, LRP, TCAV, Anchors, Counterfactual, Permutation Importance, PDP, ALE, SAGE, ProtoDash |
- | **8 Evaluation Metrics** | Faithfulness (PGI, PGU, Comprehensiveness, Sufficiency, Correlation) and Stability (RIS, ROS, Lipschitz) |
+ | **14 Evaluation Metrics** | Faithfulness (PGI, PGU, Comprehensiveness, Sufficiency, Correlation, Faithfulness Estimate, Monotonicity, Monotonicity-Nguyen, Pixel Flipping, Region Perturbation) and Stability (RIS, ROS, Lipschitz) |
  | **Unified API** | Consistent `BaseExplainer` interface with standardized `Explanation` output |
  | **Plugin Registry** | Filter explainers by scope, model type, data type; automatic recommendations |
  | **Framework Support** | Adapters for scikit-learn and PyTorch (with gradient computation) |
@@ -96,6 +96,11 @@ Explainiverse includes a comprehensive suite of evaluation metrics based on the
  | **Comprehensiveness** | Drop when removing top-k features | [DeYoung et al., 2020](https://arxiv.org/abs/1911.03429) |
  | **Sufficiency** | Prediction using only top-k features | [DeYoung et al., 2020](https://arxiv.org/abs/1911.03429) |
  | **Faithfulness Correlation** | Correlation between attribution and impact | [Bhatt et al., 2020](https://arxiv.org/abs/2005.00631) |
+ | **Faithfulness Estimate** | Correlation of attributions with single-feature perturbation impact | [Alvarez-Melis & Jaakkola, 2018](https://arxiv.org/abs/1806.08049) |
+ | **Monotonicity** | Sequential feature addition shows monotonic prediction increase | [Arya et al., 2019](https://arxiv.org/abs/1909.03012) |
+ | **Monotonicity-Nguyen** | Spearman correlation between attributions and feature removal impact | [Nguyen & Martinez, 2020](https://arxiv.org/abs/2010.07455) |
+ | **Pixel Flipping** | AUC of prediction degradation when removing features by importance | [Bach et al., 2015](https://doi.org/10.1371/journal.pone.0130140) |
+ | **Region Perturbation** | AUC of prediction degradation when perturbing feature regions by importance | [Samek et al., 2015](https://arxiv.org/abs/1509.06321) |

  ### Stability Metrics

@@ -715,6 +720,16 @@ poetry run pytest tests/test_lrp.py::TestLRPConv2d -v
  - [x] Evaluation: Stability metrics (RIS, ROS, Lipschitz)
  - [x] PyTorch adapter with gradient support

+ ### In Progress 🔄
+ - [ ] **Evaluation metrics expansion** - Adding 42 more metrics across 7 categories to exceed Quantus (37 metrics)
+   - Phase 1: Faithfulness (+9 metrics) - 4/12 complete
+   - Phase 2: Robustness (+7 metrics)
+   - Phase 3: Localisation (+8 metrics)
+   - Phase 4: Complexity (+4 metrics)
+   - Phase 5: Randomisation (+5 metrics)
+   - Phase 6: Axiomatic (+4 metrics)
+   - Phase 7: Fairness (+4 metrics)
+
  ### Planned 📋
  - [ ] Attention-based explanations (for Transformers)
  - [ ] TensorFlow/Keras adapter
@@ -734,7 +749,7 @@ If you use Explainiverse in your research, please cite:
  author = {Syed, Muntaser},
  year = {2025},
  url = {https://github.com/jemsbhai/explainiverse},
- version = {0.8.0}
+ version = {0.8.4}
  }
  ```

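The five new rows in the metrics table are all perturbation-based scores. Below is a minimal, package-independent sketch of the Pixel Flipping procedure they describe; the toy model and the |weight × value| attribution are illustrative assumptions, not explainiverse API (the package's own `compute_pixel_flipping`, added later in this diff, implements the same loop):

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 6))
y = (X[:, 0] + 2 * X[:, 1] > 0).astype(int)
clf = LogisticRegression().fit(X, y)

x0 = X[0].copy()
attr = np.abs(clf.coef_[0] * x0)          # stand-in attribution: |weight * value|
order = np.argsort(-attr)                 # most important feature first
baseline = X.mean(axis=0)                 # "mean" baseline, as in the table
cls = int(clf.predict(x0.reshape(1, -1))[0])

cur = x0.copy()
preds = [clf.predict_proba(cur.reshape(1, -1))[0, cls]]
for i in order:                           # flip one feature at a time
    cur[i] = baseline[i]
    preds.append(clf.predict_proba(cur.reshape(1, -1))[0, cls])

curve = np.array(preds) / preds[0]        # degradation relative to original
auc = np.trapz(curve, np.linspace(0, 1, len(preds)))
print(f"pixel-flipping AUC: {auc:.3f}")   # lower = more faithful attribution
```

An explanation that ranks truly influential features first drives the target-class probability down quickly, so its curve (and hence its AUC) is low; a random ranking stays near 1.0 for longer.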
@@ -13,7 +13,7 @@
  | Feature | Description |
  |---------|-------------|
  | **18 Explainers** | LIME, KernelSHAP, TreeSHAP, Integrated Gradients, DeepLIFT, DeepSHAP, SmoothGrad, Saliency Maps, GradCAM/GradCAM++, LRP, TCAV, Anchors, Counterfactual, Permutation Importance, PDP, ALE, SAGE, ProtoDash |
- | **8 Evaluation Metrics** | Faithfulness (PGI, PGU, Comprehensiveness, Sufficiency, Correlation) and Stability (RIS, ROS, Lipschitz) |
+ | **14 Evaluation Metrics** | Faithfulness (PGI, PGU, Comprehensiveness, Sufficiency, Correlation, Faithfulness Estimate, Monotonicity, Monotonicity-Nguyen, Pixel Flipping, Region Perturbation) and Stability (RIS, ROS, Lipschitz) |
  | **Unified API** | Consistent `BaseExplainer` interface with standardized `Explanation` output |
  | **Plugin Registry** | Filter explainers by scope, model type, data type; automatic recommendations |
  | **Framework Support** | Adapters for scikit-learn and PyTorch (with gradient computation) |
@@ -65,6 +65,11 @@ Explainiverse includes a comprehensive suite of evaluation metrics based on the
  | **Comprehensiveness** | Drop when removing top-k features | [DeYoung et al., 2020](https://arxiv.org/abs/1911.03429) |
  | **Sufficiency** | Prediction using only top-k features | [DeYoung et al., 2020](https://arxiv.org/abs/1911.03429) |
  | **Faithfulness Correlation** | Correlation between attribution and impact | [Bhatt et al., 2020](https://arxiv.org/abs/2005.00631) |
+ | **Faithfulness Estimate** | Correlation of attributions with single-feature perturbation impact | [Alvarez-Melis & Jaakkola, 2018](https://arxiv.org/abs/1806.08049) |
+ | **Monotonicity** | Sequential feature addition shows monotonic prediction increase | [Arya et al., 2019](https://arxiv.org/abs/1909.03012) |
+ | **Monotonicity-Nguyen** | Spearman correlation between attributions and feature removal impact | [Nguyen & Martinez, 2020](https://arxiv.org/abs/2010.07455) |
+ | **Pixel Flipping** | AUC of prediction degradation when removing features by importance | [Bach et al., 2015](https://doi.org/10.1371/journal.pone.0130140) |
+ | **Region Perturbation** | AUC of prediction degradation when perturbing feature regions by importance | [Samek et al., 2015](https://arxiv.org/abs/1509.06321) |

  ### Stability Metrics

@@ -684,6 +689,16 @@ poetry run pytest tests/test_lrp.py::TestLRPConv2d -v
  - [x] Evaluation: Stability metrics (RIS, ROS, Lipschitz)
  - [x] PyTorch adapter with gradient support

+ ### In Progress 🔄
+ - [ ] **Evaluation metrics expansion** - Adding 42 more metrics across 7 categories to exceed Quantus (37 metrics)
+   - Phase 1: Faithfulness (+9 metrics) - 4/12 complete
+   - Phase 2: Robustness (+7 metrics)
+   - Phase 3: Localisation (+8 metrics)
+   - Phase 4: Complexity (+4 metrics)
+   - Phase 5: Randomisation (+5 metrics)
+   - Phase 6: Axiomatic (+4 metrics)
+   - Phase 7: Fairness (+4 metrics)
+
  ### Planned 📋
  - [ ] Attention-based explanations (for Transformers)
  - [ ] TensorFlow/Keras adapter
@@ -703,7 +718,7 @@ If you use Explainiverse in your research, please cite:
  author = {Syed, Muntaser},
  year = {2025},
  url = {https://github.com/jemsbhai/explainiverse},
- version = {0.8.0}
+ version = {0.8.4}
  }
  ```

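Region Perturbation, per the same table, generalizes that loop from single features to non-overlapping groups. A toy sketch of its grouping and ordering step, mirroring the defaults added in `faithfulness_extended.py` below (the attribution values are invented):

```python
import numpy as np

# Illustrative per-feature attributions for 10 features (made-up values)
attr = np.array([0.9, 0.1, 0.0, 0.8, 0.2, 0.05, 0.3, 0.4, 0.0, 0.1])
n_features = len(attr)

region_size = max(1, n_features // 4)      # default from the new code: -> 2
regions = [list(range(s, min(s + region_size, n_features)))
           for s in range(0, n_features, region_size)]
importance = [float(np.abs(attr[r]).sum()) for r in regions]
order = np.argsort(-np.array(importance))  # perturb most important regions first

print(regions)   # [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]
print(order)     # [0 1 3 2 4]: region [0, 1] (sum 1.0) before [2, 3] (0.8), ...
```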
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "explainiverse"
- version = "0.8.3"
+ version = "0.8.5"
  description = "Unified, extensible explainability framework supporting 18 XAI methods including LIME, SHAP, LRP, TCAV, GradCAM, and more"
  authors = ["Muntaser Syed <jemsbhai@gmail.com>"]
  license = "MIT"
@@ -34,7 +34,7 @@ from explainiverse.adapters.sklearn_adapter import SklearnAdapter
  from explainiverse.adapters import TORCH_AVAILABLE
  from explainiverse.engine.suite import ExplanationSuite

- __version__ = "0.8.3"
+ __version__ = "0.8.5"

  __all__ = [
      # Core
@@ -43,6 +43,10 @@ from explainiverse.evaluation.faithfulness_extended import (
      compute_batch_monotonicity,
      compute_monotonicity_nguyen,
      compute_batch_monotonicity_nguyen,
+     compute_pixel_flipping,
+     compute_batch_pixel_flipping,
+     compute_region_perturbation,
+     compute_batch_region_perturbation,
  )

  __all__ = [
@@ -74,4 +78,8 @@ __all__ = [
      "compute_batch_monotonicity",
      "compute_monotonicity_nguyen",
      "compute_batch_monotonicity_nguyen",
+     "compute_pixel_flipping",
+     "compute_batch_pixel_flipping",
+     "compute_region_perturbation",
+     "compute_batch_region_perturbation",
  ]
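These re-exports make the new metrics importable directly from `explainiverse.evaluation`. A usage fragment under that assumption; `model`, `X`, and `explanations` are placeholders for an adapter-wrapped model, a 2D input array, and per-instance explanations, which this diff does not construct, and the returned keys follow the batch docstrings below:

```python
from explainiverse.evaluation import (
    compute_batch_pixel_flipping,
    compute_batch_region_perturbation,
)

# Placeholders: an adapter-wrapped model, a 2D array X, and one
# Explanation per row of X, produced by any registered explainer.
pf = compute_batch_pixel_flipping(model, X, explanations, baseline="mean", max_samples=50)
rp = compute_batch_region_perturbation(model, X, explanations, region_size=4)
print(pf["mean"], pf["std"], pf["n_samples"])  # aggregate AUCs, lower is better
print(rp["mean"])
```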
@@ -256,6 +256,396 @@ def compute_batch_faithfulness_estimate(
      }


+ # =============================================================================
+ # Metric 5: Region Perturbation (Samek et al., 2015)
+ # =============================================================================
+
+ def compute_region_perturbation(
+     model,
+     instance: np.ndarray,
+     explanation: Explanation,
+     baseline: Union[str, float, np.ndarray, Callable] = "mean",
+     background_data: np.ndarray = None,
+     target_class: int = None,
+     region_size: int = None,
+     use_absolute: bool = True,
+     return_curve: bool = False,
+ ) -> Union[float, Dict[str, Union[float, np.ndarray]]]:
+     """
+     Compute Region Perturbation score (Samek et al., 2015).
+
+     Similar to Pixel Flipping, but operates on regions (groups) of features
+     rather than individual features. Features are divided into non-overlapping
+     regions, and regions are perturbed in order of their cumulative importance
+     (sum of attributions within the region).
+
+     This metric is particularly relevant for image data where local spatial
+     correlations exist, but is also applicable to tabular data with groups
+     of related features.
+
+     The score is the Area Under the perturbation Curve (AUC), normalized
+     to [0, 1]. Lower AUC indicates better faithfulness (faster degradation
+     when important regions are removed first).
+
+     Args:
+         model: Model adapter with predict/predict_proba method
+         instance: Input instance (1D array)
+         explanation: Explanation object with feature_attributions
+         baseline: Baseline for feature removal ("mean", "median", scalar, array, callable)
+         background_data: Reference data for computing baseline (required for "mean"/"median")
+         target_class: Target class index for probability (default: predicted class)
+         region_size: Number of features per region. If None, defaults to max(1, n_features // 4).
+             For image-like data, this would correspond to patch size.
+         use_absolute: If True, sort regions by absolute attribution sum (default: True)
+         return_curve: If True, return full degradation curve and details
+
+     Returns:
+         If return_curve=False: AUC score (float, 0 to 1, lower is better)
+         If return_curve=True: Dictionary with:
+             - 'auc': float - Area under the perturbation curve
+             - 'curve': np.ndarray - Normalized prediction values at each step
+             - 'predictions': np.ndarray - Raw prediction values
+             - 'region_order': list - Order in which regions were perturbed
+             - 'regions': list - List of feature indices in each region
+             - 'n_regions': int - Number of regions
+             - 'region_size': int - Size of each region
+
+     References:
+         Samek, W., Binder, A., Montavon, G., Lapuschkin, S., & Müller, K. R. (2015).
+         Evaluating the Visualization of What a Deep Neural Network has Learned.
+         arXiv preprint arXiv:1509.06321.
+     """
+     instance = np.asarray(instance).flatten()
+     n_features = len(instance)
+
+     # Get baseline values
+     baseline_values = compute_baseline_values(
+         baseline, background_data, n_features
+     )
+
+     # Extract attributions as array
+     attr_array = _extract_attribution_array(explanation, n_features)
+
+     # Determine region size
+     if region_size is None:
+         # Default: divide features into ~4 regions
+         region_size = max(1, n_features // 4)
+     region_size = max(1, min(region_size, n_features))  # Clamp to valid range
+
+     # Create non-overlapping regions
+     regions = []
+     for start_idx in range(0, n_features, region_size):
+         end_idx = min(start_idx + region_size, n_features)
+         regions.append(list(range(start_idx, end_idx)))
+
+     n_regions = len(regions)
+
+     # Compute region importance (sum of attributions in each region)
+     region_importance = []
+     for region in regions:
+         if use_absolute:
+             importance = np.sum(np.abs(attr_array[region]))
+         else:
+             importance = np.sum(attr_array[region])
+         region_importance.append(importance)
+
+     # Sort regions by importance (descending - most important first)
+     sorted_region_indices = np.argsort(-np.array(region_importance))
+
+     # Determine target class
+     if target_class is None:
+         pred = get_prediction_value(model, instance.reshape(1, -1))
+         if isinstance(pred, np.ndarray) and pred.ndim > 0:
+             target_class = int(np.argmax(pred))
+         else:
+             target_class = 0
+
+     # Get original prediction for the target class
+     original_pred = get_prediction_value(model, instance.reshape(1, -1))
+     if isinstance(original_pred, np.ndarray) and original_pred.ndim > 0 and len(original_pred) > target_class:
+         original_value = original_pred[target_class]
+     else:
+         original_value = float(original_pred)
+
+     # Start with original instance
+     current = instance.copy()
+
+     # Track predictions as regions are perturbed
+     predictions = [original_value]
+
+     # Perturb regions one by one (most important first)
+     for region_idx in sorted_region_indices:
+         region = regions[region_idx]
+
+         # Replace all features in this region with baseline
+         for feat_idx in region:
+             current[feat_idx] = baseline_values[feat_idx]
+
+         # Get prediction
+         pred = get_prediction_value(model, current.reshape(1, -1))
+         if isinstance(pred, np.ndarray) and pred.ndim > 0 and len(pred) > target_class:
+             predictions.append(pred[target_class])
+         else:
+             predictions.append(float(pred))
+
+     predictions = np.array(predictions)
+
+     # Normalize predictions to [0, 1] relative to original
+     # curve[i] = prediction after perturbing i regions / original prediction
+     if abs(original_value) > 1e-10:
+         curve = predictions / original_value
+     else:
+         # Handle zero original prediction
+         curve = predictions
+
+     # Compute AUC using trapezoidal rule
+     # x-axis: fraction of regions perturbed (0 to 1)
+     # y-axis: relative prediction value
+     x = np.linspace(0, 1, len(predictions))
+     auc = np.trapz(curve, x)
+
+     if return_curve:
+         return {
+             "auc": float(auc),
+             "curve": curve,
+             "predictions": predictions,
+             "region_order": sorted_region_indices.tolist(),
+             "regions": regions,
+             "n_regions": n_regions,
+             "region_size": region_size,
+         }
+
+     return float(auc)
+
+
+ def compute_batch_region_perturbation(
+     model,
+     X: np.ndarray,
+     explanations: List[Explanation],
+     baseline: Union[str, float, np.ndarray, Callable] = "mean",
+     max_samples: int = None,
+     region_size: int = None,
+     use_absolute: bool = True,
+ ) -> Dict[str, float]:
+     """
+     Compute average Region Perturbation score over a batch of instances.
+
+     Args:
+         model: Model adapter
+         X: Input data (2D array)
+         explanations: List of Explanation objects (one per instance)
+         baseline: Baseline for feature removal
+         max_samples: Maximum number of samples to evaluate
+         region_size: Number of features per region (default: n_features // 4)
+         use_absolute: If True, sort regions by absolute attribution sum
+
+     Returns:
+         Dictionary with mean, std, min, max, and count of valid scores
+     """
+     n_samples = len(explanations)
+     if max_samples:
+         n_samples = min(n_samples, max_samples)
+
+     scores = []
+
+     for i in range(n_samples):
+         try:
+             score = compute_region_perturbation(
+                 model, X[i], explanations[i],
+                 baseline=baseline, background_data=X,
+                 region_size=region_size,
+                 use_absolute=use_absolute
+             )
+             if not np.isnan(score):
+                 scores.append(score)
+         except Exception:
+             continue
+
+     if not scores:
+         return {"mean": 0.0, "std": 0.0, "min": 0.0, "max": 0.0, "n_samples": 0}
+
+     return {
+         "mean": float(np.mean(scores)),
+         "std": float(np.std(scores)),
+         "min": float(np.min(scores)),
+         "max": float(np.max(scores)),
+         "n_samples": len(scores),
+     }
+
+
+ # =============================================================================
+ # Metric 4: Pixel Flipping (Bach et al., 2015)
+ # =============================================================================
+
+ def compute_pixel_flipping(
+     model,
+     instance: np.ndarray,
+     explanation: Explanation,
+     baseline: Union[str, float, np.ndarray, Callable] = "mean",
+     background_data: np.ndarray = None,
+     target_class: int = None,
+     use_absolute: bool = True,
+     return_curve: bool = False,
+ ) -> Union[float, Dict[str, Union[float, np.ndarray]]]:
+     """
+     Compute Pixel Flipping score (Bach et al., 2015).
+
+     Sequentially removes features in order of attributed importance (most
+     important first) and measures the cumulative prediction degradation.
+     A faithful explanation should cause a rapid prediction drop when the
+     most important features are removed first.
+
+     The score is the Area Under the perturbation Curve (AUC), normalized
+     to [0, 1]. Lower AUC indicates better faithfulness (faster degradation).
+
+     Args:
+         model: Model adapter with predict/predict_proba method
+         instance: Input instance (1D array)
+         explanation: Explanation object with feature_attributions
+         baseline: Baseline for feature removal ("mean", "median", scalar, array, callable)
+         background_data: Reference data for computing baseline (required for "mean"/"median")
+         target_class: Target class index for probability (default: predicted class)
+         use_absolute: If True, sort features by absolute attribution value
+         return_curve: If True, return full degradation curve and predictions
+
+     Returns:
+         If return_curve=False: AUC score (float, 0 to 1, lower is better)
+         If return_curve=True: Dictionary with 'auc', 'curve', 'predictions', 'feature_order'
+
+     References:
+         Bach, S., et al. (2015). On Pixel-Wise Explanations for Non-Linear
+         Classifier Decisions by Layer-Wise Relevance Propagation. PLOS ONE.
+     """
+     instance = np.asarray(instance).flatten()
+     n_features = len(instance)
+
+     # Get baseline values
+     baseline_values = compute_baseline_values(
+         baseline, background_data, n_features
+     )
+
+     # Extract attributions as array
+     attr_array = _extract_attribution_array(explanation, n_features)
+
+     # Sort features by attribution (descending - most important first)
+     if use_absolute:
+         sorted_indices = np.argsort(-np.abs(attr_array))
+     else:
+         sorted_indices = np.argsort(-attr_array)
+
+     # Determine target class
+     if target_class is None:
+         pred = get_prediction_value(model, instance.reshape(1, -1))
+         if isinstance(pred, np.ndarray) and pred.ndim > 0:
+             target_class = int(np.argmax(pred))
+         else:
+             target_class = 0
+
+     # Get original prediction for the target class
+     original_pred = get_prediction_value(model, instance.reshape(1, -1))
+     if isinstance(original_pred, np.ndarray) and original_pred.ndim > 0 and len(original_pred) > target_class:
+         original_value = original_pred[target_class]
+     else:
+         original_value = float(original_pred)
+
+     # Start with original instance
+     current = instance.copy()
+
+     # Track predictions as features are removed
+     predictions = [original_value]
+
+     # Remove features one by one (most important first)
+     for idx in sorted_indices:
+         # Remove this feature (replace with baseline)
+         current[idx] = baseline_values[idx]
+
+         # Get prediction
+         pred = get_prediction_value(model, current.reshape(1, -1))
+         if isinstance(pred, np.ndarray) and pred.ndim > 0 and len(pred) > target_class:
+             predictions.append(pred[target_class])
+         else:
+             predictions.append(float(pred))
+
+     predictions = np.array(predictions)
+
+     # Normalize predictions to [0, 1] relative to original
+     # curve[i] = prediction after removing i features / original prediction
+     if abs(original_value) > 1e-10:
+         curve = predictions / original_value
+     else:
+         # Handle zero original prediction
+         curve = predictions
+
+     # Compute AUC using trapezoidal rule
+     # x-axis: fraction of features removed (0 to 1)
+     # y-axis: relative prediction value
+     x = np.linspace(0, 1, len(predictions))
+     auc = np.trapz(curve, x)
+
+     if return_curve:
+         return {
+             "auc": float(auc),
+             "curve": curve,
+             "predictions": predictions,
+             "feature_order": sorted_indices,
+             "n_features": n_features,
+         }
+
+     return float(auc)
+
+
+ def compute_batch_pixel_flipping(
+     model,
+     X: np.ndarray,
+     explanations: List[Explanation],
+     baseline: Union[str, float, np.ndarray, Callable] = "mean",
+     max_samples: int = None,
+     use_absolute: bool = True,
+ ) -> Dict[str, float]:
+     """
+     Compute average Pixel Flipping score over a batch of instances.
+
+     Args:
+         model: Model adapter
+         X: Input data (2D array)
+         explanations: List of Explanation objects (one per instance)
+         baseline: Baseline for feature removal
+         max_samples: Maximum number of samples to evaluate
+         use_absolute: If True, sort features by absolute attribution value
+
+     Returns:
+         Dictionary with mean, std, min, max, and count of valid scores
+     """
+     n_samples = len(explanations)
+     if max_samples:
+         n_samples = min(n_samples, max_samples)
+
+     scores = []
+
+     for i in range(n_samples):
+         try:
+             score = compute_pixel_flipping(
+                 model, X[i], explanations[i],
+                 baseline=baseline, background_data=X,
+                 use_absolute=use_absolute
+             )
+             if not np.isnan(score):
+                 scores.append(score)
+         except Exception:
+             continue
+
+     if not scores:
+         return {"mean": 0.0, "std": 0.0, "min": 0.0, "max": 0.0, "n_samples": 0}
+
+     return {
+         "mean": float(np.mean(scores)),
+         "std": float(np.std(scores)),
+         "min": float(np.min(scores)),
+         "max": float(np.max(scores)),
+         "n_samples": len(scores),
+     }
+
+
  # =============================================================================
  # Metric 3: Monotonicity-Nguyen (Nguyen et al., 2020)
  # =============================================================================
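Both new functions share the same AUC convention: record the target-class prediction after each removal step, normalize by the original value, and integrate with the trapezoidal rule over the fraction removed. A self-contained numeric check of that final step (the prediction values are toy numbers):

```python
import numpy as np

# Original prediction followed by four flips (toy values)
predictions = np.array([0.9, 0.4, 0.2, 0.1, 0.1])
curve = predictions / predictions[0]      # normalize to the original value
x = np.linspace(0, 1, len(predictions))   # fraction of features removed
auc = np.trapz(curve, x)
print(round(float(auc), 3))               # 0.333; a flat curve would give 1.0
```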