PyPI - explainiverse - Versions diffs - 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl - Mend

explainiverse 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

explainiverse/__init__.py +1 -1
explainiverse/adapters/pytorch_adapter.py +88 -25
explainiverse/core/explanation.py +165 -10
explainiverse/core/registry.py +18 -0
explainiverse/engine/suite.py +187 -78
explainiverse/evaluation/metrics.py +189 -108
explainiverse/explainers/attribution/lime_wrapper.py +90 -7
explainiverse/explainers/attribution/shap_wrapper.py +104 -8
explainiverse/explainers/gradient/__init__.py +12 -0
explainiverse/explainers/gradient/integrated_gradients.py +189 -76
explainiverse/explainers/gradient/tcav.py +865 -0
{explainiverse-0.6.0.dist-info → explainiverse-0.7.1.dist-info}/METADATA +60 -9
{explainiverse-0.6.0.dist-info → explainiverse-0.7.1.dist-info}/RECORD +15 -14
{explainiverse-0.6.0.dist-info → explainiverse-0.7.1.dist-info}/LICENSE +0 -0
{explainiverse-0.6.0.dist-info → explainiverse-0.7.1.dist-info}/WHEEL +0 -0

explainiverse/evaluation/metrics.py CHANGED Viewed

@@ -1,9 +1,68 @@
+# src/explainiverse/evaluation/metrics.py
+"""
+Legacy evaluation metrics: AOPC and ROAR.
+For comprehensive evaluation, prefer the metrics in faithfulness.py
+and stability.py which have better edge case handling.
+"""
 import numpy as np
+import re
+from typing import List, Dict, Optional, Union, Callable
 from explainiverse.core.explanation import Explanation
 from sklearn.metrics import accuracy_score
 import copy
+def _extract_feature_index(
+    feature_name: str,
+    feature_names: Optional[List[str]] = None,
+    fallback_index: int = 0
+) -> int:
+    """
+    Extract feature index from a feature name string.
+    Handles various naming conventions including LIME-style conditions
+    like "feature_0 <= 5.0".
+    Args:
+        feature_name: Feature name (possibly with conditions)
+        feature_names: Optional list of canonical feature names
+        fallback_index: Index to return if extraction fails
+    Returns:
+        Feature index
+    """
+    # Try exact match first
+    if feature_names is not None:
+        if feature_name in feature_names:
+            return feature_names.index(feature_name)
+        # Extract base name (remove LIME-style conditions)
+        base_name = re.sub(r'\s*[<>=!]+\s*[\d.\-]+$', '', feature_name).strip()
+        if base_name in feature_names:
+            return feature_names.index(base_name)
+        # Try partial match (feature name contained in key)
+        for i, fname in enumerate(feature_names):
+            if fname in feature_name:
+                return i
+    # Try extracting index from patterns like "feature_2", "f2", "x2"
+    patterns = [
+        r'feature[_\s]*(\d+)',
+        r'feat[_\s]*(\d+)',
+        r'^f(\d+)$',
+        r'^x(\d+)$',
+    ]
+    for pattern in patterns:
+        match = re.search(pattern, feature_name, re.IGNORECASE)
+        if match:
+            return int(match.group(1))
+    return fallback_index
 def compute_aopc(
     model,
     instance: np.ndarray,
@@ -12,39 +71,50 @@ def compute_aopc(
     baseline_value: float = 0.0
 ) -> float:
     """
-    Computes Area Over the Perturbation Curve (AOPC) by iteratively removing top features.
+    Compute Area Over the Perturbation Curve (AOPC).
+    AOPC measures explanation faithfulness by iteratively removing
+    the most important features and measuring prediction change.
     Args:
-        model: wrapped model with .predict() method
-        instance: input sample (1D array)
-        explanation: Explanation object
-        num_steps: number of top features to remove
-        baseline_value: value to replace removed features with (e.g., 0, mean)
+        model: Model adapter with .predict() method
+        instance: Input sample (1D array)
+        explanation: Explanation object with feature_attributions
+        num_steps: Number of top features to remove
+        baseline_value: Value to replace removed features with
     Returns:
-        AOPC score (higher means explanation is more faithful)
+        AOPC score (higher = more faithful explanation)
     """
+    instance = np.asarray(instance).flatten()
+    n_features = len(instance)
     base_pred = model.predict(instance.reshape(1, -1))[0]
+    if hasattr(base_pred, '__len__') and len(base_pred) > 1:
+        base_pred = float(np.max(base_pred))
+    else:
+        base_pred = float(base_pred)
     attributions = explanation.explanation_data.get("feature_attributions", {})
     if not attributions:
         raise ValueError("No feature attributions found in explanation.")
-    # Sort features by abs importance
+    # Sort features by absolute importance (most important first)
     sorted_features = sorted(
         attributions.items(),
         key=lambda x: abs(x[1]),
         reverse=True
     )
-    # Try to map feature names to indices
+    # Get feature_names from explanation (may be None)
+    feature_names = getattr(explanation, 'feature_names', None)
+    # Map feature names to indices
     feature_indices = []
     for i, (fname, _) in enumerate(sorted_features):
-        try:
-            idx = explanation.feature_names.index(fname)
-        except Exception:
-            idx = i  # fallback: assume order
-        feature_indices.append(idx)
+        idx = _extract_feature_index(fname, feature_names, fallback_index=i)
+        if 0 <= idx < n_features:
+            feature_indices.append(idx)
     deltas = []
     modified = instance.copy()
@@ -52,42 +122,52 @@ def compute_aopc(
     for i in range(min(num_steps, len(feature_indices))):
         idx = feature_indices[i]
         modified[idx] = baseline_value
         new_pred = model.predict(modified.reshape(1, -1))[0]
+        if hasattr(new_pred, '__len__') and len(new_pred) > 1:
+            new_pred = float(np.max(new_pred))
+        else:
+            new_pred = float(new_pred)
         delta = abs(base_pred - new_pred)
         deltas.append(delta)
-    return np.mean(deltas)
+    return float(np.mean(deltas)) if deltas else 0.0
 def compute_batch_aopc(
     model,
     X: np.ndarray,
-    explanations: dict,
+    explanations: Dict[str, List[Explanation]],
     num_steps: int = 10,
     baseline_value: float = 0.0
-) -> dict:
+) -> Dict[str, float]:
     """
-    Compute average AOPC for multiple explainers over a batch of instances.
+    Compute average AOPC across multiple explainers and instances.
     Args:
-        model: wrapped model
-        X: 2D input array
-        explanations: dict of {explainer_name: list of Explanation objects}
-        num_steps: number of top features to remove
-        baseline_value: value to replace features with
+        model: Model adapter
+        X: 2D input array (n_samples, n_features)
+        explanations: Dict mapping explainer names to lists of Explanation objects
+        num_steps: Number of top features to remove
+        baseline_value: Value to replace features with
     Returns:
-        Dict of {explainer_name: mean AOPC score}
+        Dict mapping explainer names to mean AOPC scores
     """
     results = {}
     for explainer_name, expl_list in explanations.items():
         scores = []
         for i, exp in enumerate(expl_list):
-            instance = X[i]
-            score = compute_aopc(model, instance, exp, num_steps, baseline_value)
-            scores.append(score)
-        results[explainer_name] = np.mean(scores)
+            if i >= len(X):
+                break
+            try:
+                score = compute_aopc(model, X[i], exp, num_steps, baseline_value)
+                scores.append(score)
+            except Exception:
+                continue
+        results[explainer_name] = float(np.mean(scores)) if scores else 0.0
     return results
@@ -98,136 +178,137 @@ def compute_roar(
     y_train: np.ndarray,
     X_test: np.ndarray,
     y_test: np.ndarray,
-    explanations: list,
+    explanations: List[Explanation],
     top_k: int = 3,
-    baseline_value: float = 0.0,
-    model_kwargs: dict = None
+    baseline_value: Union[str, float, np.ndarray, Callable] = 0.0,
+    model_kwargs: Optional[Dict] = None
 ) -> float:
     """
-    Compute ROAR (Remove And Retrain) using top-k important features from explanations.
+    Compute ROAR (Remove And Retrain) score.
+    ROAR retrains the model after removing top-k important features
+    and measures the accuracy drop.
     Args:
-        model_class: uninstantiated model class (e.g. LogisticRegression)
-        X_train: full training data
-        y_train: training labels
-        X_test: test features
-        y_test: test labels
-        explanations: list of Explanation objects (one per train instance)
-        top_k: number of top features to remove
-        baseline_value: what to set removed features to
-        model_kwargs: optional kwargs to pass to model_class
+        model_class: Uninstantiated model class (e.g., LogisticRegression)
+        X_train: Training features
+        y_train: Training labels
+        X_test: Test features
+        y_test: Test labels
+        explanations: List of Explanation objects (one per training instance)
+        top_k: Number of top features to remove
+        baseline_value: Replacement value for removed features:
+            - float/int: constant value
+            - "mean": per-feature mean from X_train
+            - "median": per-feature median from X_train
+            - np.ndarray: per-feature values
+            - callable: function(X_train) -> per-feature values
+        model_kwargs: Optional kwargs for model_class
     Returns:
         Accuracy drop (baseline_acc - retrained_acc)
     """
     model_kwargs = model_kwargs or {}
+    n_features = X_train.shape[1]
-    # Baseline model
+    # Train baseline model
     baseline_model = model_class(**model_kwargs)
     baseline_model.fit(X_train, y_train)
-    baseline_preds = baseline_model.predict(X_test)
-    baseline_acc = accuracy_score(y_test, baseline_preds)
+    baseline_acc = accuracy_score(y_test, baseline_model.predict(X_test))
-    # Compute top-k feature indices from attributions (use mode)
-    feature_counts = {}
-    for exp in explanations:
-        for fname, val in sorted(exp.explanation_data["feature_attributions"].items(), key=lambda x: abs(x[1]), reverse=True)[:top_k]:
-            try:
-                idx = exp.feature_names.index(fname)
-                feature_counts[idx] = feature_counts.get(idx, 0) + 1
-            except:
-                continue
-    top_features = sorted(feature_counts.items(), key=lambda x: x[1], reverse=True)[:top_k]
-    top_feature_indices = [idx for idx, _ in top_features]
-    # Remove top-k from training and test data
-    X_train_mod = copy.deepcopy(X_train)
-    X_test_mod = copy.deepcopy(X_test)
+    # Collect top-k feature indices via voting across explanations
+    feature_votes: Dict[int, int] = {}
-    # Prepare feature-wise baselines
-    # Compute or assign feature-wise baseline values
-    if not isinstance(
-    baseline_value,
-        (str, float, int, np.number, np.ndarray)
-    ) and not callable(baseline_value):
-        raise ValueError(f"Invalid baseline_value type: {type(baseline_value)}")
+    for exp in explanations:
+        attributions = exp.explanation_data.get("feature_attributions", {})
+        if not attributions:
+            continue
+        # Get feature_names from explanation
+        feature_names = getattr(exp, 'feature_names', None)
+        # Get top-k features by absolute importance
+        sorted_attrs = sorted(
+            attributions.items(),
+            key=lambda x: abs(x[1]),
+            reverse=True
+        )[:top_k]
+        for i, (fname, _) in enumerate(sorted_attrs):
+            idx = _extract_feature_index(fname, feature_names, fallback_index=i)
+            if 0 <= idx < n_features:
+                feature_votes[idx] = feature_votes.get(idx, 0) + 1
+    # Select most voted features
+    top_features = sorted(feature_votes.items(), key=lambda x: x[1], reverse=True)[:top_k]
+    top_indices = [idx for idx, _ in top_features]
+    if not top_indices:
+        return 0.0
+    # Compute baseline values
     if isinstance(baseline_value, str):
         if baseline_value == "mean":
             feature_baseline = np.mean(X_train, axis=0)
         elif baseline_value == "median":
             feature_baseline = np.median(X_train, axis=0)
         else:
-            raise ValueError(f"Unsupported string baseline: {baseline_value}")
+            raise ValueError(f"Unsupported baseline: {baseline_value}")
     elif callable(baseline_value):
         feature_baseline = baseline_value(X_train)
     elif isinstance(baseline_value, np.ndarray):
-        if baseline_value.shape != (X_train.shape[1],):
-            raise ValueError("baseline_value ndarray must match number of features")
         feature_baseline = baseline_value
-    elif isinstance(baseline_value, (float, int, np.number)):
-        feature_baseline = np.full(X_train.shape[1], baseline_value)
     else:
-        raise ValueError(f"Invalid baseline_value type: {type(baseline_value)}")
+        feature_baseline = np.full(n_features, float(baseline_value))
+    # Remove features
+    X_train_mod = X_train.copy()
+    X_test_mod = X_test.copy()
-    for idx in top_feature_indices:
+    for idx in top_indices:
         X_train_mod[:, idx] = feature_baseline[idx]
         X_test_mod[:, idx] = feature_baseline[idx]
-        # X_train_mod[:, idx] = baseline_value
-        # X_test_mod[:, idx] = baseline_value
     # Retrain and evaluate
     retrained_model = model_class(**model_kwargs)
     retrained_model.fit(X_train_mod, y_train)
-    retrained_preds = retrained_model.predict(X_test_mod)
-    retrained_acc = accuracy_score(y_test, retrained_preds)
+    retrained_acc = accuracy_score(y_test, retrained_model.predict(X_test_mod))
-    return baseline_acc - retrained_acc
+    return float(baseline_acc - retrained_acc)
 def compute_roar_curve(
     model_class,
-    X_train,
-    y_train,
-    X_test,
-    y_test,
-    explanations,
-    max_k=5,
-    baseline_value="mean",
-    model_kwargs=None
-) -> dict:
+    X_train: np.ndarray,
+    y_train: np.ndarray,
+    X_test: np.ndarray,
+    y_test: np.ndarray,
+    explanations: List[Explanation],
+    max_k: int = 5,
+    baseline_value: Union[str, float, np.ndarray, Callable] = "mean",
+    model_kwargs: Optional[Dict] = None
+) -> Dict[int, float]:
     """
-    Compute ROAR accuracy drops across a range of top-k features removed.
-    Args:
-        model_class: model type (e.g. LogisticRegression)
-        X_train, y_train, X_test, y_test: full dataset
-        explanations: list of Explanation objects
-        max_k: maximum top-k to try
-        baseline_value: string, scalar, ndarray, or callable
-        model_kwargs: passed to model class
+    Compute ROAR scores for k=1 to max_k.
     Returns:
-        Dict of {k: accuracy drop} for k in 1..max_k
+        Dict mapping k to accuracy drop
     """
-    from copy import deepcopy
     model_kwargs = model_kwargs or {}
     curve = {}
     for k in range(1, max_k + 1):
         acc_drop = compute_roar(
             model_class=model_class,
-            X_train=deepcopy(X_train),
-            y_train=deepcopy(y_train),
-            X_test=deepcopy(X_test),
-            y_test=deepcopy(y_test),
-            explanations=deepcopy(explanations),
+            X_train=X_train.copy(),
+            y_train=y_train.copy(),
+            X_test=X_test.copy(),
+            y_test=y_test.copy(),
+            explanations=explanations,
             top_k=k,
             baseline_value=baseline_value,
-            model_kwargs=deepcopy(model_kwargs)
+            model_kwargs=model_kwargs
         )
         curve[k] = acc_drop
-    return curve
+    return curve

explainiverse/explainers/attribution/lime_wrapper.py CHANGED Viewed

@@ -8,14 +8,36 @@ model (linear regression) to perturbed samples around the instance.
 Reference:
     Ribeiro, M.T., Singh, S., & Guestrin, C. (2016). "Why Should I Trust You?":
     Explaining the Predictions of Any Classifier. KDD 2016.
+    https://arxiv.org/abs/1602.04938
 """
 import numpy as np
-from lime.lime_tabular import LimeTabularExplainer
+from typing import List, Optional
 from explainiverse.core.explainer import BaseExplainer
 from explainiverse.core.explanation import Explanation
+# Lazy import check - don't import lime at module level
+_LIME_AVAILABLE = None
+def _check_lime_available():
+    """Check if LIME is available and raise ImportError if not."""
+    global _LIME_AVAILABLE
+    if _LIME_AVAILABLE is None:
+        try:
+            import lime
+            _LIME_AVAILABLE = True
+        except ImportError:
+            _LIME_AVAILABLE = False
+    if not _LIME_AVAILABLE:
+        raise ImportError(
+            "LIME is required for LimeExplainer. "
+            "Install it with: pip install lime"
+        )
 class LimeExplainer(BaseExplainer):
     """
@@ -34,9 +56,26 @@ class LimeExplainer(BaseExplainer):
         class_names: List of class names
         mode: 'classification' or 'regression'
         explainer: The underlying LimeTabularExplainer
+    Example:
+        >>> from explainiverse.explainers.attribution import LimeExplainer
+        >>> explainer = LimeExplainer(
+        ...     model=adapter,
+        ...     training_data=X_train,
+        ...     feature_names=feature_names,
+        ...     class_names=class_names
+        ... )
+        >>> explanation = explainer.explain(X_test[0])
     """
-    def __init__(self, model, training_data, feature_names, class_names, mode="classification"):
+    def __init__(
+        self,
+        model,
+        training_data: np.ndarray,
+        feature_names: List[str],
+        class_names: List[str],
+        mode: str = "classification"
+    ):
         """
         Initialize the LIME explainer.
@@ -47,20 +86,35 @@ class LimeExplainer(BaseExplainer):
             feature_names: List of feature names.
             class_names: List of class names.
             mode: 'classification' or 'regression'.
+        Raises:
+            ImportError: If lime package is not installed.
         """
+        # Check availability before importing
+        _check_lime_available()
+        # Import after check passes
+        from lime.lime_tabular import LimeTabularExplainer
         super().__init__(model)
         self.feature_names = list(feature_names)
         self.class_names = list(class_names)
         self.mode = mode
+        self.training_data = np.asarray(training_data)
         self.explainer = LimeTabularExplainer(
-            training_data=training_data,
-            feature_names=feature_names,
-            class_names=class_names,
+            training_data=self.training_data,
+            feature_names=self.feature_names,
+            class_names=self.class_names,
             mode=mode
         )
-    def explain(self, instance, num_features=5, top_labels=1):
+    def explain(
+        self,
+        instance: np.ndarray,
+        num_features: int = 5,
+        top_labels: int = 1
+    ) -> Explanation:
         """
         Generate a local explanation for the given instance.
@@ -72,6 +126,8 @@ class LimeExplainer(BaseExplainer):
         Returns:
             Explanation object with feature attributions
         """
+        instance = np.asarray(instance).flatten()
         lime_exp = self.explainer.explain_instance(
             data_row=instance,
             predict_fn=self.model.predict,
@@ -86,5 +142,32 @@ class LimeExplainer(BaseExplainer):
         return Explanation(
             explainer_name="LIME",
             target_class=label_name,
-            explanation_data={"feature_attributions": attributions}
+            explanation_data={"feature_attributions": attributions},
+            feature_names=self.feature_names
         )
+    def explain_batch(
+        self,
+        X: np.ndarray,
+        num_features: int = 5,
+        top_labels: int = 1
+    ) -> List[Explanation]:
+        """
+        Generate explanations for multiple instances.
+        Args:
+            X: 2D numpy array of instances
+            num_features: Number of features per explanation
+            top_labels: Number of top labels to explain
+        Returns:
+            List of Explanation objects
+        """
+        X = np.asarray(X)
+        if X.ndim == 1:
+            X = X.reshape(1, -1)
+        return [
+            self.explain(X[i], num_features=num_features, top_labels=top_labels)
+            for i in range(X.shape[0])
+        ]

explainiverse 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl

explainiverse 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl