PyPI - fast-causal-shap - Versions diffs - 0.1.3__py3-none-any.whl - Mend

fast-causal-shap 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

fast_causal_shap/__init__.py +9 -0
fast_causal_shap/core.py +444 -0
fast_causal_shap-0.1.3.dist-info/METADATA +104 -0
fast_causal_shap-0.1.3.dist-info/RECORD +7 -0
fast_causal_shap-0.1.3.dist-info/WHEEL +5 -0
fast_causal_shap-0.1.3.dist-info/licenses/LICENSE +21 -0
fast_causal_shap-0.1.3.dist-info/top_level.txt +1 -0

fast_causal_shap/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+"""Fast Causal SHAP - A Python package for efficient causal SHAP computations."""
+from .core import FastCausalSHAP
+__version__ = "0.1.0"
+__author__ = "woonyee28"
+__email__ = "ngnwy289@gmail.com"
+__all__ = ["FastCausalSHAP"]

fast_causal_shap/core.py ADDED Viewed

@@ -0,0 +1,444 @@
+import json
+import logging
+from collections import defaultdict
+from math import factorial
+from typing import Any, Dict, List, Optional, Tuple
+import networkx as nx
+import numpy as np
+import pandas as pd
+from sklearn.linear_model import LinearRegression
+logger = logging.getLogger(__name__)
+class FastCausalSHAP:
+    def __init__(self, data: pd.DataFrame, model: Any, target_variable: str) -> None:
+        """
+        Initialize FastCausalSHAP with data, model, and target variable.
+        Parameters
+        ----------
+        data : pd.DataFrame
+            The dataset containing features and target variable.
+            Must not be empty.
+        model : Any
+            A fitted sklearn model with predict() method and feature_names_in_ attribute
+            Can be a classifier or regressor.
+        target_variable : str
+            The name of the target variable column in the data.
+            Must exist in data.columns.
+        Raises
+        ------
+        TypeError
+            If data is not a pandas DataFrame.
+        ValueError
+            If data is empty or target_variable not in data columns.
+        AttributeError
+            If model doesn't have required methods/attributes.
+        Examples
+        --------
+        >>> from sklearn.ensemble import RandomForestRegressor
+        >>> import pandas as pd
+        >>>
+        >>> data = pd.DataFrame({'X1': [1, 2, 3], 'X2': [4, 5, 6], 'Y': [7, 8, 9]})
+        >>> model = RandomForestRegressor()
+        >>> model.fit(data[['X1', 'X2']], data['Y'])
+        >>>
+        >>> shap = FastCausalSHAP(data, model, 'Y')
+        """
+        if not isinstance(data, pd.DataFrame):
+            raise TypeError("data must be a pandas DataFrame")
+        if data.empty:
+            raise ValueError("data must not be empty")
+        if target_variable not in data.columns:
+            raise ValueError(
+                f"target_variable '{target_variable}' not found in data columns. "
+                f"Available columns: {list(data.columns)}"
+            )
+        if not hasattr(model, "predict"):
+            raise AttributeError("model must have a predict method")
+        if not hasattr(model, "feature_names_in_"):
+            raise AttributeError(
+                "model must have 'feature_names_in_' attribute. "
+                "Ensure the model has been fitted before passing it."
+            )
+        self.data: pd.DataFrame = data
+        self.model: Any = model
+        self.gamma: Optional[Dict[str, float]] = None
+        self.target_variable: str = target_variable
+        self.ida_graph: Optional[nx.DiGraph] = None
+        self.regression_models: Dict[Tuple[str, Tuple[str, ...]], Tuple[Any, float]] = (
+            {}
+        )
+        self.feature_depths: Dict[str, int] = {}
+        self.path_cache: Dict[Any, float] = {}
+        self.causal_paths: Dict[str, List[List[str]]] = {}
+    def remove_cycles(self) -> List[Tuple[str, str, float]]:
+        """
+        Detects cycles in the graph and removes edges causing cycles.
+        Returns a list of removed edges.
+        """
+        if self.ida_graph is None:
+            return []
+        G = self.ida_graph.copy()
+        removed_edges = []
+        # Find all cycles in the graph
+        try:
+            cycles = list(nx.simple_cycles(G))
+        except nx.NetworkXNoCycle:
+            return []  # No cycles found
+        while cycles:
+            # Get the current cycle
+            cycle = cycles[0]
+            # Find the edge with the smallest weight in the cycle
+            min_weight = float("inf")
+            edge_to_remove = None
+            for i in range(len(cycle)):
+                source = cycle[i]
+                target = cycle[(i + 1) % len(cycle)]
+                if G.has_edge(source, target):
+                    weight = abs(G[source][target]["weight"])
+                    if weight < min_weight:
+                        min_weight = weight
+                        edge_to_remove = (source, target)
+            if edge_to_remove:
+                # Remove the edge with the smallest weight
+                G.remove_edge(*edge_to_remove)
+                removed_edges.append(
+                    (
+                        edge_to_remove[0],
+                        edge_to_remove[1],
+                        self.ida_graph[edge_to_remove[0]][edge_to_remove[1]]["weight"],
+                    )
+                )
+                # Recalculate cycles after removing an edge
+                try:
+                    cycles = list(nx.simple_cycles(G))
+                except nx.NetworkXNoCycle:
+                    cycles = []  # No more cycles
+            else:
+                break
+        # Update the graph
+        self.ida_graph = G
+        return removed_edges
+    def _compute_causal_paths(self) -> None:
+        """Compute and store all causal paths to target for each feature."""
+        features = [col for col in self.data.columns if col != self.target_variable]
+        for feature in features:
+            try:
+                # Store the actual paths instead of just the features
+                paths = list(
+                    nx.all_simple_paths(self.ida_graph, feature, self.target_variable)
+                )
+                self.causal_paths[feature] = paths
+            except nx.NetworkXNoPath:
+                self.causal_paths[feature] = []
+    def load_causal_strengths(self, json_file_path: str) -> Dict[str, float]:
+        """Load causal strengths from JSON file and compute gamma values."""
+        if not isinstance(json_file_path, str):
+            raise TypeError("json_file_path must be a string")
+        import os
+        if not os.path.isfile(json_file_path):
+            raise ValueError("json_file_path must be a valid file path")
+        try:
+            with open(json_file_path, "r") as f:
+                causal_effects_list = json.load(f)
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Invalid JSON file: {json_file_path}. Error: {e}")
+        if not isinstance(causal_effects_list, list):
+            raise ValueError(
+                f"JSON file must has a list, got {type(causal_effects_list).__name__}"
+            )
+        if not causal_effects_list:
+            raise ValueError("JSON file contains an empty list")
+        G = nx.DiGraph()
+        nodes = list(self.data.columns)
+        G.add_nodes_from(nodes)
+        for item in causal_effects_list:
+            pair = item["Pair"]
+            mean_causal_effect = item["Mean_Causal_Effect"]
+            if mean_causal_effect is None:
+                continue
+            source, target = pair.split("->")
+            source = source.strip()
+            target = target.strip()
+            G.add_edge(source, target, weight=mean_causal_effect)
+        self.ida_graph = G.copy()
+        removed_edges = self.remove_cycles()
+        if removed_edges:
+            logger.info(
+                f"Removed {len(removed_edges)} edges to make the graph acyclic:"
+            )
+            for source, target, weight in removed_edges:
+                logger.info(f"  {source} -> {target} (weight: {weight})")
+        self._compute_feature_depths()
+        self._compute_causal_paths()
+        features = self.data.columns.tolist()
+        beta_dict = {}
+        for feature in features:
+            if feature == self.target_variable:
+                continue
+            try:
+                paths = list(
+                    nx.all_simple_paths(G, source=feature, target=self.target_variable)
+                )
+            except nx.NetworkXNoPath:
+                continue
+            total_effect = 0
+            for path in paths:
+                effect = 1
+                for i in range(len(path) - 1):
+                    edge_weight = G[path[i]][path[i + 1]]["weight"]
+                    effect *= edge_weight
+                total_effect += effect
+            if total_effect != 0:
+                beta_dict[feature] = total_effect
+        total_causal_effect = sum(abs(beta) for beta in beta_dict.values())
+        if total_causal_effect == 0:
+            self.gamma = {k: 0.0 for k in features}
+        else:
+            self.gamma = {
+                k: abs(beta_dict.get(k, 0.0)) / total_causal_effect for k in features
+            }
+        return self.gamma
+    def _compute_feature_depths(self) -> None:
+        """Compute minimum depth of each feature to target in causal graph."""
+        features = [col for col in self.data.columns if col != self.target_variable]
+        for feature in features:
+            try:
+                all_paths = list(
+                    nx.all_simple_paths(self.ida_graph, feature, self.target_variable)
+                )
+                if all_paths:
+                    min_depth = min(len(path) - 1 for path in all_paths)
+                    self.feature_depths[feature] = min_depth
+            except nx.NetworkXNoPath:
+                continue
+    def get_topological_order(self, S: List[str]) -> List[str]:
+        """Returns the topological order of variables after intervening on subset S."""
+        if self.ida_graph is None:
+            return []
+        G_intervened = self.ida_graph.copy()
+        for feature in S:
+            G_intervened.remove_edges_from(list(G_intervened.in_edges(feature)))
+        missing_nodes = set(self.data.columns) - set(G_intervened.nodes)
+        G_intervened.add_nodes_from(missing_nodes)
+        try:
+            order = list(nx.topological_sort(G_intervened))
+        except nx.NetworkXUnfeasible:
+            raise ValueError("The causal graph contains cycles.")
+        return order
+    def get_parents(self, feature: str) -> List[str]:
+        """Returns the parent features for a given feature in the causal graph."""
+        if self.ida_graph is None:
+            return []
+        return list(self.ida_graph.predecessors(feature))
+    def sample_marginal(self, feature: str) -> float:
+        """Sample a value from the marginal distribution of the specified feature."""
+        return self.data[feature].sample(1).iloc[0]
+    def sample_conditional(
+        self, feature: str, parent_values: Dict[str, float]
+    ) -> float:
+        """Sample a value for a feature conditioned on its parent features."""
+        effective_parents = [
+            p for p in self.get_parents(feature) if p != self.target_variable
+        ]
+        if not effective_parents:
+            return self.sample_marginal(feature)
+        model_key = (feature, tuple(sorted(effective_parents)))
+        if model_key not in self.regression_models:
+            X = self.data[effective_parents].values
+            y = self.data[feature].values
+            reg = LinearRegression()
+            reg.fit(X, y)
+            residuals = y - reg.predict(X)
+            std = residuals.std()
+            self.regression_models[model_key] = (reg, std)
+        reg, std = self.regression_models[model_key]
+        parent_values_array = np.array(
+            [parent_values[parent] for parent in effective_parents]
+        ).reshape(1, -1)
+        mean = reg.predict(parent_values_array)[0]
+        sampled_value = np.random.normal(mean, std)
+        return sampled_value
+    def compute_v_do(
+        self, S: List[str], x_S: Dict[str, float], is_classifier: bool = False
+    ) -> float:
+        """Compute interventional expectations with caching."""
+        cache_key = (
+            frozenset(S),
+            tuple(sorted(x_S.items())) if len(x_S) > 0 else tuple(),
+        )
+        if cache_key in self.path_cache:
+            return self.path_cache[cache_key]
+        variables_order = self.get_topological_order(S)
+        sample = {}
+        for feature in S:
+            sample[feature] = x_S[feature]
+        for feature in variables_order:
+            if feature in S or feature == self.target_variable:
+                continue
+            parents = self.get_parents(feature)
+            parent_values = {
+                p: x_S[p] if p in S else sample[p]
+                for p in parents
+                if p != self.target_variable
+            }
+            if not parent_values:
+                sample[feature] = self.sample_marginal(feature)
+            else:
+                sample[feature] = self.sample_conditional(feature, parent_values)
+        intervened_data = pd.DataFrame([sample])
+        intervened_data = intervened_data[self.model.feature_names_in_]
+        if is_classifier:
+            probas = self.model.predict_proba(intervened_data)[:, 1]
+        else:
+            probas = self.model.predict(intervened_data)
+        result = float(np.mean(probas))
+        self.path_cache[cache_key] = result
+        return result
+    def is_on_causal_path(self, feature: str, target_feature: str) -> bool:
+        """Check if feature is on any causal path from S to target_feature."""
+        if target_feature not in self.causal_paths:
+            return False
+        path_features = self.causal_paths[target_feature]
+        return feature in path_features
+    def compute_modified_shap_proba(
+        self, x: pd.Series, is_classifier: bool = False
+    ) -> Dict[str, float]:
+        """TreeSHAP-inspired computation using causal paths and dynamic programming."""
+        if self.gamma is None:
+            raise ValueError(
+                "Must call load_causal_strengths before computing SHAP values"
+            )
+        if not isinstance(x, pd.Series):
+            raise TypeError(f"x must be a pandas Series, got {type(x).__name__}")
+        # validate x contains required features
+        required_features = self.model.feature_names_in_
+        missing_features = set(required_features) - set(x.index)
+        if missing_features:
+            raise ValueError(
+                f"x is missing required features: {missing_features}. "
+                f"Required features: {list(required_features)}"
+            )
+        features = [col for col in self.data.columns if col != self.target_variable]
+        phi_causal = {feature: 0.0 for feature in features}
+        data_without_target = self.data.drop(columns=[self.target_variable])
+        if is_classifier:
+            E_fX = self.model.predict_proba(data_without_target)[:, 1].mean()
+        else:
+            E_fX = self.model.predict(data_without_target).mean()
+        x_ordered = x[self.model.feature_names_in_]
+        if is_classifier:
+            f_x = self.model.predict_proba(x_ordered.to_frame().T)[0][1]
+        else:
+            f_x = self.model.predict(x_ordered.to_frame().T)[0]
+        sorted_features = sorted(features, key=lambda f: self.feature_depths.get(f, 0))
+        max_path_length = max(self.feature_depths.values(), default=0)
+        shapley_weights = {}
+        for m in range(max_path_length + 1):
+            for d in range(m + 1, max_path_length + 1):
+                shapley_weights[(m, d)] = (
+                    factorial(m) * factorial(d - m - 1)
+                ) / factorial(d)
+        # Track contributions using dynamic programming (EXTEND-like logic in TreeSHAP)
+        # m_values will accumulate contributions from subsets (use combinatorial logic)
+        # Essentially, values in m_values[k] represent how many ways there are
+        # to select k nodes from the path seen so far.
+        for feature in sorted_features:
+            if feature not in self.causal_paths:
+                continue
+            for path in self.causal_paths[feature]:
+                path_features = [n for n in path if n != self.target_variable]
+                d = len(path_features)
+                m_values = defaultdict(float)
+                m_values[0] = 1.0
+                for node in path_features:
+                    if node == feature:
+                        continue
+                    new_m_values: defaultdict[int, float] = defaultdict(float)
+                    for m, val in m_values.items():
+                        new_m_values[m + 1] += val
+                        new_m_values[m] += val
+                    m_values = new_m_values
+                for m in m_values:
+                    weight = shapley_weights.get((m, d), 0) * self.gamma.get(feature, 0)
+                    delta_v = self._compute_path_delta_v(
+                        feature, path, m, x, is_classifier
+                    )
+                    phi_causal[feature] += weight * delta_v
+        sum_phi = sum(phi_causal.values())
+        if sum_phi != 0:
+            scaling_factor = (f_x - E_fX) / sum_phi
+            phi_causal = {k: v * scaling_factor for k, v in phi_causal.items()}
+        return phi_causal
+    def _compute_path_delta_v(
+        self, feature: str, path: List[str], m: int, x: pd.Series, is_classifier: bool
+    ) -> float:
+        """Compute Δv for a causal path using precomputed expectations."""
+        S = [n for n in path[:m] if n != feature]
+        x_S = {n: x[n] for n in S if n in x}
+        v_S = self.compute_v_do(S, x_S, is_classifier)
+        S_with_i = S + [feature]
+        x_Si = {**x_S, feature: x[feature]}
+        v_Si = self.compute_v_do(S_with_i, x_Si, is_classifier)
+        return v_Si - v_S

fast_causal_shap-0.1.3.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,104 @@
+Metadata-Version: 2.4
+Name: fast-causal-shap
+Version: 0.1.3
+Summary: A Python package for efficient causal SHAP computations
+Author-email: woonyee28 <ngnwy289@gmail.com>
+License: MIT
+Project-URL: Homepage, https://github.com/woonyee28/CausalSHAP
+Project-URL: Issues, https://github.com/woonyee28/CausalSHAP/issues
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: pandas>=1.0.0
+Requires-Dist: networkx>=2.0
+Requires-Dist: numpy>=1.18.0
+Requires-Dist: scikit-learn>=0.24.0
+Provides-Extra: dev
+Requires-Dist: pytest>=6.0; extra == "dev"
+Requires-Dist: black>=21.0; extra == "dev"
+Requires-Dist: flake8>=3.8; extra == "dev"
+Requires-Dist: mypy>=0.800; extra == "dev"
+Requires-Dist: isort>=5.0; extra == "dev"
+Requires-Dist: pytest-cov>=2.0; extra == "dev"
+Requires-Dist: pre-commit>=2.0; extra == "dev"
+Dynamic: license-file
+# Fast Causal SHAP
+Fast Causal SHAP is a Python package designed for efficient and interpretable SHAP value computation in causal inference tasks. It integrates seamlessly with various causal inference frameworks and enables feature attribution with awareness of causal dependencies.
+## Features
+- Fast computation of SHAP values for causal models
+- Support for multiple causal inference frameworks
+## Installation
+Install the stable version via PyPI:
+```bash
+pip install fast-causal-shap
+```
+Or, for the latest development version:
+```bash
+pip install git+https://github.com/woonyee28/CausalSHAP.git
+```
+## Usage
+```
+from fast_causal_shap.core import FastCausalSHAP
+# Predict probabilities and assign to training data
+predicted_probabilities = model.predict_proba(X_train)[:,1]
+X_train['target'] = predicted_probabilities
+# Initialize FastCausalSHAP
+ci = FastCausalSHAP(data=X_train, model=model, target_variable='target')
+# Load causal strengths (precomputed using R packages)
+ci.load_causal_strengths(result_dir + 'Causal_Effect.json')
+# Compute modified SHAP values for a single instance
+x_instance = X_train.iloc[33]
+print(ci.compute_modified_shap_proba(x_instance, is_classifier=True))
+```
+Format of the Causal_Effect.json:
+```
+[
+  {
+    "Pair": "Bacteroidia->Clostridia",
+    "Mean_Causal_Effect": 0.71292
+  },
+  {
+    "Pair": "Clostridia->Alphaproteobacteria",
+    "Mean_Causal_Effect": 0.37652
+  }, ......
+]
+```
+Fast Causal SHAP supports integration with structural algorithms such as:
+1. Peter-Clark (PC) Algorithm
+2. IDA Algorithm
+3. Fast Causal Inference (FCI) Algorithm
+You can find example R code for these integrations here: [FastCausalSHAP R code examples](https://github.com/woonyee28/CausalSHAP/tree/main/code/r)
+## Citation
+If you use Fast Causal SHAP in your research, please cite:
+```
+@inproceedings{ng2025causal,
+  title={Causal SHAP: Feature Attribution with Dependency Awareness through Causal Discovery},
+  author={Ng, Woon Yee and Wang, Li Rong and Liu, Siyuan and Fan, Xiuyi},
+  booktitle={Proceedings of the International Joint Conference on Neural Networks (IJCNN)},
+  year={2025},
+  organization={IEEE}
+}
+```
+## License
+This project is licensed under the MIT License.

fast_causal_shap-0.1.3.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,7 @@
+fast_causal_shap/__init__.py,sha256=n62hNTwd-c9-gHOpO5BGHOiq-jKhykeDPFBmydAmJy0,227
+fast_causal_shap/core.py,sha256=XfUhKHd4pmLMe6BF9igLVT0KQDpoIbRVo1VjpnY6csE,17056
+fast_causal_shap-0.1.3.dist-info/licenses/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
+fast_causal_shap-0.1.3.dist-info/METADATA,sha256=IyQJ_QjzMOSd8fp1EM6YSkS5jLlDGbdlqu0_CnoUG-0,3102
+fast_causal_shap-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+fast_causal_shap-0.1.3.dist-info/top_level.txt,sha256=nAIqoFfVB4g6cJal-o9z4LmDYIX1lj1x15oJrlsT_4E,17
+fast_causal_shap-0.1.3.dist-info/RECORD,,

fast_causal_shap-0.1.3.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (80.9.0)
+Root-Is-Purelib: true
+Tag: py3-none-any

fast_causal_shap-0.1.3.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) [year] [fullname]
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

fast_causal_shap-0.1.3.dist-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ fast_causal_shap