churnkit-0.75.0a1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0

customer_retention/analysis/interpretability/cohort_analyzer.py
@@ -0,0 +1,185 @@
"""Cohort-level interpretability analysis."""

from dataclasses import dataclass, field
from typing import Any, Dict, List

import numpy as np
import shap

from customer_retention.core.compat import DataFrame, Series, pd


@dataclass
class CohortInsight:
    cohort_name: str
    cohort_size: int
    cohort_percentage: float
    churn_rate: float
    top_features: List[Dict[str, float]]
    key_differentiators: List[str] = field(default_factory=list)
    recommended_strategy: str = ""


@dataclass
class CohortComparison:
    cohort_a: str
    cohort_b: str
    feature_differences: Dict[str, float]
    churn_rate_difference: float
    key_differences: List[str] = field(default_factory=list)


@dataclass
class CohortAnalysisResult:
    cohort_insights: List[CohortInsight]
    key_differences: List[str]
    overall_summary: str = ""


class CohortAnalyzer:
    def __init__(self, model: Any, background_data: DataFrame, max_samples: int = 100):
        self.model = model
        self.background_data = background_data.head(max_samples)
        self._explainer = self._create_explainer()

    def _create_explainer(self) -> shap.Explainer:
        model_type = type(self.model).__name__
        if model_type in ["RandomForestClassifier", "GradientBoostingClassifier"]:
            return shap.TreeExplainer(self.model)
        return shap.KernelExplainer(self.model.predict_proba, self.background_data)

    def analyze(self, X: DataFrame, y: Series, cohorts: Series) -> CohortAnalysisResult:
        unique_cohorts = cohorts.unique()
        insights = []
        all_features_by_cohort = {}
        for cohort in unique_cohorts:
            mask = cohorts == cohort
            cohort_X = X[mask]
            cohort_y = y[mask]
            churn_rate = float(1 - cohort_y.mean())
            top_features = self._get_cohort_feature_importance(cohort_X)
            all_features_by_cohort[cohort] = top_features
            strategy = self._generate_strategy(cohort, churn_rate, top_features)
            insights.append(CohortInsight(
                cohort_name=cohort,
                cohort_size=len(cohort_X),
                cohort_percentage=len(cohort_X) / len(X),
                churn_rate=churn_rate,
                top_features=top_features,
                recommended_strategy=strategy
            ))
        key_differences = self._identify_key_differences(all_features_by_cohort, insights)
        for insight in insights:
            insight.key_differentiators = self._get_differentiators(insight.cohort_name, all_features_by_cohort)
        return CohortAnalysisResult(
            cohort_insights=insights,
            key_differences=key_differences
        )

    def _get_cohort_feature_importance(self, cohort_X: DataFrame) -> List[Dict[str, float]]:
        if len(cohort_X) == 0:
            return []
        sample = cohort_X.head(min(50, len(cohort_X)))
        shap_values = self._extract_shap_values(sample)
        mean_abs_shap = np.abs(shap_values).mean(axis=0)
        sorted_indices = np.argsort(mean_abs_shap)[::-1][:5]
        result = []
        for idx in sorted_indices:
            importance_val = mean_abs_shap[idx]
            if hasattr(importance_val, '__len__') and len(importance_val) == 1:
                importance_val = importance_val[0]
            result.append({"feature": cohort_X.columns[idx], "importance": float(importance_val)})
        return result

    def _extract_shap_values(self, X: DataFrame) -> np.ndarray:
        shap_values = self._explainer.shap_values(X)
        if hasattr(shap_values, 'values'):
            shap_values = shap_values.values
        if isinstance(shap_values, list):
            shap_values = shap_values[1]
        if len(shap_values.shape) == 3:
            shap_values = shap_values[:, :, 1]
        return shap_values

    def _generate_strategy(self, cohort: str, churn_rate: float,
                           top_features: List[Dict[str, float]]) -> str:
        if churn_rate > 0.5:
            priority = "urgent intervention"
        elif churn_rate > 0.3:
            priority = "proactive engagement"
        else:
            priority = "standard nurturing"
        top_feature = top_features[0]["feature"] if top_features else "engagement"
        return f"Focus on {top_feature} with {priority} for {cohort} cohort"

    def _identify_key_differences(self, features_by_cohort: Dict[str, List[Dict[str, float]]],
                                  insights: List[CohortInsight]) -> List[str]:
        differences = []
        churn_rates = {i.cohort_name: i.churn_rate for i in insights}
        if churn_rates:
            max_cohort = max(churn_rates, key=churn_rates.get)
            min_cohort = min(churn_rates, key=churn_rates.get)
            diff = churn_rates[max_cohort] - churn_rates[min_cohort]
            differences.append(f"{max_cohort} has {diff:.1%} higher churn than {min_cohort}")
        for cohort, features in features_by_cohort.items():
            if features:
                top = features[0]["feature"]
                differences.append(f"{cohort}: top driver is {top}")
        return differences

    def _get_differentiators(self, cohort: str,
                             features_by_cohort: Dict[str, List[Dict[str, float]]]) -> List[str]:
        cohort_features = features_by_cohort.get(cohort, [])
        cohort_top = set(f["feature"] for f in cohort_features[:3])
        other_tops = set()
        for other, features in features_by_cohort.items():
            if other != cohort:
                other_tops.update(f["feature"] for f in features[:3])
        unique = cohort_top - other_tops
        return [f"{cohort} uniquely driven by {f}" for f in unique]

    def compare_cohorts(self, X: DataFrame, y: Series, cohorts: Series,
                        cohort_a: str, cohort_b: str) -> CohortComparison:
        mask_a = cohorts == cohort_a
        mask_b = cohorts == cohort_b
        churn_a = 1 - y[mask_a].mean()
        churn_b = 1 - y[mask_b].mean()
        feature_diffs = {}
        for col in X.columns:
            mean_a = X.loc[mask_a, col].mean()
            mean_b = X.loc[mask_b, col].mean()
            feature_diffs[col] = float(mean_a - mean_b)
        key_diffs = []
        sorted_diffs = sorted(feature_diffs.items(), key=lambda x: abs(x[1]), reverse=True)
        for feature, diff in sorted_diffs[:3]:
            direction = "higher" if diff > 0 else "lower"
            key_diffs.append(f"{cohort_a} has {direction} {feature} than {cohort_b}")
        return CohortComparison(
            cohort_a=cohort_a,
            cohort_b=cohort_b,
            feature_differences=feature_diffs,
            churn_rate_difference=float(churn_a - churn_b),
            key_differences=key_diffs
        )

    @staticmethod
    def create_tenure_cohorts(tenure: Series,
                              bins: List[float] = None) -> Series:
        bins = bins or [0, 90, 365, float("inf")]
        labels = ["New", "Established", "Mature"]
        return pd.cut(tenure, bins=bins, labels=labels)

    @staticmethod
    def create_value_cohorts(value: Series,
                             quantiles: List[float] = None) -> Series:
        quantiles = quantiles or [0.33, 0.66]
        q1, q2 = value.quantile(quantiles[0]), value.quantile(quantiles[1])
        return pd.cut(value, bins=[-float("inf"), q1, q2, float("inf")],
                      labels=["Low", "Medium", "High"])

    @staticmethod
    def create_activity_cohorts(activity: Series,
                                thresholds: List[float] = None) -> Series:
        thresholds = thresholds or [5, 15]
        return pd.cut(activity, bins=[-float("inf"), thresholds[0], thresholds[1], float("inf")],
                      labels=["Dormant", "Moderate", "Active"])
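
For orientation, a minimal usage sketch (not shipped in the wheel): it builds a synthetic frame, fits a scikit-learn RandomForestClassifier, and runs the cohort analysis. The column names, the synthetic data, and the assumption that y == 1 means retained (so that the module's churn_rate = 1 - mean(y) reads as a churn rate) are all illustrative.

    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier

    from customer_retention.analysis.interpretability.cohort_analyzer import CohortAnalyzer

    # Synthetic stand-in data; real callers would pass their own feature frame.
    rng = np.random.default_rng(0)
    X = pd.DataFrame({
        "tenure_days": rng.integers(1, 1000, 500).astype(float),
        "monthly_spend": rng.gamma(2.0, 50.0, 500),
        "support_tickets": rng.poisson(2.0, 500).astype(float),
    })
    y = pd.Series((rng.random(500) < 0.7).astype(int))  # assumed: 1 = retained
    model = RandomForestClassifier(n_estimators=50, random_state=0).fit(X, y)

    analyzer = CohortAnalyzer(model, background_data=X)  # RF takes the TreeExplainer path
    cohorts = CohortAnalyzer.create_tenure_cohorts(X["tenure_days"])  # New / Established / Mature
    result = analyzer.analyze(X, y, cohorts)
    for insight in result.cohort_insights:
        print(f"{insight.cohort_name}: churn={insight.churn_rate:.1%} -> {insight.recommended_strategy}")
    print(result.key_differences)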

customer_retention/analysis/interpretability/counterfactual.py
@@ -0,0 +1,175 @@
"""Counterfactual explanation generation."""

from dataclasses import dataclass
from typing import Any, Dict, List, Optional

import numpy as np

from customer_retention.core.compat import DataFrame, Series


@dataclass
class CounterfactualChange:
    feature_name: str
    original_value: float
    new_value: float
    change_magnitude: float


@dataclass
class Counterfactual:
    original_prediction: float
    counterfactual_prediction: float
    changes: List[CounterfactualChange]
    feasibility_score: float
    business_interpretation: str


class CounterfactualGenerator:
    def __init__(self, model: Any, reference_data: DataFrame,
                 actionable_features: Optional[List[str]] = None,
                 constraints: Optional[Dict[str, Dict[str, float]]] = None):
        self.model = model
        self.reference_data = reference_data
        self.actionable_features = actionable_features or list(reference_data.columns)
        self.constraints = constraints or {}
        self._feature_bounds = self._calculate_bounds()

    def _calculate_bounds(self) -> Dict[str, Dict[str, float]]:
        bounds = {}
        for col in self.reference_data.columns:
            bounds[col] = {
                "min": float(self.reference_data[col].min()),
                "max": float(self.reference_data[col].max()),
                "mean": float(self.reference_data[col].mean()),
                "std": float(self.reference_data[col].std())
            }
        return bounds

    def generate(self, instance: Series, target_class: int = 0,
                 max_iterations: int = 100) -> Counterfactual:
        instance_df = instance.to_frame().T
        original_pred = float(self.model.predict_proba(instance_df)[0, 1])
        best_cf = instance.copy()
        best_pred = original_pred
        best_changes = []
        target_pred = 0.3 if target_class == 0 else 0.7
        for _ in range(max_iterations):
            candidate = self._perturb_instance(instance, best_cf)
            candidate_df = candidate.to_frame().T
            pred = float(self.model.predict_proba(candidate_df)[0, 1])
            improved = (target_class == 0 and pred < best_pred) or (target_class == 1 and pred > best_pred)
            if improved:
                best_cf = candidate
                best_pred = pred
                best_changes = self._compute_changes(instance, best_cf)
            if (target_class == 0 and best_pred < target_pred) or (target_class == 1 and best_pred > target_pred):
                break
        feasibility = self._calculate_feasibility(instance, best_cf)
        interpretation = self._generate_interpretation(best_changes, original_pred, best_pred)
        return Counterfactual(
            original_prediction=original_pred,
            counterfactual_prediction=best_pred,
            changes=best_changes,
            feasibility_score=feasibility,
            business_interpretation=interpretation
        )

    def _perturb_instance(self, original: Series, current: Series) -> Series:
        candidate = current.copy()
        feature = np.random.choice(self.actionable_features)
        bounds = self._get_feature_bounds(feature)
        current_val = candidate[feature]
        step = (bounds["max"] - bounds["min"]) * 0.1
        direction = np.random.choice([-1, 1])
        new_val = current_val + direction * step * np.random.uniform(0.5, 1.5)
        new_val = np.clip(new_val, bounds["min"], bounds["max"])
        candidate[feature] = new_val
        return candidate

    def _get_feature_bounds(self, feature: str) -> Dict[str, float]:
        if feature in self.constraints:
            constraint = self.constraints[feature]
            return {
                "min": constraint.get("min", self._feature_bounds[feature]["min"]),
                "max": constraint.get("max", self._feature_bounds[feature]["max"])
            }
        return self._feature_bounds[feature]

    def _compute_changes(self, original: Series, counterfactual: Series) -> List[CounterfactualChange]:
        changes = []
        for feature in self.actionable_features:
            if abs(original[feature] - counterfactual[feature]) > 1e-6:
                changes.append(CounterfactualChange(
                    feature_name=feature,
                    original_value=float(original[feature]),
                    new_value=float(counterfactual[feature]),
                    change_magnitude=float(abs(original[feature] - counterfactual[feature]))
                ))
        return changes

    def _calculate_feasibility(self, original: Series, counterfactual: Series) -> float:
        total_change = 0
        max_change = 0
        for feature in self.actionable_features:
            bounds = self._feature_bounds[feature]
            range_size = bounds["max"] - bounds["min"]
            if range_size > 0:
                normalized_change = abs(original[feature] - counterfactual[feature]) / range_size
                total_change += normalized_change
            max_change += 1
        if max_change == 0:
            return 1.0
        feasibility = 1 - (total_change / max_change)
        return max(0.0, min(1.0, feasibility))

    def _generate_interpretation(self, changes: List[CounterfactualChange],
                                 original_pred: float, new_pred: float) -> str:
        if not changes:
            return "No changes needed to achieve target prediction."
        change_strs = []
        for c in changes[:3]:
            direction = "increase" if c.new_value > c.original_value else "decrease"
            change_strs.append(f"{direction} {c.feature_name} from {c.original_value:.2f} to {c.new_value:.2f}")
        changes_text = ", ".join(change_strs)
        return f"To reduce churn risk from {original_pred:.1%} to {new_pred:.1%}: {changes_text}"

    def generate_diverse(self, instance: Series, n: int = 3) -> List[Counterfactual]:
        counterfactuals = []
        used_features = set()
        for _ in range(n):
            available = [f for f in self.actionable_features if f not in used_features]
            if not available:
                available = self.actionable_features
            temp_generator = CounterfactualGenerator(
                self.model, self.reference_data,
                actionable_features=available,
                constraints=self.constraints
            )
            cf = temp_generator.generate(instance)
            counterfactuals.append(cf)
            for change in cf.changes:
                used_features.add(change.feature_name)
        return counterfactuals

    def generate_prototype(self, instance: Series, prototype_data: DataFrame) -> Counterfactual:
        instance_df = instance.to_frame().T
        original_pred = float(self.model.predict_proba(instance_df)[0, 1])
        prototype = prototype_data.mean()
        best_cf = instance.copy()
        for feature in self.actionable_features:
            bounds = self._get_feature_bounds(feature)
            target_val = np.clip(prototype[feature], bounds["min"], bounds["max"])
            best_cf[feature] = instance[feature] + 0.5 * (target_val - instance[feature])
        cf_df = best_cf.to_frame().T
        new_pred = float(self.model.predict_proba(cf_df)[0, 1])
        changes = self._compute_changes(instance, best_cf)
        feasibility = self._calculate_feasibility(instance, best_cf)
        interpretation = self._generate_interpretation(changes, original_pred, new_pred)
        return Counterfactual(
            original_prediction=original_pred,
            counterfactual_prediction=new_pred,
            changes=changes,
            feasibility_score=feasibility,
            business_interpretation=interpretation
        )
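
A comparable sketch for CounterfactualGenerator (again not part of the wheel): which features count as actionable, and the non-negativity constraint on support_tickets, are illustrative assumptions; the synthetic setup mirrors the one above.

    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier

    from customer_retention.analysis.interpretability.counterfactual import CounterfactualGenerator

    rng = np.random.default_rng(0)
    X = pd.DataFrame({
        "tenure_days": rng.integers(1, 1000, 500).astype(float),
        "monthly_spend": rng.gamma(2.0, 50.0, 500),
        "support_tickets": rng.poisson(2.0, 500).astype(float),
    })
    y = pd.Series((rng.random(500) < 0.7).astype(int))
    model = RandomForestClassifier(n_estimators=50, random_state=0).fit(X, y)

    generator = CounterfactualGenerator(
        model,
        reference_data=X,
        actionable_features=["monthly_spend", "support_tickets"],  # assumed actionable
        constraints={"support_tickets": {"min": 0.0}},             # cannot go negative
    )
    cf = generator.generate(X.iloc[0], target_class=0, max_iterations=200)
    print(f"{cf.original_prediction:.1%} -> {cf.counterfactual_prediction:.1%} "
          f"(feasibility {cf.feasibility_score:.2f})")
    print(cf.business_interpretation)
    for alt in generator.generate_diverse(X.iloc[0], n=2):  # spreads changes across features
        print(alt.business_interpretation)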

customer_retention/analysis/interpretability/individual_explainer.py
@@ -0,0 +1,141 @@
"""Individual customer explanation."""

from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List, Optional

import numpy as np
import shap
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

from customer_retention.core.compat import DataFrame, Series


class Confidence(Enum):
    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"


@dataclass
class RiskContribution:
    feature_name: str
    contribution: float
    current_value: float
    direction: str


@dataclass
class IndividualExplanation:
    customer_id: Optional[str]
    churn_probability: float
    base_value: float
    shap_values: np.ndarray
    top_positive_factors: List[RiskContribution]
    top_negative_factors: List[RiskContribution]
    confidence: Confidence
    feature_names: List[str] = field(default_factory=list)


class IndividualExplainer:
    def __init__(self, model: Any, background_data: DataFrame, max_samples: int = 100):
        self.model = model
        self.background_data = background_data.head(max_samples)
        self.feature_names = list(background_data.columns)
        self._explainer = self._create_explainer()

    def _create_explainer(self) -> shap.Explainer:
        model_type = type(self.model).__name__
        if model_type in ["RandomForestClassifier", "GradientBoostingClassifier"]:
            return shap.TreeExplainer(self.model)
        if model_type in ["LogisticRegression", "LinearRegression"]:
            return shap.LinearExplainer(self.model, self.background_data)
        return shap.KernelExplainer(self.model.predict_proba, self.background_data)

    def explain(self, instance: Series, customer_id: Optional[str] = None,
                top_n: int = 3) -> IndividualExplanation:
        instance_df = instance.to_frame().T
        shap_values = self._extract_shap_values(instance_df)
        churn_prob = float(self.model.predict_proba(instance_df)[0, 1])
        expected_value = self._get_expected_value()
        positive_factors = self._extract_factors(instance, shap_values, top_n, positive=True)
        negative_factors = self._extract_factors(instance, shap_values, top_n, positive=False)
        confidence = self._assess_confidence(churn_prob)
        return IndividualExplanation(
            customer_id=customer_id,
            churn_probability=churn_prob,
            base_value=float(expected_value),
            shap_values=shap_values,
            top_positive_factors=positive_factors,
            top_negative_factors=negative_factors,
            confidence=confidence,
            feature_names=self.feature_names
        )

    def _extract_shap_values(self, X: DataFrame) -> np.ndarray:
        shap_values = self._explainer.shap_values(X)
        if hasattr(shap_values, 'values'):
            shap_values = shap_values.values
        if isinstance(shap_values, list):
            shap_values = shap_values[1]
        if len(shap_values.shape) == 3:
            shap_values = shap_values[:, :, 1]
        return shap_values.flatten()

    def _get_expected_value(self) -> float:
        expected_value = self._explainer.expected_value
        if hasattr(expected_value, '__len__'):
            if len(expected_value) > 1:
                return float(expected_value[1])
            return float(expected_value[0])
        return float(expected_value)

    def _extract_factors(self, instance: Series, shap_values: np.ndarray,
                         top_n: int, positive: bool) -> List[RiskContribution]:
        if positive:
            indices = np.argsort(shap_values)[::-1]
            values = [(i, shap_values[i]) for i in indices if shap_values[i] > 0]
        else:
            indices = np.argsort(shap_values)
            values = [(i, shap_values[i]) for i in indices if shap_values[i] < 0]
        factors = []
        for idx, contrib in values[:top_n]:
            feature_name = self.feature_names[idx]
            factors.append(RiskContribution(
                feature_name=feature_name,
                contribution=float(contrib),
                current_value=float(instance[feature_name]),
                direction="increases risk" if contrib > 0 else "decreases risk"
            ))
        return factors

    def _assess_confidence(self, probability: float) -> Confidence:
        if probability < 0.2 or probability > 0.8:
            return Confidence.HIGH
        if 0.4 < probability < 0.6:
            return Confidence.LOW
        return Confidence.MEDIUM

    def find_similar_customers(self, instance: Series, X: DataFrame,
                               y: Series, k: int = 5) -> List[Dict]:
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        instance_scaled = scaler.transform(instance.to_frame().T)
        knn = NearestNeighbors(n_neighbors=k + 1, metric="euclidean")
        knn.fit(X_scaled)
        distances, indices = knn.kneighbors(instance_scaled)
        similar = []
        for dist, idx in zip(distances[0][1:], indices[0][1:]):
            similar.append({
                "index": int(idx),
                "distance": float(dist),
                "outcome": int(y.iloc[idx]),
                "features": X.iloc[idx].to_dict()
            })
        return similar

    def explain_batch(self, X: DataFrame,
                      customer_ids: Optional[List[str]] = None) -> List[IndividualExplanation]:
        customer_ids = customer_ids or [None] * len(X)
        return [self.explain(X.iloc[i], customer_ids[i]) for i in range(len(X))]
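
A sketch for IndividualExplainer along the same lines: per-customer SHAP attribution plus the standardized nearest-neighbour lookup. The customer_id and k values are arbitrary placeholders.

    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier

    from customer_retention.analysis.interpretability.individual_explainer import IndividualExplainer

    rng = np.random.default_rng(0)
    X = pd.DataFrame({
        "tenure_days": rng.integers(1, 1000, 500).astype(float),
        "monthly_spend": rng.gamma(2.0, 50.0, 500),
        "support_tickets": rng.poisson(2.0, 500).astype(float),
    })
    y = pd.Series((rng.random(500) < 0.7).astype(int))
    model = RandomForestClassifier(n_estimators=50, random_state=0).fit(X, y)

    explainer = IndividualExplainer(model, background_data=X)
    explanation = explainer.explain(X.iloc[0], customer_id="cust-0001")
    print(f"churn={explanation.churn_probability:.1%} "
          f"base={explanation.base_value:.3f} confidence={explanation.confidence.value}")
    for factor in explanation.top_positive_factors:
        print(f"  {factor.feature_name}={factor.current_value:.2f} {factor.direction} "
              f"({factor.contribution:+.3f})")
    # Nearest neighbours in standardized feature space, with their observed outcomes.
    for neighbour in explainer.find_similar_customers(X.iloc[0], X, y, k=3):
        print(neighbour["index"], round(neighbour["distance"], 2), neighbour["outcome"])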

customer_retention/analysis/interpretability/pdp_generator.py
@@ -0,0 +1,103 @@
"""Partial Dependence Plot generation."""

from dataclasses import dataclass
from typing import Any, List, Optional

import numpy as np
from sklearn.inspection import partial_dependence

from customer_retention.core.compat import DataFrame


@dataclass
class PDPResult:
    feature_name: str
    grid_values: np.ndarray
    pdp_values: np.ndarray
    feature_min: float
    feature_max: float
    average_prediction: float
    ice_values: Optional[List[np.ndarray]] = None


@dataclass
class InteractionResult:
    feature1_name: str
    feature2_name: str
    grid1_values: np.ndarray
    grid2_values: np.ndarray
    pdp_matrix: np.ndarray


class PDPGenerator:
    def __init__(self, model: Any):
        self.model = model

    def generate(self, X: DataFrame, feature: str, grid_resolution: int = 50,
                 include_ice: bool = False, ice_lines: int = 100) -> PDPResult:
        feature_idx = list(X.columns).index(feature)
        pd_result = partial_dependence(
            self.model, X, [feature_idx], kind="average", grid_resolution=grid_resolution
        )
        grid_values = pd_result["grid_values"][0]
        pdp_values = pd_result["average"][0]
        ice_values = None
        if include_ice:
            ice_values = self._calculate_ice(X, feature, grid_values, ice_lines)
        return PDPResult(
            feature_name=feature,
            grid_values=grid_values,
            pdp_values=pdp_values,
            feature_min=float(X[feature].min()),
            feature_max=float(X[feature].max()),
            average_prediction=float(np.mean(pdp_values)),
            ice_values=ice_values
        )

    def _calculate_ice(self, X: DataFrame, feature: str,
                       grid_values: np.ndarray, n_samples: int) -> List[np.ndarray]:
        sample_indices = np.random.choice(len(X), min(n_samples, len(X)), replace=False)
        ice_lines = []
        for idx in sample_indices:
            X_temp = X.iloc[[idx]].copy()
            predictions = []
            for val in grid_values:
                X_temp[feature] = val
                pred = self.model.predict_proba(X_temp)[0, 1]
                predictions.append(pred)
            ice_lines.append(np.array(predictions))
        return ice_lines

    def generate_multiple(self, X: DataFrame, features: List[str],
                          grid_resolution: int = 50) -> List[PDPResult]:
        return [self.generate(X, feature, grid_resolution) for feature in features]

    def generate_top_features(self, X: DataFrame, n_features: int = 5,
                              grid_resolution: int = 50) -> List[PDPResult]:
        importances = {}
        for feature in X.columns:
            X_shuffled = X.copy()
            X_shuffled[feature] = np.random.permutation(X_shuffled[feature].values)
            original_pred = self.model.predict_proba(X)[:, 1].mean()
            shuffled_pred = self.model.predict_proba(X_shuffled)[:, 1].mean()
            importances[feature] = abs(original_pred - shuffled_pred)
        top_features = sorted(importances.keys(), key=lambda f: importances[f], reverse=True)[:n_features]
        return self.generate_multiple(X, top_features, grid_resolution)

    def generate_interaction(self, X: DataFrame, feature1: str, feature2: str,
                             grid_resolution: int = 20) -> InteractionResult:
        feature1_idx = list(X.columns).index(feature1)
        feature2_idx = list(X.columns).index(feature2)
        pd_result = partial_dependence(
            self.model, X, [(feature1_idx, feature2_idx)], kind="average", grid_resolution=grid_resolution
        )
        grid1 = pd_result["grid_values"][0]
        grid2 = pd_result["grid_values"][1]
        pdp_matrix = pd_result["average"][0]
        return InteractionResult(
            feature1_name=feature1,
            feature2_name=feature2,
            grid1_values=grid1,
            grid2_values=grid2,
            pdp_matrix=pdp_matrix
        )
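
Last, a sketch for PDPGenerator. Note the module reads pd_result["grid_values"], which assumes a scikit-learn version where partial_dependence returns that key (1.3 or later); the feature pair chosen below is illustrative.

    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier

    from customer_retention.analysis.interpretability.pdp_generator import PDPGenerator

    rng = np.random.default_rng(0)
    X = pd.DataFrame({
        "tenure_days": rng.integers(1, 1000, 500).astype(float),
        "monthly_spend": rng.gamma(2.0, 50.0, 500),
        "support_tickets": rng.poisson(2.0, 500).astype(float),
    })
    y = pd.Series((rng.random(500) < 0.7).astype(int))
    model = RandomForestClassifier(n_estimators=50, random_state=0).fit(X, y)

    pdp = PDPGenerator(model)
    # One-way PDP with 20 ICE curves overlaid.
    result = pdp.generate(X, "monthly_spend", grid_resolution=25, include_ice=True, ice_lines=20)
    print(result.feature_name, f"avg={result.average_prediction:.3f}", len(result.ice_values))

    # Two-way interaction: pdp_matrix holds averaged predictions over the
    # cross product of the two feature grids.
    interaction = pdp.generate_interaction(X, "tenure_days", "monthly_spend", grid_resolution=10)
    print(interaction.pdp_matrix.shape)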