churnkit-0.75.0a1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0
customer_retention/analysis/interpretability/shap_explainer.py
@@ -0,0 +1,106 @@
"""SHAP-based model explainability."""

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional

import numpy as np
import shap
from sklearn.inspection import permutation_importance

from customer_retention.core.compat import DataFrame, Series


@dataclass
class FeatureImportance:
    feature_name: str
    importance: float
    mean_abs_shap: float
    business_description: Optional[str] = None


@dataclass
class GlobalExplanation:
    feature_importance: List[FeatureImportance]
    shap_values: np.ndarray
    expected_value: float
    feature_names: List[str] = field(default_factory=list)


class ShapExplainer:
    def __init__(self, model: Any, background_data: DataFrame,
                 feature_translations: Optional[Dict[str, str]] = None, max_samples: int = 100):
        self.model = model
        self.background_data = background_data.head(max_samples)
        self.feature_translations = feature_translations or {}
        self.explainer_type = self._determine_explainer_type()
        self._explainer = self._create_explainer()

    def _determine_explainer_type(self) -> str:
        model_type = type(self.model).__name__
        tree_models = ["RandomForestClassifier", "GradientBoostingClassifier",
                       "XGBClassifier", "LGBMClassifier", "DecisionTreeClassifier", "RandomForestRegressor"]
        linear_models = ["LogisticRegression", "LinearRegression", "Ridge", "Lasso"]
        if model_type in tree_models:
            return "tree"
        if model_type in linear_models:
            return "linear"
        return "kernel"

    def _create_explainer(self) -> shap.Explainer:
        if self.explainer_type == "tree":
            return shap.TreeExplainer(self.model)
        if self.explainer_type == "linear":
            return shap.LinearExplainer(self.model, self.background_data)
        return shap.KernelExplainer(self.model.predict_proba, self.background_data)

    def explain_global(self, X: DataFrame, top_n: Optional[int] = None) -> GlobalExplanation:
        shap_values = self._extract_shap_values(X)
        mean_abs_shap = np.abs(shap_values).mean(axis=0)
        sorted_indices = np.argsort(mean_abs_shap)[::-1]
        if top_n:
            sorted_indices = sorted_indices[:top_n]
        feature_importance = []
        for idx in sorted_indices:
            feature_name = X.columns[idx]
            importance_val = mean_abs_shap[idx]
            if hasattr(importance_val, '__len__') and len(importance_val) == 1:
                importance_val = importance_val[0]
            feature_importance.append(FeatureImportance(
                feature_name=feature_name,
                importance=float(importance_val),
                mean_abs_shap=float(importance_val),
                business_description=self.feature_translations.get(feature_name, feature_name)
            ))
        expected_value = self._get_expected_value()
        return GlobalExplanation(
            feature_importance=feature_importance,
            shap_values=shap_values,
            expected_value=float(expected_value),
            feature_names=list(X.columns)
        )

    def _extract_shap_values(self, X: DataFrame) -> np.ndarray:
        shap_values = self._explainer.shap_values(X)
        if hasattr(shap_values, 'values'):
            shap_values = shap_values.values
        if isinstance(shap_values, list):
            shap_values = shap_values[1]
        if len(shap_values.shape) == 3:
            shap_values = shap_values[:, :, 1]
        return shap_values

    def _get_expected_value(self) -> float:
        expected_value = self._explainer.expected_value
        if hasattr(expected_value, '__len__'):
            if len(expected_value) > 1:
                return float(expected_value[1])
            return float(expected_value[0])
        return float(expected_value)

    def calculate_permutation_importance(self, X: DataFrame, y: Series,
                                         n_repeats: int = 10) -> Dict[str, float]:
        result = permutation_importance(self.model, X, y, n_repeats=n_repeats, random_state=42)
        return {feature: float(importance) for feature, importance in zip(X.columns, result.importances_mean)}

    def get_shap_values(self, X: DataFrame) -> np.ndarray:
        return self._extract_shap_values(X)
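A minimal usage sketch for the explainer above (illustrative, not part of the packaged diff). The import path follows the file listing; `RandomForestClassifier` appears in the `tree_models` list, so `shap.TreeExplainer` is selected. The toy frame and translations are invented for the example.

    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier

    from customer_retention.analysis.interpretability.shap_explainer import ShapExplainer

    # Toy churn data; real usage passes the modeling feature matrix.
    X = pd.DataFrame({
        "tenure_months": [1, 24, 60, 3, 12, 48],
        "monthly_spend": [80.0, 45.0, 30.0, 95.0, 55.0, 20.0],
    })
    y = pd.Series([1, 0, 0, 1, 1, 0])

    model = RandomForestClassifier(n_estimators=50, random_state=42).fit(X, y)

    # Tree model -> TreeExplainer; unknown estimators fall back to KernelExplainer.
    explainer = ShapExplainer(
        model,
        background_data=X,
        feature_translations={"tenure_months": "Months as a customer"},
    )
    explanation = explainer.explain_global(X, top_n=2)
    for fi in explanation.feature_importance:
        print(fi.feature_name, round(fi.importance, 4), fi.business_description)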
customer_retention/analysis/jupyter_save_hook.py
@@ -0,0 +1,28 @@
"""Jupyter post-save hook that exports exploration notebooks to HTML.

Add to jupyter_notebook_config.py or jupyter_server_config.py::

    from customer_retention.analysis.jupyter_save_hook import post_save_export
    c.ContentsManager.post_save_hook = post_save_export
"""
import logging
from pathlib import Path

from customer_retention.analysis.notebook_html_exporter import export_notebook_html
from customer_retention.core.config.experiments import get_experiments_dir

logger = logging.getLogger(__name__)

EXPLORATION_DIR_NAME = "exploration_notebooks"


def post_save_export(model, os_path, contents_manager, **kwargs):
    if model.get("type") != "notebook":
        return
    path = Path(os_path)
    if EXPLORATION_DIR_NAME not in path.parts:
        return
    try:
        export_notebook_html(path, get_experiments_dir() / "docs")
    except Exception:
        logger.warning("HTML export failed for %s", path.name, exc_info=True)
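The hook's contract in miniature (illustrative, not from the diff): it acts only on contents-model type "notebook" saved under an `exploration_notebooks` directory, and export failures are logged rather than raised. Paths below are hypothetical.

    from customer_retention.analysis.jupyter_save_hook import post_save_export

    # Ignored: not a notebook save.
    post_save_export({"type": "file"}, "/tmp/notes.txt", contents_manager=None)

    # Ignored: notebook lives outside exploration_notebooks/.
    post_save_export({"type": "notebook"}, "/tmp/scratch.ipynb", contents_manager=None)

    # Exported (best-effort) to <experiments_dir>/docs/.
    post_save_export(
        {"type": "notebook"},
        "/work/exploration_notebooks/01_data_discovery.ipynb",
        contents_manager=None,
    )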
customer_retention/analysis/notebook_html_exporter.py
@@ -0,0 +1,136 @@
"""Export a notebook as self-contained HTML for documentation snapshots."""
import html
import subprocess
import sys
from pathlib import Path
from typing import Optional

TEMPLATE_DIR = Path(__file__).parents[2] / ".." / "scripts" / "templates" / "tutorial_html"


def _preprocess_plotly(notebook_path: Path, output_dir: Path) -> Path:
    try:
        import nbformat

        from customer_retention.analysis.plotly_preprocessor import PlotlyToImagePreprocessor
    except ImportError:
        return notebook_path

    preprocessor = PlotlyToImagePreprocessor()
    if not preprocessor.kaleido_available or not preprocessor.plotly_available:
        return notebook_path

    try:
        with open(notebook_path, "r", encoding="utf-8") as fh:
            nb = nbformat.read(fh, as_version=4)
        nb, _ = preprocessor.preprocess(nb, {})
        processed_dir = output_dir / "_processed"
        processed_dir.mkdir(parents=True, exist_ok=True)
        processed_path = processed_dir / notebook_path.name
        with open(processed_path, "w", encoding="utf-8") as fh:
            nbformat.write(nb, fh)
        return processed_path
    except Exception:
        return notebook_path


def _cleanup_processed(processed_path: Path, original_path: Path) -> None:
    """Remove the temporary processed notebook if it differs from the original."""
    if processed_path != original_path and processed_path.exists():
        try:
            processed_path.unlink()
            parent = processed_path.parent
            if parent.name == "_processed" and not any(parent.iterdir()):
                parent.rmdir()
        except OSError:
            pass


def export_notebook_html(notebook_path: Path, output_dir: Path) -> Optional[Path]:
    """Export *notebook_path* to a self-contained HTML file in *output_dir*.

    Returns the output path on success, ``None`` on failure (missing
    ``nbconvert``, file not found, conversion error). No exceptions are
    raised so callers can treat this as best-effort documentation.
    """
    if not notebook_path.exists():
        return None

    output_dir.mkdir(parents=True, exist_ok=True)
    output_name = notebook_path.stem + ".html"

    processed_path = _preprocess_plotly(notebook_path, output_dir)

    cmd = [
        sys.executable, "-m", "nbconvert",
        "--to", "html",
        "--output", output_name,
        "--output-dir", str(output_dir),
    ]

    if TEMPLATE_DIR.exists():
        cmd.extend(["--template", str(TEMPLATE_DIR)])

    cmd.append(str(processed_path))

    try:
        subprocess.run(cmd, capture_output=True, text=True, check=True)
    except (FileNotFoundError, subprocess.CalledProcessError):
        _cleanup_processed(processed_path, notebook_path)
        return None

    _cleanup_processed(processed_path, notebook_path)

    result = output_dir / output_name
    return result if result.exists() else None


def check_exported_html(
    docs_dir: Path, notebook_dir: Path
) -> tuple[list[Path], list[str]]:
    """Check which notebook HTML exports exist and which are missing.

    Returns ``(found_paths, missing_stems)`` where *found_paths* are existing
    HTML files that correspond to notebooks and *missing_stems* are notebook
    stems with no matching HTML.
    """
    expected_stems = sorted(p.stem for p in notebook_dir.glob("*.ipynb"))

    if not docs_dir.exists():
        return [], expected_stems

    html_by_stem = {p.stem: p for p in docs_dir.glob("*.html")}

    found: list[Path] = []
    missing: list[str] = []
    for stem in expected_stems:
        if stem in html_by_stem:
            found.append(html_by_stem[stem])
        else:
            missing.append(stem)

    return sorted(found), sorted(missing)


def display_html_documentation(docs_dir: Path) -> None:
    """Render every HTML file in *docs_dir* inline inside a Jupyter notebook.

    Each file is wrapped in an ``<iframe srcdoc="...">`` for CSS isolation.
    """
    from IPython.display import HTML, display

    if not docs_dir.exists():
        return

    html_files = sorted(docs_dir.glob("*.html"))
    for path in html_files:
        content = path.read_text(encoding="utf-8")
        escaped = html.escape(content)
        display(HTML(f"<h2>{html.escape(path.stem)}</h2>"))
        display(
            HTML(
                f'<iframe srcdoc="{escaped}" '
                f'style="width:100%;height:600px;border:1px solid #ccc;" '
                f"sandbox></iframe>"
            )
        )
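A short driver for the exporter and its audit helper (illustrative, not part of the package; directory names are invented):

    from pathlib import Path

    from customer_retention.analysis.notebook_html_exporter import (
        check_exported_html,
        export_notebook_html,
    )

    notebooks = Path("exploration_notebooks")
    docs = Path("experiments/docs")

    # Returns the HTML path on success, None on any failure (no exceptions).
    out = export_notebook_html(notebooks / "00_start_here.ipynb", docs)
    print(out)

    # Compare *.ipynb stems against *.html stems to find unexported notebooks.
    found, missing = check_exported_html(docs, notebooks)
    print(f"{len(found)} exported, missing: {missing}")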
customer_retention/analysis/notebook_progress.py
@@ -0,0 +1,60 @@
"""Track notebook execution progress and export previous notebook on start."""
import json
import threading
from pathlib import Path
from typing import Optional

from customer_retention.analysis.notebook_html_exporter import export_notebook_html
from customer_retention.core.config.experiments import get_notebook_experiments_dir


def track_and_export_previous(current_notebook: str) -> None:
    """Record the current notebook and export the previous one in the background.

    Called at the top of each notebook. Progress is written *before* the
    export thread starts so that the current notebook is already recorded
    even if export is slow or fails.

    Returns ``None`` — the export runs asynchronously.
    """
    experiments_dir = get_notebook_experiments_dir()
    experiments_dir.mkdir(parents=True, exist_ok=True)
    progress_file = experiments_dir / "notebook_progress.json"
    docs_dir = experiments_dir / "docs"

    previous = _read_last_notebook(progress_file)
    _write_current_notebook(progress_file, current_notebook)

    if previous:
        _export_in_background(previous, docs_dir)


def _read_last_notebook(progress_file: Path) -> Optional[str]:
    """Return the last-run notebook name, or ``None`` if missing/corrupt."""
    try:
        data = json.loads(progress_file.read_text(encoding="utf-8"))
        return data.get("last_notebook")
    except (FileNotFoundError, json.JSONDecodeError, KeyError):
        return None


def _export_notebook(notebook_name: str, docs_dir: Path) -> Optional[Path]:
    """Export *notebook_name* to HTML in *docs_dir*."""
    return export_notebook_html(Path(notebook_name), docs_dir)


def _export_in_background(notebook_name: str, docs_dir: Path) -> None:
    """Dispatch export as a daemon thread so the notebook cell does not block."""
    threading.Thread(
        target=_export_notebook,
        args=(notebook_name, docs_dir),
        daemon=True,
    ).start()


def _write_current_notebook(progress_file: Path, current_notebook: str) -> None:
    """Write the current notebook name to the progress file."""
    progress_file.write_text(
        json.dumps({"last_notebook": current_notebook}),
        encoding="utf-8",
    )
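Illustrative first-cell usage (not part of the diff); each notebook passes its own name, so the previously recorded notebook gets exported while the current one runs:

    from customer_retention.analysis.notebook_progress import track_and_export_previous

    track_and_export_previous("01_data_discovery.ipynb")
    # notebook_progress.json now holds {"last_notebook": "01_data_discovery.ipynb"};
    # the prior entry, if any, is exported to HTML on a daemon thread.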
customer_retention/analysis/plotly_preprocessor.py
@@ -0,0 +1,154 @@
"""NBConvert preprocessor that converts Plotly figures to static PNG images."""
import base64
import json
import re

from nbconvert.preprocessors import Preprocessor


class PlotlyToImagePreprocessor(Preprocessor):
    """Convert Plotly figures to static PNG images in notebook outputs.

    Requires ``plotly`` and ``kaleido`` to be installed. When either is
    missing the preprocessor is a no-op so callers can always apply it
    without guarding imports.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._kaleido_available = None
        self._plotly_available = None

    @property
    def kaleido_available(self):
        if self._kaleido_available is None:
            try:
                import kaleido  # noqa: F401
                self._kaleido_available = True
            except ImportError:
                self._kaleido_available = False
                self.log.warning("kaleido not available - Plotly figures will not be converted to images")
        return self._kaleido_available

    @property
    def plotly_available(self):
        if self._plotly_available is None:
            try:
                import plotly  # noqa: F401
                self._plotly_available = True
            except ImportError:
                self._plotly_available = False
                self.log.warning("plotly not available - cannot convert figures")
        return self._plotly_available

    def preprocess(self, nb, resources):
        if not self.kaleido_available or not self.plotly_available:
            return nb, resources
        return super().preprocess(nb, resources)

    def preprocess_cell(self, cell, resources, index):
        if cell.cell_type != "code":
            return cell, resources

        outputs = getattr(cell, "outputs", None)
        if not outputs:
            return cell, resources

        new_outputs = []
        converted_count = 0
        for output in outputs:
            converted = self._convert_plotly_output(output)
            new_outputs.append(converted)
            if converted is not output:
                converted_count += 1

        cell.outputs = new_outputs
        if converted_count > 0:
            self.log.info(f"Converted {converted_count} Plotly figures in cell {index}")
        return cell, resources

    def _convert_plotly_output(self, output):
        from nbformat.notebooknode import NotebookNode

        output_type = getattr(output, "output_type", None) or output.get("output_type")
        if output_type != "display_data":
            return output

        data = getattr(output, "data", None) or output.get("data", {})

        plotly_json = None
        if "application/vnd.plotly.v1+json" in data:
            plotly_json = data["application/vnd.plotly.v1+json"]
        elif "text/html" in data:
            html = data.get("text/html", "")
            if isinstance(html, list):
                html = "".join(html)
            plotly_json = self._extract_plotly_from_html(html)

        if plotly_json is None:
            return output

        try:
            png_bytes = self._plotly_to_png(plotly_json)
            if png_bytes:
                png_b64 = base64.b64encode(png_bytes).decode("utf-8")
                return NotebookNode({
                    "output_type": "display_data",
                    "data": {"image/png": png_b64},
                    "metadata": {}
                })
        except Exception as e:
            self.log.warning(f"Failed to convert Plotly figure: {e}")

        return output

    def _extract_plotly_from_html(self, html: str):
        patterns = [
            r'Plotly\.(?:newPlot|react)\s*\(\s*["\'][\w-]+["\']\s*,\s*(\[.*?\])\s*,\s*(\{.*?\})',
            r'var\s+data\s*=\s*(\[.*?\]);',
            r'"data"\s*:\s*(\[.*?\])',
        ]

        for pattern in patterns:
            match = re.search(pattern, html, re.DOTALL)
            if match:
                try:
                    data_str = match.group(1)
                    data = json.loads(data_str)
                    layout = {}
                    if len(match.groups()) > 1:
                        try:
                            layout = json.loads(match.group(2))
                        except (json.JSONDecodeError, IndexError):
                            pass
                    return {"data": data, "layout": layout}
                except json.JSONDecodeError:
                    continue
        return None

    def _plotly_to_png(self, fig_dict: dict, width: int = 1200, height: int = 600) -> bytes:
        import plotly.graph_objects as go
        import plotly.io as pio

        if isinstance(fig_dict, dict):
            fig = go.Figure(fig_dict)
        else:
            fig = fig_dict

        orig_layout = fig_dict.get("layout", {}) if isinstance(fig_dict, dict) else {}
        orig_width = orig_layout.get("width")
        orig_height = orig_layout.get("height")

        if orig_height:
            height = orig_height
        if orig_width:
            width = max(orig_width, 1200)

        fig.update_layout(
            width=width,
            height=height,
            margin=dict(l=50, r=50, t=50, b=50),
        )

        png_bytes = pio.to_image(fig, format="png", scale=1.0)
        return png_bytes
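Because `PlotlyToImagePreprocessor` is an ordinary nbconvert `Preprocessor`, it can be registered on any exporter, not just the package's own export path. A hypothetical wiring (notebook path invented):

    import nbformat
    from nbconvert import HTMLExporter

    from customer_retention.analysis.plotly_preprocessor import PlotlyToImagePreprocessor

    nb = nbformat.read("exploration_notebooks/00_start_here.ipynb", as_version=4)

    exporter = HTMLExporter()
    exporter.register_preprocessor(PlotlyToImagePreprocessor, enabled=True)

    # With plotly and kaleido installed, interactive figures become static PNGs;
    # otherwise the preprocessor is a no-op and the notebook exports unchanged.
    body, resources = exporter.from_notebook_node(nb)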
customer_retention/analysis/recommendations/__init__.py
@@ -0,0 +1,54 @@
from .base import (
    BaseRecommendation,
    CleaningRecommendation,
    DatetimeRecommendation,
    EncodingRecommendation,
    FeatureRecommendation,
    Platform,
    RecommendationResult,
    TransformRecommendation,
)
from .cleaning import (
    ConsistencyNormalizeRecommendation,
    DeduplicateRecommendation,
    ImputeRecommendation,
    OutlierCapRecommendation,
)
from .datetime import DaysSinceRecommendation, ExtractDayOfWeekRecommendation, ExtractMonthRecommendation
from .encoding import LabelEncodeRecommendation, OneHotEncodeRecommendation
from .pipeline import RecommendationPipeline
from .registry import RecommendationRegistry
from .selection import DropColumnRecommendation
from .transform import (
    LogTransformRecommendation,
    MinMaxScaleRecommendation,
    SqrtTransformRecommendation,
    StandardScaleRecommendation,
)

__all__ = [
    "Platform",
    "RecommendationResult",
    "BaseRecommendation",
    "CleaningRecommendation",
    "TransformRecommendation",
    "EncodingRecommendation",
    "DatetimeRecommendation",
    "FeatureRecommendation",
    "RecommendationPipeline",
    "RecommendationRegistry",
    "ImputeRecommendation",
    "OutlierCapRecommendation",
    "DeduplicateRecommendation",
    "ConsistencyNormalizeRecommendation",
    "StandardScaleRecommendation",
    "MinMaxScaleRecommendation",
    "LogTransformRecommendation",
    "SqrtTransformRecommendation",
    "OneHotEncodeRecommendation",
    "LabelEncodeRecommendation",
    "ExtractMonthRecommendation",
    "ExtractDayOfWeekRecommendation",
    "DaysSinceRecommendation",
    "DropColumnRecommendation",
]
customer_retention/analysis/recommendations/base.py
@@ -0,0 +1,158 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, Dict, List, Optional

import pandas as pd

from customer_retention.core.components.enums import Platform

if TYPE_CHECKING:
    from customer_retention.analysis.auto_explorer.findings import ColumnFinding
    from customer_retention.stages.features.feature_definitions import FeatureDefinition


@dataclass
class RecommendationResult:
    data: pd.DataFrame
    columns_affected: List[str]
    rows_before: int
    rows_after: int
    metadata: Dict[str, Any] = field(default_factory=dict)
    warnings: List[str] = field(default_factory=list)


class BaseRecommendation(ABC):
    def __init__(
        self, columns: List[str], rationale: str, evidence: List[str] = None,
        priority: str = "medium", source_finding: Optional["ColumnFinding"] = None
    ):
        self.columns = columns
        self.rationale = rationale
        self.evidence = evidence or []
        self.priority = priority
        self.source_finding = source_finding
        self._is_fitted = False
        self._fit_params: Dict[str, Any] = {}

    @property
    @abstractmethod
    def recommendation_type(self) -> str:
        pass

    @property
    @abstractmethod
    def category(self) -> str:
        pass

    def fit(self, df: pd.DataFrame) -> "BaseRecommendation":
        self._fit_impl(df)
        self._is_fitted = True
        return self

    @abstractmethod
    def _fit_impl(self, df: pd.DataFrame) -> None:
        pass

    def transform(
        self, df: pd.DataFrame, platform: Platform = Platform.LOCAL,
        mlflow_adapter: Optional[Any] = None
    ) -> RecommendationResult:
        if not self._is_fitted:
            raise ValueError(f"{self.__class__.__name__} not fitted. Call fit() first.")
        result = self._transform_databricks(df) if platform == Platform.DATABRICKS else self._transform_local(df)
        if mlflow_adapter:
            mlflow_adapter.log_params(self._fit_params)
            mlflow_adapter.log_metrics({k: v for k, v in result.metadata.items() if isinstance(v, (int, float))})
        return result

    @abstractmethod
    def _transform_local(self, df: pd.DataFrame) -> RecommendationResult:
        pass

    @abstractmethod
    def _transform_databricks(self, df: pd.DataFrame) -> RecommendationResult:
        pass

    def fit_transform(self, df: pd.DataFrame, platform: Platform = Platform.LOCAL) -> RecommendationResult:
        self.fit(df)
        return self.transform(df, platform)

    def generate_code(self, platform: Platform = Platform.LOCAL) -> str:
        return self._generate_databricks_code() if platform == Platform.DATABRICKS else self._generate_local_code()

    @abstractmethod
    def _generate_local_code(self) -> str:
        pass

    @abstractmethod
    def _generate_databricks_code(self) -> str:
        pass

    def to_dict(self) -> Dict[str, Any]:
        return {
            "type": self.recommendation_type,
            "category": self.category,
            "columns": self.columns,
            "rationale": self.rationale,
            "evidence": self.evidence,
            "priority": self.priority,
            "fit_params": self._fit_params,
            "is_fitted": self._is_fitted,
        }

    def describe(self) -> str:
        return f"{self.recommendation_type} on {self.columns}: {self.rationale}"

    def to_feature_definition(self) -> "FeatureDefinition":
        from customer_retention.stages.features.feature_definitions import (
            FeatureCategory,
            FeatureDefinition,
            LeakageRisk,
        )
        category_map = {
            "cleaning": FeatureCategory.AGGREGATE,
            "transform": FeatureCategory.AGGREGATE,
            "encoding": FeatureCategory.AGGREGATE,
            "datetime": FeatureCategory.TEMPORAL,
            "feature": FeatureCategory.AGGREGATE,
        }
        return FeatureDefinition(
            name=f"{self.columns[0]}_{self.recommendation_type}",
            description=self.rationale,
            category=category_map.get(self.category, FeatureCategory.AGGREGATE),
            derivation=self._generate_local_code(),
            source_columns=self.columns,
            data_type="float64",
            business_meaning=self.rationale,
            leakage_risk=LeakageRisk.LOW,
        )


class CleaningRecommendation(BaseRecommendation, ABC):
    @property
    def category(self) -> str:
        return "cleaning"


class TransformRecommendation(BaseRecommendation, ABC):
    @property
    def category(self) -> str:
        return "transform"


class EncodingRecommendation(BaseRecommendation, ABC):
    @property
    def category(self) -> str:
        return "encoding"


class DatetimeRecommendation(BaseRecommendation, ABC):
    @property
    def category(self) -> str:
        return "datetime"


class FeatureRecommendation(BaseRecommendation, ABC):
    @property
    def category(self) -> str:
        return "feature"
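`base.py` is a template-method base class: subclasses supply `recommendation_type`, `_fit_impl`, the two `_transform_*` variants, and the code generators, and inherit `fit`/`transform`/`fit_transform`, optional MLflow logging, and `to_feature_definition`. A hypothetical minimal subclass (illustrative only; it reuses the pandas path for Databricks, which a real implementation would not):

    import pandas as pd

    from customer_retention.analysis.recommendations import (
        CleaningRecommendation,
        RecommendationResult,
    )


    class FillZeroRecommendation(CleaningRecommendation):
        """Hypothetical: impute missing values with 0."""

        @property
        def recommendation_type(self) -> str:
            return "fill_zero"

        def _fit_impl(self, df: pd.DataFrame) -> None:
            # Record how many values each target column is missing at fit time.
            self._fit_params = {f"{c}_missing": int(df[c].isna().sum()) for c in self.columns}

        def _transform_local(self, df: pd.DataFrame) -> RecommendationResult:
            out = df.copy()
            out[self.columns] = out[self.columns].fillna(0)
            return RecommendationResult(out, self.columns, len(df), len(out),
                                        metadata=dict(self._fit_params))

        # Sketch shortcut: same pandas logic on both platforms.
        _transform_databricks = _transform_local

        def _generate_local_code(self) -> str:
            return f"df[{self.columns!r}] = df[{self.columns!r}].fillna(0)"

        _generate_databricks_code = _generate_local_code


    rec = FillZeroRecommendation(["monthly_spend"], rationale="Missing spend means no purchases")
    result = rec.fit_transform(pd.DataFrame({"monthly_spend": [10.0, None, 3.0]}))
    print(result.rows_after, result.metadata)  # -> 3 {'monthly_spend_missing': 1}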