PyPI - churnkit - Versions diffs - 0.75.0a1__py3-none-any.whl - Mend

churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (302) hide show

customer_retention/transforms/__init__.py ADDED Viewed

@@ -0,0 +1,47 @@
+"""Lean runtime module for DataFrame transformations.
+This module is safe to import from generated pipelines — it depends only on
+``core.compat``, ``numpy``, ``sklearn``, ``joblib``, and the lightweight
+``models`` dataclasses.  It does **not** import from ``analysis/``,
+``generators/`` (except ``models``), ``visualization/``, or ``stages/``.
+"""
+from .artifact_store import ArtifactStore
+from .executor import TransformExecutor
+from .ops import (
+    apply_cap_outlier,
+    apply_cap_then_log,
+    apply_derived_composite,
+    apply_derived_interaction,
+    apply_derived_ratio,
+    apply_drop_column,
+    apply_feature_select,
+    apply_impute_null,
+    apply_log_transform,
+    apply_one_hot_encode,
+    apply_segment_aware_cap,
+    apply_sqrt_transform,
+    apply_type_cast,
+    apply_winsorize,
+    apply_zero_inflation_handling,
+)
+# Names exported by ``from customer_retention.transforms import *``: the
+# executor, the artifact store, and every ``apply_*`` function imported from
+# ``.ops`` above.
+__all__ = [
+    "TransformExecutor",
+    "ArtifactStore",
+    "apply_impute_null",
+    "apply_cap_outlier",
+    "apply_type_cast",
+    "apply_drop_column",
+    "apply_winsorize",
+    "apply_segment_aware_cap",
+    "apply_log_transform",
+    "apply_sqrt_transform",
+    "apply_zero_inflation_handling",
+    "apply_cap_then_log",
+    "apply_one_hot_encode",
+    "apply_feature_select",
+    "apply_derived_ratio",
+    "apply_derived_interaction",
+    "apply_derived_composite",
+]

customer_retention/transforms/artifact_store.py ADDED Viewed

@@ -0,0 +1,50 @@
+"""Persistence for fitted transformers.
+Drop-in replacement for the ``FitArtifactRegistry`` class that was
+previously inlined in ``gold.py.j2`` and ``run_scoring.py.j2`` templates.
+"""
+from __future__ import annotations
+from pathlib import Path
+import joblib
+import yaml
+class ArtifactStore:
+    """Manages persistence of fitted transformers (scalers, encoders, etc.)."""
+    def __init__(self, artifacts_dir: str | Path):
+        self._dir = Path(artifacts_dir)
+        self._dir.mkdir(parents=True, exist_ok=True)
+        self._manifest: dict = {}
+    def register(self, artifact_type: str, target_column: str, transformer) -> None:
+        artifact_id = f"{target_column}_{artifact_type}"
+        path = self._dir / f"{artifact_id}.joblib"
+        joblib.dump(transformer, path)
+        self._manifest[artifact_id] = {
+            "type": artifact_type,
+            "column": target_column,
+            "path": str(path),
+        }
+    def save_manifest(self) -> None:
+        with open(self._dir / "manifest.yaml", "w") as f:
+            yaml.dump(self._manifest, f)
+    def load(self, artifact_id: str):
+        if artifact_id not in self._manifest:
+            raise KeyError(f"Artifact {artifact_id} not found")
+        return joblib.load(self._manifest[artifact_id]["path"])
+    def has(self, artifact_id: str) -> bool:
+        return artifact_id in self._manifest
+    @classmethod
+    def from_manifest(cls, manifest_path: str | Path) -> ArtifactStore:
+        store = cls(str(Path(manifest_path).parent))
+        with open(manifest_path) as f:
+            store._manifest = yaml.safe_load(f) or {}
+        return store

customer_retention/transforms/executor.py ADDED Viewed

@@ -0,0 +1,157 @@
+"""TransformExecutor — single dispatch table for all transformation types.
+Maps :class:`TransformationStep` types to the appropriate function in
+:mod:`ops` (stateless) or class in :mod:`fitted` (stateful).
+"""
+from __future__ import annotations
+from customer_retention.core.compat import DataFrame
+from customer_retention.generators.pipeline_generator.models import (
+    PipelineTransformationType,
+    TransformationStep,
+)
+from . import ops
+from .artifact_store import ArtifactStore
+from .fitted import FittedEncoder, FittedPowerTransform, FittedScaler
+class TransformExecutor:
+    """Applies :class:`TransformationStep` objects to DataFrames.
+    Works with both pandas and pyspark.pandas DataFrames via the compat
+    layer.  Used by generated pipelines and exploration code alike.
+    """
+    def apply(
+        self,
+        df: DataFrame,
+        step: TransformationStep,
+        *,
+        fit_mode: bool = False,
+        artifact_store: ArtifactStore | None = None,
+    ) -> DataFrame:
+        handler = self._DISPATCH.get(step.type)
+        if handler is None:
+            raise ValueError(f"Unknown transformation type: {step.type}")
+        return handler(self, df, step, fit_mode=fit_mode, artifact_store=artifact_store)
+    def apply_all(
+        self,
+        df: DataFrame,
+        steps: list[TransformationStep],
+        *,
+        fit_mode: bool = False,
+        artifact_store: ArtifactStore | None = None,
+    ) -> DataFrame:
+        for step in steps:
+            df = self.apply(df, step, fit_mode=fit_mode, artifact_store=artifact_store)
+        return df
+    def _apply_fitted(self, fitted, df, step, *, fit_mode=False, artifact_store=None):
+        if fit_mode:
+            return fitted.fit_transform(df, step.column, artifact_store)
+        return fitted.transform(df, step.column, artifact_store)
+    def _handle_impute_null(self, df, step, **kw):
+        return ops.apply_impute_null(df, step.column, value=step.parameters.get("value", 0))
+    def _handle_cap_outlier(self, df, step, **kw):
+        return ops.apply_cap_outlier(
+            df,
+            step.column,
+            lower=step.parameters.get("lower", 0),
+            upper=step.parameters.get("upper", 1_000_000),
+        )
+    def _handle_type_cast(self, df, step, **kw):
+        return ops.apply_type_cast(df, step.column, dtype=step.parameters.get("dtype", "float"))
+    def _handle_drop_column(self, df, step, **kw):
+        return ops.apply_drop_column(df, step.column)
+    def _handle_winsorize(self, df, step, **kw):
+        return ops.apply_winsorize(
+            df,
+            step.column,
+            lower_bound=step.parameters.get("lower_bound", 0),
+            upper_bound=step.parameters.get("upper_bound", 1_000_000),
+        )
+    def _handle_segment_aware_cap(self, df, step, **kw):
+        return ops.apply_segment_aware_cap(
+            df, step.column, n_segments=step.parameters.get("n_segments", 2)
+        )
+    def _handle_log_transform(self, df, step, **kw):
+        return ops.apply_log_transform(df, step.column)
+    def _handle_sqrt_transform(self, df, step, **kw):
+        return ops.apply_sqrt_transform(df, step.column)
+    def _handle_zero_inflation(self, df, step, **kw):
+        return ops.apply_zero_inflation_handling(df, step.column)
+    def _handle_cap_then_log(self, df, step, **kw):
+        return ops.apply_cap_then_log(df, step.column)
+    def _handle_encode(self, df, step, *, fit_mode=False, artifact_store=None, **kw):
+        method = step.parameters.get("method", "one_hot")
+        if method == "one_hot":
+            return ops.apply_one_hot_encode(df, step.column)
+        return self._apply_fitted(
+            FittedEncoder(), df, step, fit_mode=fit_mode, artifact_store=artifact_store
+        )
+    def _handle_scale(self, df, step, *, fit_mode=False, artifact_store=None, **kw):
+        method = step.parameters.get("method", "standard")
+        return self._apply_fitted(
+            FittedScaler(method), df, step, fit_mode=fit_mode, artifact_store=artifact_store
+        )
+    def _handle_yeo_johnson(self, df, step, *, fit_mode=False, artifact_store=None, **kw):
+        return self._apply_fitted(
+            FittedPowerTransform(), df, step, fit_mode=fit_mode, artifact_store=artifact_store
+        )
+    def _handle_feature_select(self, df, step, **kw):
+        return ops.apply_feature_select(df, step.column)
+    def _handle_derived_column(self, df, step, **kw):
+        method = step.parameters.get("method") or step.parameters.get("action")
+        if method == "ratio":
+            return ops.apply_derived_ratio(
+                df,
+                step.column,
+                numerator=step.parameters.get("numerator", ""),
+                denominator=step.parameters.get("denominator", ""),
+            )
+        if method == "interaction":
+            return ops.apply_derived_interaction(
+                df,
+                step.column,
+                col_a=step.parameters.get("col_a", ""),
+                col_b=step.parameters.get("col_b", ""),
+            )
+        if method == "composite":
+            return ops.apply_derived_composite(df, step.column, columns=step.parameters.get("columns", []))
+        return df
+    _DISPATCH = {
+        PipelineTransformationType.IMPUTE_NULL: _handle_impute_null,
+        PipelineTransformationType.CAP_OUTLIER: _handle_cap_outlier,
+        PipelineTransformationType.TYPE_CAST: _handle_type_cast,
+        PipelineTransformationType.DROP_COLUMN: _handle_drop_column,
+        PipelineTransformationType.WINSORIZE: _handle_winsorize,
+        PipelineTransformationType.SEGMENT_AWARE_CAP: _handle_segment_aware_cap,
+        PipelineTransformationType.LOG_TRANSFORM: _handle_log_transform,
+        PipelineTransformationType.SQRT_TRANSFORM: _handle_sqrt_transform,
+        PipelineTransformationType.YEO_JOHNSON: _handle_yeo_johnson,
+        PipelineTransformationType.ZERO_INFLATION_HANDLING: _handle_zero_inflation,
+        PipelineTransformationType.CAP_THEN_LOG: _handle_cap_then_log,
+        PipelineTransformationType.ENCODE: _handle_encode,
+        PipelineTransformationType.SCALE: _handle_scale,
+        PipelineTransformationType.FEATURE_SELECT: _handle_feature_select,
+        PipelineTransformationType.DERIVED_COLUMN: _handle_derived_column,
+    }

customer_retention/transforms/fitted.py ADDED Viewed

@@ -0,0 +1,92 @@
+"""Stateful fit/transform classes.
+These wrap sklearn transformers and integrate with :class:`ArtifactStore`
+for persistence.  All classes use ``core.compat.to_pandas`` to convert
+DataFrames to numpy-backed pandas before calling sklearn, ensuring the
+same source code works on both pandas and pyspark.pandas.
+"""
+from __future__ import annotations
+from sklearn.preprocessing import (
+    LabelEncoder,
+    MinMaxScaler,
+    PowerTransformer,
+    StandardScaler,
+)
+from customer_retention.core.compat import DataFrame, ensure_pandas_series, to_pandas
+class FittedScaler:
+    """Wraps :class:`StandardScaler` / :class:`MinMaxScaler`."""
+    def __init__(self, method: str = "standard"):
+        self.method = method
+        self._scaler = StandardScaler() if method == "standard" else MinMaxScaler()
+    def fit_transform(self, df: DataFrame, column: str, artifact_store) -> DataFrame:
+        if column not in df.columns:
+            return df
+        col_values = to_pandas(df[[column]])
+        fitted = self._scaler.fit_transform(col_values)
+        df[column] = fitted.ravel()
+        artifact_store.register("scaler", column, self._scaler)
+        return df
+    def transform(self, df: DataFrame, column: str, artifact_store) -> DataFrame:
+        if column not in df.columns:
+            return df
+        scaler = artifact_store.load(f"{column}_scaler")
+        col_values = to_pandas(df[[column]])
+        df[column] = scaler.transform(col_values).ravel()
+        return df
+class FittedEncoder:
+    """Wraps :class:`LabelEncoder` with unknown-class fallback."""
+    def __init__(self):
+        self._encoder = LabelEncoder()
+    def fit_transform(self, df: DataFrame, column: str, artifact_store) -> DataFrame:
+        if column not in df.columns:
+            return df
+        series = ensure_pandas_series(df[column].astype(str))
+        df[column] = self._encoder.fit_transform(series)
+        artifact_store.register("encoder", column, self._encoder)
+        return df
+    def transform(self, df: DataFrame, column: str, artifact_store) -> DataFrame:
+        if column not in df.columns:
+            return df
+        encoder = artifact_store.load(f"{column}_encoder")
+        series = ensure_pandas_series(df[column].astype(str))
+        df[column] = series.apply(
+            lambda x: encoder.transform([x])[0] if x in encoder.classes_ else 0
+        )
+        return df
+class FittedPowerTransform:
+    """Wraps Yeo-Johnson :class:`PowerTransformer`."""
+    def __init__(self):
+        self._pt = PowerTransformer(method="yeo-johnson")
+    def fit_transform(self, df: DataFrame, column: str, artifact_store) -> DataFrame:
+        if column not in df.columns:
+            return df
+        col_values = to_pandas(df[[column]].fillna(0))
+        fitted = self._pt.fit_transform(col_values)
+        df[column] = fitted.ravel()
+        artifact_store.register("power_transformer", column, self._pt)
+        return df
+    def transform(self, df: DataFrame, column: str, artifact_store) -> DataFrame:
+        if column not in df.columns:
+            return df
+        pt = artifact_store.load(f"{column}_power_transformer")
+        col_values = to_pandas(df[[column]].fillna(0))
+        df[column] = pt.transform(col_values).ravel()
+        return df

customer_retention/transforms/ops.py ADDED Viewed

@@ -0,0 +1,148 @@
+"""Stateless transformation functions.
+Each function takes (df, column, **params) and returns df.
+Uses core.compat for platform-agnostic DataFrame operations
+(works on both pandas and pyspark.pandas).
+"""
+from __future__ import annotations
+import functools
+from typing import Any
+import numpy as np
+from customer_retention.core.compat import DataFrame, pd
+def _requires_column(fn):
+    @functools.wraps(fn)
+    def wrapper(df: DataFrame, column: str, *args, **kwargs) -> DataFrame:
+        if column not in df.columns:
+            return df
+        return fn(df, column, *args, **kwargs)
+    return wrapper
+@_requires_column
+def apply_impute_null(df: DataFrame, column: str, *, value: Any = 0) -> DataFrame:
+    if value == "median":
+        df[column] = df[column].fillna(df[column].median())
+    else:
+        df[column] = df[column].fillna(value)
+    return df
+@_requires_column
+def apply_cap_outlier(
+    df: DataFrame, column: str, *, lower: float = 0, upper: float = 1_000_000
+) -> DataFrame:
+    df[column] = df[column].clip(lower=lower, upper=upper)
+    return df
+def apply_type_cast(df: DataFrame, column: str, *, dtype: str = "float") -> DataFrame:
+    if column not in df.columns:
+        return df
+    df[column] = df[column].astype(dtype)
+    return df
+def apply_drop_column(df: DataFrame, column: str) -> DataFrame:
+    return df.drop(columns=[column], errors="ignore")
+@_requires_column
+def apply_winsorize(
+    df: DataFrame, column: str, *, lower_bound: float = 0, upper_bound: float = 1_000_000
+) -> DataFrame:
+    df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)
+    return df
+def apply_segment_aware_cap(df: DataFrame, column: str, *, n_segments: int = 2) -> DataFrame:
+    if column not in df.columns:
+        return df
+    from sklearn.cluster import KMeans
+    valid = df[column].dropna()
+    if len(valid) < n_segments:
+        return df
+    labels = KMeans(n_clusters=n_segments, random_state=42, n_init=10).fit_predict(
+        valid.values.reshape(-1, 1)
+    )
+    df = df.copy()
+    for seg in range(n_segments):
+        mask = pd.Series(False, index=df.index)
+        mask.iloc[valid.index[labels == seg]] = True
+        seg_vals = df.loc[mask, column]
+        q1, q3 = seg_vals.quantile(0.25), seg_vals.quantile(0.75)
+        iqr = q3 - q1
+        lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
+        df.loc[mask, column] = df.loc[mask, column].clip(lower=lower, upper=upper)
+    return df
+@_requires_column
+def apply_log_transform(df: DataFrame, column: str) -> DataFrame:
+    df[column] = np.log1p(df[column].clip(lower=0))
+    return df
+@_requires_column
+def apply_sqrt_transform(df: DataFrame, column: str) -> DataFrame:
+    df[column] = np.sqrt(df[column].clip(lower=0))
+    return df
+@_requires_column
+def apply_zero_inflation_handling(df: DataFrame, column: str) -> DataFrame:
+    df[f"{column}_is_zero"] = (df[column] == 0).astype(int)
+    nonzero = df[column] != 0
+    df.loc[nonzero, column] = np.log1p(df.loc[nonzero, column].clip(lower=0))
+    return df
+@_requires_column
+def apply_cap_then_log(df: DataFrame, column: str) -> DataFrame:
+    q99 = df[column].quantile(0.99)
+    df[column] = np.log1p(df[column].clip(upper=q99).clip(lower=0))
+    return df
+@_requires_column
+def apply_one_hot_encode(df: DataFrame, column: str) -> DataFrame:
+    return pd.get_dummies(df, columns=[column], prefix=column)
+def apply_feature_select(df: DataFrame, column: str) -> DataFrame:
+    return df.drop(columns=[column], errors="ignore")
+def apply_derived_ratio(
+    df: DataFrame, column: str, *, numerator: str, denominator: str
+) -> DataFrame:
+    if numerator not in df.columns or denominator not in df.columns:
+        return df
+    df[column] = df[numerator] / df[denominator].replace(0, float("nan"))
+    return df
+def apply_derived_interaction(
+    df: DataFrame, column: str, *, col_a: str, col_b: str
+) -> DataFrame:
+    if col_a not in df.columns or col_b not in df.columns:
+        return df
+    df[column] = df[col_a] * df[col_b]
+    return df
+def apply_derived_composite(
+    df: DataFrame, column: str, *, columns: list[str]
+) -> DataFrame:
+    valid = [c for c in columns if c in df.columns]
+    if not valid:
+        return df
+    df[column] = df[valid].mean(axis=1)
+    return df