churnkit-0.75.0a1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0
--- /dev/null
+++ customer_retention/stages/temporal/__init__.py
@@ -0,0 +1,166 @@
+"""Temporal framework for leakage-safe ML pipelines.
+
+This module provides infrastructure for preventing data leakage in ML training
+by enforcing point-in-time (PIT) correctness throughout the data preparation
+and training pipeline.
+
+Core Components:
+- TimestampManager: Ensures proper timestamp columns exist
+- TimestampDiscoveryEngine: Auto-detects timestamps in datasets
+- ScenarioDetector: Determines appropriate timestamp strategy
+- UnifiedDataPreparer: Single entry point for data preparation
+- SnapshotManager: Versioned training snapshots with integrity hashing
+- DataAccessGuard: Context-based data access control
+
+Quick Start:
+    >>> from customer_retention.stages.temporal import (
+    ...     ScenarioDetector, UnifiedDataPreparer
+    ... )
+    >>> from datetime import datetime
+    >>>
+    >>> # Detect scenario and prepare data
+    >>> detector = ScenarioDetector()
+    >>> scenario, config, _ = detector.detect(df, target_column="churn")
+    >>>
+    >>> preparer = UnifiedDataPreparer(output_path="./output", timestamp_config=config)
+    >>> prepared_df = preparer.prepare_from_raw(df, "churn", "customer_id")
+    >>>
+    >>> # Create versioned training snapshot
+    >>> snapshot_df, meta = preparer.create_training_snapshot(
+    ...     prepared_df,
+    ...     cutoff_date=datetime(2024, 6, 1)
+    ... )
+    >>> print(f"Snapshot: {meta['snapshot_id']}, hash: {meta['data_hash']}")
+
+Timestamp Scenarios:
+- production: Dataset has explicit feature and label timestamps
+- partial: Only feature timestamp found, label derived from window
+- derived: Timestamps can be computed from other columns
+- synthetic: No temporal information, must use synthetic timestamps
+"""
+
+# Import canonical temporal metadata columns from central location
+from customer_retention.core.utils.leakage import TEMPORAL_METADATA_COLUMNS
+
+from .access_guard import AccessContext, DataAccessGuard
+from .cutoff_analyzer import CutoffAnalysis, CutoffAnalyzer, SplitResult
+from .data_preparer import PreparedData, UnifiedDataPreparer
+from .point_in_time_join import PointInTimeJoiner
+from .point_in_time_registry import ConsistencyReport, DatasetSnapshot, PointInTimeRegistry
+from .scenario_detector import ScenarioDetector
+from .snapshot_manager import SnapshotManager, SnapshotMetadata
+from .synthetic_coordinator import SyntheticCoordinationParams, SyntheticTimestampCoordinator
+from .timestamp_discovery import (
+    DatetimeOrderAnalyzer,
+    TimestampCandidate,
+    TimestampDiscoveryEngine,
+    TimestampDiscoveryResult,
+    TimestampRole,
+)
+from .timestamp_manager import TimestampConfig, TimestampManager, TimestampStrategy
+
+# Backwards compatible alias - prefer TEMPORAL_METADATA_COLUMNS
+TEMPORAL_METADATA_COLS = TEMPORAL_METADATA_COLUMNS
+
+
+def _restore_snapshot_columns(df, findings):
+    """Reverse the entity_id/target renames applied by UnifiedDataPreparer."""
+    renames = {}
+    ts_meta = getattr(findings, "time_series_metadata", None)
+    entity_col = ts_meta.entity_column if ts_meta else None
+    target_col = getattr(findings, "target_column", None)
+
+    if entity_col and "entity_id" in df.columns and entity_col not in df.columns:
+        renames["entity_id"] = entity_col
+    if target_col and "target" in df.columns and target_col not in df.columns:
+        renames["target"] = target_col
+
+    return df.rename(columns=renames) if renames else df
+
+
+def load_data_with_snapshot_preference(findings, output_dir: str = "../explorations"):
+    """Load data preferring snapshots over raw source files.
+
+    This function implements the recommended data loading pattern for exploration
+    notebooks. It checks if a training snapshot exists and loads from it if available,
+    otherwise falls back to the original source file.
+
+    Parameters
+    ----------
+    findings : ExplorationFindings
+        The findings object loaded from a previous exploration
+    output_dir : str
+        Directory containing explorations and snapshots
+
+    Returns
+    -------
+    tuple[pd.DataFrame, str]
+        DataFrame and a string indicating the source ("snapshot" or "source")
+
+    Example
+    -------
+    >>> from customer_retention.stages.temporal import load_data_with_snapshot_preference
+    >>> findings = ExplorationFindings.load(FINDINGS_PATH)
+    >>> df, source = load_data_with_snapshot_preference(findings)
+    >>> print(f"Loaded from: {source}")
+    """
+    from pathlib import Path
+
+    import pandas as pd
+
+    # Check if snapshot exists in findings
+    snapshot_path = getattr(findings, 'snapshot_path', None)
+
+    if snapshot_path and Path(snapshot_path).exists():
+        df = pd.read_parquet(snapshot_path)
+        return _restore_snapshot_columns(df, findings), "snapshot"
+
+    # Check for snapshots in output directory
+    output_path = Path(output_dir) / "snapshots"
+    if output_path.exists():
+        snapshot_manager = SnapshotManager(Path(output_dir))
+        snapshots = snapshot_manager.list_snapshots()
+        if snapshots:
+            latest = snapshot_manager.get_latest_snapshot()
+            if latest:
+                df, _ = snapshot_manager.load_snapshot(latest)
+                return _restore_snapshot_columns(df, findings), f"snapshot:{latest}"
+
+    # Fall back to source file
+    source_path = findings.source_path
+    if source_path.endswith('.csv'):
+        df = pd.read_csv(source_path)
+    else:
+        df = pd.read_parquet(source_path)
+
+    return df, "source"
+
+
+__all__ = [
+    "DatetimeOrderAnalyzer",
+    "TimestampStrategy",
+    "TimestampConfig",
+    "TimestampManager",
+    "TimestampRole",
+    "TimestampCandidate",
+    "TimestampDiscoveryResult",
+    "TimestampDiscoveryEngine",
+    "SnapshotMetadata",
+    "SnapshotManager",
+    "PointInTimeJoiner",
+    "PreparedData",
+    "UnifiedDataPreparer",
+    "AccessContext",
+    "DataAccessGuard",
+    "ScenarioDetector",
+    "DatasetSnapshot",
+    "ConsistencyReport",
+    "PointInTimeRegistry",
+    "CutoffAnalysis",
+    "CutoffAnalyzer",
+    "SplitResult",
+    "SyntheticCoordinationParams",
+    "SyntheticTimestampCoordinator",
+    "load_data_with_snapshot_preference",
+    "TEMPORAL_METADATA_COLS",
+]
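For orientation, here is a minimal sketch of the snapshot-preference loader added above. The `SimpleNamespace` findings object is only a stand-in for the package's `ExplorationFindings`, and the file names are invented for illustration:

```python
from types import SimpleNamespace

import pandas as pd

from customer_retention.stages.temporal import load_data_with_snapshot_preference

# A prior exploration is assumed to have written this snapshot (names invented).
pd.DataFrame({"entity_id": [1, 2], "target": [0, 1]}).to_parquet("snap_v1.parquet")

findings = SimpleNamespace(
    snapshot_path="snap_v1.parquet",   # preferred when the file exists
    source_path="customers.csv",       # raw fallback, not touched in this run
    target_column="churn",
    time_series_metadata=SimpleNamespace(entity_column="customer_id"),
)

df, source = load_data_with_snapshot_preference(findings)
print(source)            # "snapshot"
print(list(df.columns))  # ["customer_id", "churn"] after the rename reversal
```

Because `_restore_snapshot_columns` only renames when the original column is absent, the call is safe to repeat on already-restored frames.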
--- /dev/null
+++ customer_retention/stages/temporal/access_guard.py
@@ -0,0 +1,180 @@
+"""Data access control based on execution context.
+
+This module provides path-based access control to prevent accidental
+data leakage by restricting which data paths are accessible in different
+execution contexts (exploration, training, inference, etc.).
+
+Key concepts:
+- AccessContext: The current execution mode
+- DataAccessGuard: Validates path access against context rules
+- require_context: Decorator to enforce context requirements
+
+Example:
+    >>> from customer_retention.stages.temporal import AccessContext, DataAccessGuard
+    >>> # Set context for the session
+    >>> with DataAccessGuard(AccessContext.TRAINING):
+    ...     # Can access snapshots/ and gold/
+    ...     df = pd.read_parquet("output/snapshots/training_v1.parquet")
+    ...     # This would raise PermissionError:
+    ...     # df = pd.read_parquet("output/raw/customers.csv")
+"""
+
+import os
+from enum import Enum
+from pathlib import Path
+from typing import Optional
+
+
+class AccessContext(Enum):
+    """Execution context for data access control.
+
+    Attributes:
+        EXPLORATION: Interactive data exploration (can access snapshots)
+        TRAINING: Model training (can access snapshots and gold)
+        INFERENCE: Production inference (can access gold and feature_store)
+        BACKFILL: Historical data processing (can access raw through gold)
+        ADMIN: Administrative access (unrestricted)
+    """
+    EXPLORATION = "exploration"
+    TRAINING = "training"
+    INFERENCE = "inference"
+    BACKFILL = "backfill"
+    ADMIN = "admin"
+
+
+class DataAccessGuard:
+    """Guards data access based on the current execution context.
+
+    The DataAccessGuard prevents accidental data leakage by restricting
+    which paths can be accessed based on the execution context. For example,
+    during training, raw data paths are blocked to ensure only properly
+    prepared snapshots are used.
+
+    Can be used as a context manager to temporarily set the access context:
+
+    >>> with DataAccessGuard(AccessContext.TRAINING):
+    ...     # Only training-appropriate paths accessible here
+    ...     pass
+
+    Or used directly for path validation:
+
+    >>> guard = DataAccessGuard(AccessContext.EXPLORATION)
+    >>> guard.validate_access("output/snapshots/v1.parquet")  # OK
+    >>> guard.validate_access("output/raw/data.csv")  # Raises PermissionError
+    """
+
+    ALLOWED_PATHS = {
+        AccessContext.EXPLORATION: ["snapshots/"],
+        AccessContext.TRAINING: ["snapshots/", "gold/"],
+        AccessContext.INFERENCE: ["gold/", "feature_store/"],
+        AccessContext.BACKFILL: ["raw/", "bronze/", "silver/", "gold/"],
+        AccessContext.ADMIN: ["*"],
+    }
+
+    BLOCKED_PATHS = {
+        AccessContext.EXPLORATION: ["raw/", "bronze/", "silver/"],
+        AccessContext.TRAINING: ["raw/", "bronze/"],
+        AccessContext.INFERENCE: ["snapshots/", "raw/", "bronze/", "silver/"],
+        AccessContext.BACKFILL: ["snapshots/"],
+        AccessContext.ADMIN: [],
+    }
+
+    def __init__(self, context: AccessContext):
+        self.context = context
+
+    def validate_access(self, path: str) -> bool:
+        path_str = str(path)
+        for blocked in self.BLOCKED_PATHS[self.context]:
+            if blocked in path_str:
+                raise PermissionError(
+                    f"Access to '{path}' blocked in {self.context.value} context. "
+                    f"Blocked patterns: {self.BLOCKED_PATHS[self.context]}"
+                )
+        return True
+
+    def is_allowed(self, path: str) -> bool:
+        if "*" in self.ALLOWED_PATHS[self.context]:
+            return True
+        path_str = str(path)
+        return any(allowed in path_str for allowed in self.ALLOWED_PATHS[self.context])
+
+    def guard_read(self, path: str) -> Path:
+        self.validate_access(path)
+        return Path(path)
+
+    @staticmethod
+    def set_context(context: AccessContext) -> None:
+        os.environ["DATA_ACCESS_CONTEXT"] = context.value
+
+    @staticmethod
+    def get_current_context() -> AccessContext:
+        ctx = os.environ.get("DATA_ACCESS_CONTEXT", "exploration")
+        return AccessContext(ctx)
+
+    @classmethod
+    def from_environment(cls) -> "DataAccessGuard":
+        return cls(cls.get_current_context())
+
+    def __enter__(self) -> "DataAccessGuard":
+        self._previous_context = os.environ.get("DATA_ACCESS_CONTEXT")
+        os.environ["DATA_ACCESS_CONTEXT"] = self.context.value
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        if self._previous_context:
+            os.environ["DATA_ACCESS_CONTEXT"] = self._previous_context
+        elif "DATA_ACCESS_CONTEXT" in os.environ:
+            del os.environ["DATA_ACCESS_CONTEXT"]
+
+
+def require_context(*allowed_contexts: AccessContext):
+    """Decorator to enforce execution context requirements on functions.
+
+    Use this decorator to restrict a function to specific execution contexts.
+    If called from a disallowed context, raises PermissionError.
+
+    Args:
+        *allowed_contexts: One or more AccessContext values that are permitted
+
+    Example:
+        >>> @require_context(AccessContext.TRAINING, AccessContext.INFERENCE)
+        ... def predict(features):
+        ...     return model.predict(features)
+        >>>
+        >>> # Only works in TRAINING or INFERENCE context
+        >>> DataAccessGuard.set_context(AccessContext.TRAINING)
+        >>> predict(X)  # OK
+        >>> DataAccessGuard.set_context(AccessContext.EXPLORATION)
+        >>> predict(X)  # Raises PermissionError
+    """
+    def decorator(func):
+        def wrapper(*args, **kwargs):
+            current = DataAccessGuard.get_current_context()
+            if current not in allowed_contexts:
+                raise PermissionError(
+                    f"Function requires context {[c.value for c in allowed_contexts]}, "
+                    f"but current context is {current.value}"
+                )
+            return func(*args, **kwargs)
+        return wrapper
+    return decorator
+
+
+def guarded_read(path: str, context: Optional[AccessContext] = None) -> Path:
+    """Validate path access and return a Path object.
+
+    Convenience function that validates a path against access rules
+    and returns a Path object if access is allowed.
+
+    Args:
+        path: Path to validate
+        context: Optional context override (uses environment if None)
+
+    Returns:
+        Path object for the validated path
+
+    Raises:
+        PermissionError: If access is not allowed in the current context
+    """
+    guard = DataAccessGuard(context) if context else DataAccessGuard.from_environment()
+    return guard.guard_read(path)
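A minimal sketch of both enforcement styles in this module, assuming it is importable as packaged; only path strings are validated here, no files are read:

```python
from customer_retention.stages.temporal import AccessContext, DataAccessGuard
from customer_retention.stages.temporal.access_guard import require_context

# Context-manager form: temporarily sets DATA_ACCESS_CONTEXT for the block.
with DataAccessGuard(AccessContext.TRAINING) as guard:
    guard.validate_access("output/snapshots/training_v1.parquet")  # OK: not blocked
    try:
        guard.validate_access("output/raw/customers.csv")  # "raw/" blocked in training
    except PermissionError as exc:
        print(exc)

# Decorator form: the context is read from the environment at call time.
@require_context(AccessContext.TRAINING, AccessContext.INFERENCE)
def score_batch():
    return "scored"

DataAccessGuard.set_context(AccessContext.TRAINING)
print(score_batch())  # "scored"

DataAccessGuard.set_context(AccessContext.EXPLORATION)
try:
    score_batch()
except PermissionError as exc:
    print(exc)  # exploration is not among the allowed contexts
```

Note the asymmetry in the tables: `validate_access` consults only `BLOCKED_PATHS`, so a path matching neither table (e.g. `gold/` in the exploration context) passes validation even though `is_allowed` returns False for it.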
--- /dev/null
+++ customer_retention/stages/temporal/cutoff_analyzer.py
@@ -0,0 +1,235 @@
+import warnings
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Optional
+
+import numpy as np
+import pandas as pd
+
+from customer_retention.stages.temporal.timestamp_discovery import DatetimeOrderAnalyzer
+
+
+@dataclass
+class SplitResult:
+    train_df: pd.DataFrame
+    score_df: pd.DataFrame
+    unresolvable_df: pd.DataFrame
+    cutoff_date: datetime
+    timestamp_source: str
+    train_count: int
+    score_count: int
+    unresolvable_count: int
+    original_count: int
+
+
+@dataclass
+class CutoffAnalysis:
+    timestamp_column: str
+    total_rows: int
+    bins: list[datetime]
+    bin_counts: list[int]
+    train_percentages: list[float]
+    score_percentages: list[float]
+    date_range: tuple[datetime, datetime]
+    source_rows: int = 0
+    covered_rows: int = 0
+    resolved_timestamp_series: Optional[pd.Series] = field(default=None, repr=False)
+    _source_df: Optional[pd.DataFrame] = field(default=None, repr=False)
+
+    @property
+    def coverage_ratio(self) -> float:
+        return self.covered_rows / self.source_rows if self.source_rows > 0 else 0.0
+
+    def suggest_cutoff(self, train_ratio: float = 0.9) -> datetime:
+        target_pct = train_ratio * 100
+        for i, train_pct in enumerate(self.train_percentages):
+            if train_pct >= target_pct:
+                return self.bins[i]
+        return self.bins[-1]
+
+    def get_train_percentage(self, cutoff_date: datetime) -> float:
+        for i, bin_date in enumerate(self.bins):
+            if bin_date >= cutoff_date:
+                return self.train_percentages[max(0, i - 1)]
+        return self.train_percentages[-1]
+
+    def get_split_at_date(self, cutoff_date: datetime) -> dict:
+        train_pct = self.get_train_percentage(cutoff_date)
+        train_count = int(self.total_rows * train_pct / 100)
+        return {
+            "train_count": train_count,
+            "score_count": self.total_rows - train_count,
+            "train_pct": train_pct,
+            "score_pct": 100 - train_pct,
+        }
+
+    def split_at_cutoff(self, cutoff_date: Optional[datetime] = None) -> "SplitResult":
+        if self.resolved_timestamp_series is None:
+            raise ValueError(
+                "No resolved timestamp series available. "
+                "Re-run analyze() to populate resolved_timestamp_series."
+            )
+        if self._source_df is None:
+            raise ValueError(
+                "No source DataFrame available. "
+                "Re-run analyze() to populate the source reference."
+            )
+
+        cutoff = cutoff_date or self.suggest_cutoff()
+        ts = self.resolved_timestamp_series
+        df = self._source_df
+
+        not_null_mask = ts.notna()
+        train_mask = not_null_mask & (ts <= cutoff)
+        score_mask = not_null_mask & (ts > cutoff)
+        unresolvable_mask = ~not_null_mask
+
+        train_df = df.loc[train_mask]
+        score_df = df.loc[score_mask]
+        unresolvable_df = df.loc[unresolvable_mask]
+
+        assert len(train_df) + len(score_df) + len(unresolvable_df) == len(df), (
+            f"Data loss detected: train({len(train_df)}) + score({len(score_df)}) + "
+            f"unresolvable({len(unresolvable_df)}) != original({len(df)})"
+        )
+
+        return SplitResult(
+            train_df=train_df,
+            score_df=score_df,
+            unresolvable_df=unresolvable_df,
+            cutoff_date=cutoff,
+            timestamp_source=self.timestamp_column,
+            train_count=len(train_df),
+            score_count=len(score_df),
+            unresolvable_count=len(unresolvable_df),
+            original_count=len(df),
+        )
+
+    def to_dataframe(self) -> pd.DataFrame:
+        cumulative = np.cumsum(self.bin_counts)
+        return pd.DataFrame({
+            "date": self.bins,
+            "bin_count": self.bin_counts,
+            "cumulative_count": cumulative,
+            "train_pct": self.train_percentages,
+            "score_pct": self.score_percentages,
+        })
+
+    def get_percentage_milestones(self, step: int = 5) -> list[dict]:
+        milestones = []
+        target_pcts = list(range(step, 100, step))
+        for target in target_pcts:
+            for i, train_pct in enumerate(self.train_percentages):
+                if train_pct >= target:
+                    milestones.append({
+                        "date": self.bins[i],
+                        "train_pct": round(train_pct, 1),
+                        "score_pct": round(100 - train_pct, 1),
+                    })
+                    break
+        return milestones
+
+
+class CutoffAnalyzer:
+    TIMESTAMP_PATTERNS = ["feature_timestamp", "label_timestamp", "timestamp", "date", "datetime"]
+
+    def __init__(self):
+        self._datetime_analyzer = DatetimeOrderAnalyzer()
+
+    def analyze(
+        self,
+        df: pd.DataFrame,
+        timestamp_column: Optional[str] = None,
+        n_bins: int = 20,
+        timestamp_series: Optional[pd.Series] = None,
+    ) -> CutoffAnalysis:
+        source_rows = len(df)
+        ts_col, full_series = self._resolve_timestamp_series_full(df, timestamp_column, timestamp_series)
+        ts_series = full_series.dropna()
+
+        if len(ts_series) == 0:
+            return self._empty_analysis(ts_col, source_rows=source_rows, df=df, full_series=full_series)
+
+        covered_rows = len(ts_series)
+        coverage_ratio = covered_rows / source_rows if source_rows > 0 else 0.0
+        if coverage_ratio < 0.5:
+            warnings.warn(
+                f"Low timestamp coverage: {covered_rows}/{source_rows} rows "
+                f"({coverage_ratio:.1%}). Results may not represent the full dataset.",
+                stacklevel=2,
+            )
+
+        bins, counts = self._compute_bins(ts_series, n_bins)
+        train_pcts, score_pcts = self._compute_percentages(counts)
+
+        return CutoffAnalysis(
+            timestamp_column=ts_col,
+            total_rows=len(ts_series),
+            bins=bins,
+            bin_counts=counts,
+            train_percentages=train_pcts,
+            score_percentages=score_pcts,
+            date_range=(ts_series.min(), ts_series.max()),
+            source_rows=source_rows,
+            covered_rows=covered_rows,
+            resolved_timestamp_series=full_series,
+            _source_df=df,
+        )
+
+    def _resolve_timestamp_series_full(
+        self,
+        df: pd.DataFrame,
+        timestamp_column: Optional[str],
+        timestamp_series: Optional[pd.Series],
+    ) -> tuple[str, pd.Series]:
+        if timestamp_series is not None:
+            ts_col = timestamp_series.name or "timestamp_series"
+            series = self._ensure_datetime_series_full(timestamp_series)
+            return ts_col, series
+        ts_col = timestamp_column or self._detect_timestamp_column(df)
+        series = self._ensure_datetime_series_full(df[ts_col])
+        return ts_col, series
+
+    def _detect_timestamp_column(self, df: pd.DataFrame) -> str:
+        datetime_cols = self._datetime_analyzer._get_datetime_columns(df)
+        for pattern in self.TIMESTAMP_PATTERNS:
+            for col in datetime_cols:
+                if pattern in col.lower():
+                    return col
+        if datetime_cols:
+            return datetime_cols[0]
+        raise ValueError("No timestamp column found")
+
+    def _ensure_datetime_series_full(self, series: pd.Series) -> pd.Series:
+        if pd.api.types.is_datetime64_any_dtype(series):
+            return series
+        return pd.to_datetime(series, format="mixed", errors="coerce")
+
+    def _compute_bins(self, ts_series: pd.Series, n_bins: int) -> tuple[list[datetime], list[int]]:
+        if ts_series.nunique() == 1:
+            return [ts_series.iloc[0].to_pydatetime()], [len(ts_series)]
+
+        bin_edges = pd.date_range(ts_series.min(), ts_series.max(), periods=n_bins + 1)
+        counts, _ = np.histogram(ts_series, bins=bin_edges)
+        bin_centers = [edge.to_pydatetime() for edge in bin_edges[:-1]]
+        return bin_centers, counts.tolist()
+
+    def _compute_percentages(self, counts: list[int]) -> tuple[list[float], list[float]]:
+        total = sum(counts)
+        if total == 0:
+            return [0.0] * len(counts), [100.0] * len(counts)
+
+        cumulative = np.cumsum(counts)
+        train_pcts = (cumulative / total * 100).tolist()
+        score_pcts = [100 - p for p in train_pcts]
+        return train_pcts, score_pcts
+
+    def _empty_analysis(self, ts_col: str, source_rows: int = 0, df: Optional[pd.DataFrame] = None, full_series: Optional[pd.Series] = None) -> CutoffAnalysis:
+        return CutoffAnalysis(
+            timestamp_column=ts_col, total_rows=0, bins=[], bin_counts=[],
+            train_percentages=[], score_percentages=[],
+            date_range=(datetime.now(), datetime.now()),
+            source_rows=source_rows, covered_rows=0,
+            resolved_timestamp_series=full_series,
+            _source_df=df,
+        )
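A runnable sketch of the analyzer on synthetic data, assuming the packaged `DatetimeOrderAnalyzer` detects datetime columns as the code above implies. The column name `signup_date` (invented for illustration) matches the `"date"` entry in `TIMESTAMP_PATTERNS`, so no explicit `timestamp_column` is needed:

```python
import pandas as pd

from customer_retention.stages.temporal import CutoffAnalyzer

# 100 rows, one signup per day, no missing timestamps.
df = pd.DataFrame({
    "customer_id": range(100),
    "signup_date": pd.date_range("2024-01-01", periods=100, freq="D"),
})

analysis = CutoffAnalyzer().analyze(df, n_bins=10)
print(analysis.coverage_ratio)                     # 1.0: every row has a timestamp
cutoff = analysis.suggest_cutoff(train_ratio=0.9)  # first bin reaching >= 90% train
print(analysis.get_split_at_date(cutoff))

result = analysis.split_at_cutoff(cutoff)          # passing None uses the suggestion
print(result.train_count, result.score_count, result.unresolvable_count)
assert result.train_count + result.score_count + result.unresolvable_count == len(df)
```

Rows whose timestamps fail to parse (coerced to NaT by `_ensure_datetime_series_full`) land in `unresolvable_df` rather than being silently dropped, which is what the assertion in `split_at_cutoff` guarantees.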