PyPI - churnkit - Versions diffs - 0.75.0a1__py3-none-any.whl - Mend

churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (302) hide show

customer_retention/stages/profiling/segment_aware_outlier.py ADDED Viewed

@@ -0,0 +1,265 @@
+"""Segment-aware outlier analysis that considers natural data clusters."""
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+import numpy as np
+from customer_retention.core.compat import DataFrame, pd, to_pandas
+from customer_retention.stages.cleaning.outlier_handler import OutlierDetectionMethod, OutlierHandler, OutlierResult
+from .segment_analyzer import SegmentAnalyzer, SegmentationResult
+@dataclass
+class SegmentAwareOutlierResult:
+    """Results from segment-aware outlier analysis."""
+    n_segments: int  # number of segments used; 0 in the empty-result case
+    global_analysis: Dict[str, OutlierResult]  # feature name -> detection over the whole frame
+    segment_analysis: Dict[Any, Dict[str, OutlierResult]]  # segment id -> feature name -> detection within that segment
+    false_outliers: Dict[str, int]  # feature name -> global outliers that are not flagged inside their own segment
+    segmentation_recommended: bool  # True when any feature reaches the analyzer's false-outlier threshold
+    recommendations: List[str]  # suggested actions, human readable
+    rationale: List[str]  # evidence lines behind the recommendations
+    segment_labels: Optional[np.ndarray] = None  # per-row segment id; -1 marks rows whose explicit segment value is null
+    segmentation_result: Optional[SegmentationResult] = None  # populated only when segments were auto-detected
+class SegmentAwareOutlierAnalyzer:
+    """Analyzes outliers considering natural data segments.
+    Addresses the problem where global outliers may actually be valid data
+    points from a different segment (e.g., enterprise vs retail customers).
+    """
+    FALSE_OUTLIER_THRESHOLD = 0.5  # If >50% of global outliers are segment-normal
+    MIN_SEGMENT_SIZE = 10
+    def __init__(
+        self,
+        detection_method: OutlierDetectionMethod = OutlierDetectionMethod.IQR,
+        iqr_multiplier: float = 1.5,
+        zscore_threshold: float = 3.0,
+        max_segments: int = 5
+    ):
+        self.detection_method = detection_method
+        self.iqr_multiplier = iqr_multiplier
+        self.zscore_threshold = zscore_threshold
+        self.max_segments = max_segments
+        self._segment_analyzer = SegmentAnalyzer()
+    def analyze(
+        self,
+        df: DataFrame,
+        feature_cols: List[str],
+        segment_col: Optional[str] = None,
+        target_col: Optional[str] = None
+    ) -> SegmentAwareOutlierResult:
+        df = to_pandas(df)
+        if len(df) == 0 or all(df[col].isna().all() for col in feature_cols if col in df.columns):
+            return self._empty_result(feature_cols)
+        valid_cols = [c for c in feature_cols if c in df.columns]
+        if not valid_cols:
+            return self._empty_result(feature_cols)
+        global_analysis = self._analyze_global(df, valid_cols)
+        if segment_col and segment_col in df.columns:
+            segment_labels, n_segments = self._use_explicit_segments(df, segment_col)
+            segmentation_result = None
+        else:
+            segment_labels, n_segments, segmentation_result = self._detect_segments(
+                df, valid_cols, target_col
+            )
+        segment_analysis = self._analyze_by_segment(df, valid_cols, segment_labels, n_segments)
+        false_outliers = self._identify_false_outliers(
+            df, valid_cols, global_analysis, segment_analysis, segment_labels
+        )
+        segmentation_recommended, recommendations, rationale = self._make_recommendations(
+            global_analysis, segment_analysis, false_outliers, n_segments
+        )
+        return SegmentAwareOutlierResult(
+            n_segments=n_segments,
+            global_analysis=global_analysis,
+            segment_analysis=segment_analysis,
+            false_outliers=false_outliers,
+            segmentation_recommended=segmentation_recommended,
+            recommendations=recommendations,
+            rationale=rationale,
+            segment_labels=segment_labels,
+            segmentation_result=segmentation_result
+        )
+    def _analyze_global(self, df: DataFrame, feature_cols: List[str]) -> Dict[str, OutlierResult]:
+        handler = OutlierHandler(
+            detection_method=self.detection_method,
+            iqr_multiplier=self.iqr_multiplier,
+            zscore_threshold=self.zscore_threshold
+        )
+        return {col: handler.detect(df[col]) for col in feature_cols}
+    def _use_explicit_segments(self, df: DataFrame, segment_col: str) -> tuple:
+        unique_segments = df[segment_col].dropna().unique()
+        label_map = {v: i for i, v in enumerate(unique_segments)}
+        labels = df[segment_col].map(label_map).fillna(-1).astype(int).values
+        return labels, len(unique_segments)
+    def _detect_segments(
+        self, df: DataFrame, feature_cols: List[str], target_col: Optional[str]
+    ) -> tuple:
+        if len(df) < self.MIN_SEGMENT_SIZE * 2:
+            return np.zeros(len(df), dtype=int), 1, None
+        try:
+            result = self._segment_analyzer.analyze(
+                df,
+                target_col=target_col,
+                feature_cols=feature_cols,
+                max_segments=self.max_segments
+            )
+            return result.labels, result.n_segments, result
+        except Exception:
+            return np.zeros(len(df), dtype=int), 1, None
+    def _analyze_by_segment(
+        self,
+        df: DataFrame,
+        feature_cols: List[str],
+        segment_labels: np.ndarray,
+        n_segments: int
+    ) -> Dict[Any, Dict[str, OutlierResult]]:
+        segment_analysis = {}
+        handler = OutlierHandler(
+            detection_method=self.detection_method,
+            iqr_multiplier=self.iqr_multiplier,
+            zscore_threshold=self.zscore_threshold
+        )
+        for seg_id in range(n_segments):
+            mask = segment_labels == seg_id
+            if mask.sum() < self.MIN_SEGMENT_SIZE:
+                continue
+            segment_df = df.loc[mask]
+            segment_analysis[seg_id] = {
+                col: handler.detect(segment_df[col]) for col in feature_cols
+            }
+        return segment_analysis
+    def _identify_false_outliers(
+        self,
+        df: DataFrame,
+        feature_cols: List[str],
+        global_analysis: Dict[str, OutlierResult],
+        segment_analysis: Dict[Any, Dict[str, OutlierResult]],
+        segment_labels: np.ndarray
+    ) -> Dict[str, int]:
+        """Identify global outliers that are normal within their segment."""
+        false_outliers = {}
+        for col in feature_cols:
+            global_result = global_analysis[col]
+            if global_result.outlier_mask is None:
+                false_outliers[col] = 0
+                continue
+            global_outlier_indices = np.where(global_result.outlier_mask)[0]
+            false_count = 0
+            for idx in global_outlier_indices:
+                seg_id = segment_labels[idx]
+                if seg_id < 0 or seg_id not in segment_analysis:
+                    continue
+                seg_result = segment_analysis[seg_id].get(col)
+                if seg_result is None or seg_result.outlier_mask is None:
+                    continue
+                # Get the local index within segment
+                seg_mask = segment_labels == seg_id
+                seg_indices = np.where(seg_mask)[0]
+                local_idx = np.where(seg_indices == idx)[0]
+                if len(local_idx) > 0:
+                    local_pos = local_idx[0]
+                    seg_outlier_mask = seg_result.outlier_mask.values
+                    if local_pos < len(seg_outlier_mask) and not seg_outlier_mask[local_pos]:
+                        false_count += 1
+            false_outliers[col] = false_count
+        return false_outliers
+    def _make_recommendations(
+        self,
+        global_analysis: Dict[str, OutlierResult],
+        segment_analysis: Dict[Any, Dict[str, OutlierResult]],
+        false_outliers: Dict[str, int],
+        n_segments: int
+    ) -> tuple:
+        recommendations = []
+        rationale = []
+        segmentation_recommended = False
+        for col, false_count in false_outliers.items():
+            global_count = global_analysis[col].outliers_detected
+            if global_count == 0:
+                continue
+            false_ratio = false_count / global_count
+            if false_ratio >= self.FALSE_OUTLIER_THRESHOLD:
+                segmentation_recommended = True
+                rationale.append(
+                    f"{col}: {false_count}/{global_count} ({false_ratio:.0%}) global outliers "
+                    f"are normal within their segment"
+                )
+                recommendations.append(
+                    f"Consider segment-specific outlier treatment for '{col}' - "
+                    f"global outliers may be valid data from different customer segments"
+                )
+            elif false_ratio > 0.2:
+                rationale.append(
+                    f"{col}: {false_count}/{global_count} ({false_ratio:.0%}) false outliers detected"
+                )
+        if n_segments > 1 and not segmentation_recommended:
+            total_global = sum(r.outliers_detected for r in global_analysis.values())
+            total_segment = sum(
+                sum(r.outliers_detected for r in seg.values())
+                for seg in segment_analysis.values()
+            )
+            if total_global > 0:
+                reduction = (total_global - total_segment) / total_global
+                if reduction > 0.3:
+                    rationale.append(
+                        f"Segment-aware analysis reduces outliers by {reduction:.0%} "
+                        f"({total_global} global → {total_segment} segment-level)"
+                    )
+        if not segmentation_recommended and n_segments <= 1:
+            rationale.append("Data appears homogeneous - global outlier treatment is appropriate")
+            recommendations.append("Use standard global outlier detection methods")
+        return segmentation_recommended, recommendations, rationale
+    def _empty_result(self, feature_cols: List[str]) -> SegmentAwareOutlierResult:
+        empty_handler = OutlierHandler()
+        empty_series = pd.Series([], dtype=float)
+        empty_result = empty_handler.detect(empty_series)
+        return SegmentAwareOutlierResult(
+            n_segments=0,
+            global_analysis={col: empty_result for col in feature_cols},
+            segment_analysis={},
+            false_outliers={col: 0 for col in feature_cols},
+            segmentation_recommended=False,
+            recommendations=["Insufficient data for outlier analysis"],
+            rationale=["Empty or all-null dataset"]
+        )

customer_retention/stages/profiling/target_level_analyzer.py ADDED Viewed

@@ -0,0 +1,217 @@
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Dict, List, Optional, Tuple
+from customer_retention.core.compat import DataFrame
+class TargetLevel(Enum):
+    """Granularity at which a target column is defined, as reported by TargetLevelAnalyzer."""
+    ENTITY_LEVEL = "entity_level"  # share of entities with varying target values is below the threshold
+    EVENT_LEVEL = "event_level"  # target varies within enough entities to need aggregation
+    UNKNOWN = "unknown"  # detection could not run (e.g. target or entity column not specified)
+    MISSING = "missing"  # target column is not present in the data
+class AggregationMethod(Enum):
+    """How per-event target values are collapsed into one value per entity."""
+    MAX = "max"  # also the fallback used by TargetLevelAnalyzer for anything unrecognised
+    MEAN = "mean"
+    SUM = "sum"
+    LAST = "last"  # ordered by the time column when one is given, otherwise by row order
+    FIRST = "first"  # ordered by the time column when one is given, otherwise by row order
+@dataclass
+class TargetDistribution:
+    value_counts: Dict[int, int]
+    total: int
+    @property
+    def as_percentages(self) -> Dict[int, float]:
+        return {k: v / self.total * 100 for k, v in self.value_counts.items()}
+    def get_label(self, value: int) -> str:
+        return {1: "Churned", 0: "Retained"}.get(value, str(value))
+@dataclass
+class TargetLevelResult:
+    """Outcome of target-level detection and, optionally, of the aggregation to entity level."""
+    target_column: str  # target column name as requested ("" when it was None)
+    entity_column: str  # entity key column name as requested ("" when it was None)
+    level: TargetLevel
+    suggested_aggregation: Optional[AggregationMethod]  # set for event-level targets, otherwise None
+    event_distribution: Optional[TargetDistribution] = None  # target value counts over all rows
+    entity_distribution: Optional[TargetDistribution] = None  # target value counts over entities
+    variation_pct: float = 0.0  # percentage of entities with more than one distinct target value
+    is_binary: bool = False  # True when exactly two distinct target values were counted
+    entity_target_column: Optional[str] = None  # column holding the entity-level target; set by aggregate_to_entity
+    aggregation_used: Optional[AggregationMethod] = None  # set by aggregate_to_entity when it aggregated
+    messages: List[str] = field(default_factory=list)  # human-readable notes and warnings
+class TargetLevelAnalyzer:
+    ENTITY_LEVEL_THRESHOLD = 5.0
+    TARGET_KEYWORDS = ['churn', 'unsub', 'cancel', 'retain', 'active', 'lost', 'leave', 'target']
+    def __init__(self, variation_threshold: float = 5.0):
+        self.variation_threshold = variation_threshold
+    def detect_level(self, df: DataFrame, target_column: str, entity_column: str) -> TargetLevelResult:
+        if target_column is None or entity_column is None:
+            return TargetLevelResult(
+                target_column=target_column or "", entity_column=entity_column or "",
+                level=TargetLevel.UNKNOWN, suggested_aggregation=None,
+                messages=["Target or entity column not specified"])
+        if target_column not in df.columns:
+            return TargetLevelResult(
+                target_column=target_column, entity_column=entity_column,
+                level=TargetLevel.MISSING, suggested_aggregation=None,
+                messages=[f"Target column '{target_column}' not found in data"])
+        event_counts = df[target_column].value_counts().to_dict()
+        event_dist = TargetDistribution(value_counts=event_counts, total=len(df))
+        target_per_entity = df.groupby(entity_column)[target_column].nunique()
+        total_entities = len(target_per_entity)
+        variation_pct = ((target_per_entity > 1).sum() / total_entities * 100) if total_entities > 0 else 0
+        is_binary = len(event_counts) == 2
+        if variation_pct < self.variation_threshold:
+            entity_target = df.groupby(entity_column)[target_column].first()
+            entity_dist = TargetDistribution(value_counts=entity_target.value_counts().to_dict(), total=len(entity_target))
+            return TargetLevelResult(
+                target_column=target_column, entity_column=entity_column, level=TargetLevel.ENTITY_LEVEL,
+                suggested_aggregation=None, event_distribution=event_dist, entity_distribution=entity_dist,
+                variation_pct=variation_pct, is_binary=is_binary,
+                messages=["Target is consistent within entities (entity-level)"])
+        return TargetLevelResult(
+            target_column=target_column, entity_column=entity_column, level=TargetLevel.EVENT_LEVEL,
+            suggested_aggregation=self._suggest_aggregation(event_counts, is_binary),
+            event_distribution=event_dist, variation_pct=variation_pct, is_binary=is_binary,
+            messages=[f"Target varies within entities ({variation_pct:.1f}% have variation)",
+                      f"Suggested aggregation: {self._suggest_aggregation(event_counts, is_binary).value}"])
+    def aggregate_to_entity(self, df: DataFrame, target_column: str, entity_column: str,
+                           time_column: Optional[str] = None,
+                           method: AggregationMethod = AggregationMethod.MAX) -> Tuple[DataFrame, TargetLevelResult]:
+        result = self.detect_level(df, target_column, entity_column)
+        if result.level == TargetLevel.ENTITY_LEVEL:
+            result.entity_target_column = target_column
+            return df, result
+        if result.level in [TargetLevel.MISSING, TargetLevel.UNKNOWN]:
+            return df, result
+        entity_target_col = f"{target_column}_entity"
+        entity_target = self._compute_entity_target(df, target_column, entity_column, time_column, method, result)
+        entity_dist = TargetDistribution(value_counts=entity_target.value_counts().to_dict(), total=len(entity_target))
+        entity_target_map = entity_target.reset_index()
+        entity_target_map.columns = [entity_column, entity_target_col]
+        df_result = df.merge(entity_target_map, on=entity_column, how="left")
+        result.entity_distribution = entity_dist
+        result.entity_target_column = entity_target_col
+        result.aggregation_used = method
+        result.messages.append(f"Created entity-level target: {entity_target_col}")
+        return df_result, result
+    def _compute_entity_target(self, df: DataFrame, target_column: str, entity_column: str,
+                               time_column: Optional[str], method: AggregationMethod,
+                               result: TargetLevelResult):
+        agg_funcs = {
+            AggregationMethod.MAX: lambda: df.groupby(entity_column)[target_column].max(),
+            AggregationMethod.MEAN: lambda: df.groupby(entity_column)[target_column].mean(),
+            AggregationMethod.SUM: lambda: df.groupby(entity_column)[target_column].sum(),
+        }
+        if method in agg_funcs:
+            return agg_funcs[method]()
+        if method == AggregationMethod.LAST:
+            if time_column is None:
+                result.messages.append("Warning: 'last' aggregation without time_column uses row order")
+                return df.groupby(entity_column)[target_column].last()
+            return df.sort_values(time_column).groupby(entity_column)[target_column].last()
+        if method == AggregationMethod.FIRST:
+            if time_column is None:
+                return df.groupby(entity_column)[target_column].first()
+            return df.sort_values(time_column).groupby(entity_column)[target_column].first()
+        return df.groupby(entity_column)[target_column].max()
+    def _suggest_aggregation(self, value_counts: Dict[int, int], is_binary: bool) -> AggregationMethod:
+        return AggregationMethod.MAX
+    def print_analysis(self, result: TargetLevelResult):
+        print("=" * 70 + "\nTARGET LEVEL ANALYSIS\n" + "=" * 70)
+        print(f"\nColumn: {result.target_column}\nLevel: {result.level.value.upper()}")
+        if result.level == TargetLevel.EVENT_LEVEL:
+            print(f"\n⚠️  EVENT-LEVEL TARGET DETECTED\n   {result.variation_pct:.1f}% of entities have varying target values")
+            if result.event_distribution:
+                print("\n   Event-level distribution:")
+                for val, count in sorted(result.event_distribution.value_counts.items()):
+                    print(f"      {result.target_column}={val}: {count:,} events ({result.event_distribution.as_percentages[val]:.1f}%)")
+            if result.suggested_aggregation:
+                print(f"\n   Suggested aggregation: {result.suggested_aggregation.value}")
+        elif result.level == TargetLevel.ENTITY_LEVEL:
+            print("\n✓ Target is already at entity-level")
+            if result.entity_distribution:
+                print("\n   Entity-level distribution:")
+                for val, count in sorted(result.entity_distribution.value_counts.items()):
+                    pct = result.entity_distribution.as_percentages[val]
+                    label = result.entity_distribution.get_label(val)
+                    print(f"      {label} ({result.target_column}={val}): {count:,} entities ({pct:.1f}%)")
+        if result.aggregation_used:
+            print(f"\n   Aggregation applied: {result.aggregation_used.value}")
+            print(f"   Entity target column: {result.entity_target_column}")
+            if result.entity_distribution:
+                print("\n   Entity-level distribution (after aggregation):")
+                for val, count in sorted(result.entity_distribution.value_counts.items()):
+                    pct = result.entity_distribution.as_percentages[val]
+                    label = result.entity_distribution.get_label(val)
+                    print(f"      {label} ({result.entity_target_column}={val}): {count:,} entities ({pct:.1f}%)")
+        print()
+class TargetColumnDetector:
+    TARGET_KEYWORDS = ['churn', 'unsub', 'cancel', 'retain', 'active', 'lost', 'leave', 'target']
+    def detect(self, findings, df: DataFrame, override: Optional[str] = None) -> Tuple[Optional[str], str]:
+        from customer_retention.core.config.column_config import ColumnType
+        if override == "DEFER_TO_MULTI_DATASET":
+            return None, "deferred"
+        if override is not None:
+            return override, "override"
+        for col_name, col_info in findings.columns.items():
+            if col_info.inferred_type == ColumnType.TARGET:
+                return col_name, "auto-detected"
+        for col_name, col_info in findings.columns.items():
+            if col_info.inferred_type == ColumnType.BINARY:
+                if any(kw in col_name.lower() for kw in self.TARGET_KEYWORDS):
+                    return col_name, "binary-candidate"
+        return None, "not-found"
+    def print_detection(self, target_column: Optional[str], method: str,
+                        other_candidates: Optional[List[str]] = None):
+        messages = {
+            "deferred": "\n⏳ Target deferred to multi-dataset notebook (05)\n   Analysis will proceed without target-based comparisons",
+            "override": f"\n🔧 Using override target: {target_column}",
+            "auto-detected": f"\n🔍 Auto-detected target: {target_column}",
+            "not-found": "\n🔍 No target column detected"
+        }
+        if method == "binary-candidate":
+            print(f"\n🔍 No explicit target detected, using binary candidate: {target_column}")
+            if other_candidates:
+                print(f"   Other candidates: {other_candidates}")
+        else:
+            print(messages.get(method, ""))