churnkit 0.75.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0
customer_retention/core/components/components/ingester.py
@@ -0,0 +1,34 @@
+from typing import List
+
+from customer_retention.core.compat import ops
+from customer_retention.generators.orchestration.context import PipelineContext
+
+from ..base import Component, ComponentResult
+
+
+class Ingester(Component):
+    def __init__(self):
+        super().__init__(name="Ingester", chapters=[1])
+
+    def validate_inputs(self, context: PipelineContext) -> List[str]:
+        errors = []
+        if not context.raw_data_path:
+            errors.append("raw_data_path is required")
+        return errors
+
+    def run(self, context: PipelineContext) -> ComponentResult:
+        self._start_timer()
+        try:
+            path = context.raw_data_path
+            df = ops.read_csv(path)
+            context.current_df = df
+            context.current_stage = "bronze"
+            row_count = len(df)
+            col_count = len(df.columns)
+            return self.create_result(
+                success=True,
+                artifacts={"bronze_data": context.bronze_path} if context.bronze_path else {},
+                metrics={"row_count": row_count, "column_count": col_count}
+            )
+        except Exception as e:
+            return self.create_result(success=False, errors=[str(e)])
customer_retention/core/components/components/profiler.py
@@ -0,0 +1,34 @@
+from typing import List
+
+from customer_retention.generators.orchestration.context import PipelineContext
+
+from ..base import Component, ComponentResult
+
+
+class Profiler(Component):
+    def __init__(self):
+        super().__init__(name="Profiler", chapters=[2])
+
+    def validate_inputs(self, context: PipelineContext) -> List[str]:
+        errors = []
+        if context.current_df is None:
+            errors.append("No DataFrame available for profiling")
+        return errors
+
+    def run(self, context: PipelineContext) -> ComponentResult:
+        self._start_timer()
+        try:
+            from customer_retention.stages.profiling.column_profiler import ColumnProfiler
+            from customer_retention.stages.profiling.type_detector import TypeDetector
+            df = context.current_df
+            type_detector = TypeDetector()
+            type_results = type_detector.detect_all(df)
+            profiler = ColumnProfiler()
+            profile = profiler.profile_all(df)
+            context.profiling_results = {"types": type_results, "profile": profile}
+            return self.create_result(
+                success=True,
+                metrics={"columns_profiled": len(df.columns)}
+            )
+        except Exception as e:
+            return self.create_result(success=False, errors=[str(e)])
customer_retention/core/components/components/trainer.py
@@ -0,0 +1,38 @@
+from typing import List
+
+from customer_retention.generators.orchestration.context import PipelineContext
+
+from ..base import Component, ComponentResult
+
+
+class Trainer(Component):
+    def __init__(self):
+        super().__init__(name="Trainer", chapters=[5])
+
+    def validate_inputs(self, context: PipelineContext) -> List[str]:
+        errors = []
+        if context.current_df is None:
+            errors.append("No DataFrame available for training")
+        if not context.target_column:
+            errors.append("target_column is required for training")
+        return errors
+
+    def run(self, context: PipelineContext) -> ComponentResult:
+        self._start_timer()
+        try:
+            from customer_retention.stages.modeling.baseline_trainer import BaselineTrainer
+            from customer_retention.stages.modeling.data_splitter import DataSplitter
+            df = context.current_df
+            target = context.target_column
+            splitter = DataSplitter()
+            X_train, X_test, y_train, y_test = splitter.split(df, target)
+            trainer = BaselineTrainer()
+            results = trainer.train_all(X_train, y_train, X_test, y_test)
+            context.model_results = results
+            best_model = max(results, key=lambda x: results[x].get("pr_auc", 0))
+            return self.create_result(
+                success=True,
+                metrics={"best_model": best_model, "pr_auc": results[best_model].get("pr_auc", 0)}
+            )
+        except Exception as e:
+            return self.create_result(success=False, errors=[str(e)])
customer_retention/core/components/components/transformer.py
@@ -0,0 +1,36 @@
+from typing import List
+
+from customer_retention.generators.orchestration.context import PipelineContext
+
+from ..base import Component, ComponentResult
+
+
+class Transformer(Component):
+    def __init__(self):
+        super().__init__(name="Transformer", chapters=[3])
+
+    def validate_inputs(self, context: PipelineContext) -> List[str]:
+        errors = []
+        if context.current_df is None:
+            errors.append("No DataFrame available for transformation")
+        return errors
+
+    def run(self, context: PipelineContext) -> ComponentResult:
+        self._start_timer()
+        try:
+            from customer_retention.stages.cleaning.missing_handler import MissingHandler
+            from customer_retention.stages.cleaning.outlier_handler import OutlierHandler
+            df = context.current_df
+            missing_handler = MissingHandler()
+            df = missing_handler.handle(df)
+            outlier_handler = OutlierHandler()
+            df = outlier_handler.handle(df)
+            context.current_df = df
+            context.current_stage = "silver"
+            return self.create_result(
+                success=True,
+                artifacts={"silver_data": context.silver_path} if context.silver_path else {},
+                metrics={"row_count": len(df)}
+            )
+        except Exception as e:
+            return self.create_result(success=False, errors=[str(e)])
customer_retention/core/components/components/validator.py
@@ -0,0 +1,37 @@
+from typing import List
+
+from customer_retention.generators.orchestration.context import PipelineContext
+
+from ..base import Component, ComponentResult
+
+
+class Validator(Component):
+    def __init__(self):
+        super().__init__(name="Validator", chapters=[6])
+
+    def validate_inputs(self, context: PipelineContext) -> List[str]:
+        errors = []
+        if not context.model_results:
+            errors.append("No model results available for validation")
+        return errors
+
+    def run(self, context: PipelineContext) -> ComponentResult:
+        self._start_timer()
+        try:
+            from customer_retention.analysis.diagnostics.calibration_analyzer import CalibrationAnalyzer
+            from customer_retention.analysis.diagnostics.leakage_detector import LeakageDetector
+            from customer_retention.analysis.diagnostics.overfitting_analyzer import OverfittingAnalyzer
+            LeakageDetector()
+            OverfittingAnalyzer()
+            CalibrationAnalyzer()
+            context.validation_results = {
+                "leakage": "checked",
+                "overfitting": "checked",
+                "calibration": "checked"
+            }
+            return self.create_result(
+                success=True,
+                metrics={"diagnostics_run": 3}
+            )
+        except Exception as e:
+            return self.create_result(success=False, errors=[str(e)])
customer_retention/core/components/enums.py
@@ -0,0 +1,33 @@
+from enum import Enum
+
+
+class Severity(str, Enum):
+    CRITICAL = "critical"
+    HIGH = "high"
+    WARNING = "warning"
+    MEDIUM = "medium"
+    LOW = "low"
+    INFO = "info"
+
+
+class ModelType(Enum):
+    LOGISTIC_REGRESSION = "logistic_regression"
+    RANDOM_FOREST = "random_forest"
+    XGBOOST = "xgboost"
+    LIGHTGBM = "lightgbm"
+    CATBOOST = "catboost"
+
+
+class RiskSegment(Enum):
+    """Customer risk segmentation levels."""
+    CRITICAL = "Critical"
+    HIGH = "High"
+    MEDIUM = "Medium"
+    LOW = "Low"
+    VERY_LOW = "Very Low"
+
+
+class Platform(str, Enum):
+    """Deployment platform options."""
+    LOCAL = "local"
+    DATABRICKS = "databricks"
customer_retention/core/components/orchestrator.py
@@ -0,0 +1,94 @@
+import time
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Dict, List
+
+from .base import Component, ComponentResult, ComponentStatus
+from .registry import ComponentRegistry
+
+if TYPE_CHECKING:
+    from customer_retention.generators.orchestration.context import PipelineContext
+
+
+@dataclass
+class OrchestratorResult:
+    success: bool
+    components_run: List[str]
+    results: Dict[str, ComponentResult]
+    total_duration_seconds: float
+
+    def get_summary(self) -> str:
+        status = "SUCCESS" if self.success else "FAILED"
+        return f"{status}: {len(self.components_run)} components in {self.total_duration_seconds:.1f}s"
+
+
+class Orchestrator:
+    def __init__(self, registry: ComponentRegistry, context: "PipelineContext"):
+        self.registry = registry
+        self.context = context
+
+    def run_training(self) -> OrchestratorResult:
+        return self.run_chapters([1, 2, 3, 4, 5, 6, 7])
+
+    def run_phase(self, phase: str) -> OrchestratorResult:
+        start_time = time.time()
+        registrations = self.registry.get_phase_components(phase)
+        components_run = []
+        results = {}
+        success = True
+        for reg in registrations:
+            name = self._get_name_for_registration(reg)
+            result = self._run_component(reg.component_class)
+            results[name] = result
+            components_run.append(name)
+            if not result.success:
+                success = False
+                break
+        return OrchestratorResult(
+            success=success,
+            components_run=components_run,
+            results=results,
+            total_duration_seconds=time.time() - start_time
+        )
+
+    def run_chapters(self, chapters: List[int]) -> OrchestratorResult:
+        start_time = time.time()
+        registrations = self.registry.get_chapters_components(chapters)
+        components_run = []
+        results = {}
+        success = True
+        for reg in registrations:
+            name = self._get_name_for_registration(reg)
+            result = self._run_component(reg.component_class)
+            results[name] = result
+            components_run.append(name)
+            if not result.success:
+                success = False
+                break
+        return OrchestratorResult(
+            success=success,
+            components_run=components_run,
+            results=results,
+            total_duration_seconds=time.time() - start_time
+        )
+
+    def run_single(self, component_name: str) -> ComponentResult:
+        reg = self.registry.get_component(component_name)
+        return self._run_component(reg.component_class)
+
+    def _run_component(self, component_class: type) -> ComponentResult:
+        component: Component = component_class()
+        errors = component.validate_inputs(self.context)
+        if errors:
+            return ComponentResult(
+                success=False, status=ComponentStatus.FAILED,
+                errors=errors
+            )
+        if component.should_skip(self.context):
+            return ComponentResult(success=True, status=ComponentStatus.SKIPPED)
+        return component.run(self.context)
+
+    def _get_name_for_registration(self, reg) -> str:
+        for name, r in self.registry._components.items():
+            if r == reg:
+                return name
+        return reg.component_class.__name__.lower()
customer_retention/core/components/registry.py
@@ -0,0 +1,59 @@
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional, Type
+
+from .base import Component
+
+
+@dataclass
+class ComponentRegistration:
+    component_class: Type[Component]
+    phase: str
+    dependencies: List[str] = field(default_factory=list)
+
+
+class ComponentRegistry:
+    PHASES = ["discovery", "data_preparation", "model_development", "production"]
+
+    def __init__(self):
+        self._components: Dict[str, ComponentRegistration] = {}
+
+    def register(self, name: str, component_class: Type[Component], phase: str,
+                 dependencies: Optional[List[str]] = None) -> None:
+        self._components[name] = ComponentRegistration(
+            component_class=component_class,
+            phase=phase,
+            dependencies=dependencies or []
+        )
+
+    def get_component(self, name: str) -> ComponentRegistration:
+        if name not in self._components:
+            raise KeyError(f"Component '{name}' not found")
+        return self._components[name]
+
+    def get_phase_components(self, phase: str) -> List[ComponentRegistration]:
+        return [reg for reg in self._components.values() if reg.phase == phase]
+
+    def get_chapters_components(self, chapters: List[int]) -> List[ComponentRegistration]:
+        result = []
+        for reg in self._components.values():
+            instance = reg.component_class()
+            if any(ch in instance.chapters for ch in chapters):
+                result.append(reg)
+        return result
+
+    def list_components(self) -> List[str]:
+        return list(self._components.keys())
+
+
+def get_default_registry() -> ComponentRegistry:
+    from .components import Deployer, Explainer, FeatureEngineer, Ingester, Profiler, Trainer, Transformer, Validator
+    registry = ComponentRegistry()
+    registry.register("ingester", Ingester, "data_preparation")
+    registry.register("profiler", Profiler, "data_preparation", ["ingester"])
+    registry.register("transformer", Transformer, "data_preparation", ["profiler"])
+    registry.register("feature_engineer", FeatureEngineer, "data_preparation", ["transformer"])
+    registry.register("trainer", Trainer, "model_development", ["feature_engineer"])
+    registry.register("validator", Validator, "model_development", ["trainer"])
+    registry.register("explainer", Explainer, "model_development", ["trainer"])
+    registry.register("deployer", Deployer, "production", ["validator"])
+    return registry
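Taken together, registry.py and orchestrator.py are the wiring layer for the generated pipeline: get_default_registry() registers the eight components with their phases and dependencies, and Orchestrator walks them by phase or by chapter, validating inputs and honoring should_skip() before calling run(). A minimal sketch of how that wiring might be exercised follows; it assumes the import paths implied by the file layout above, and PipelineContext (defined in context.py, +147 lines, not shown in this diff) is assumed to expose the raw_data_path and target_column attributes the components read.

from customer_retention.core.components.orchestrator import Orchestrator
from customer_retention.core.components.registry import get_default_registry
from customer_retention.generators.orchestration.context import PipelineContext

# Assumption: this construction is illustrative only; the real PipelineContext
# signature lives in context.py, which this diff does not show.
context = PipelineContext()
context.raw_data_path = "experiments/data/bronze/customers.csv"
context.target_column = "churned"

orchestrator = Orchestrator(get_default_registry(), context)

prep = orchestrator.run_phase("data_preparation")   # ingester -> profiler -> transformer -> feature_engineer
print(prep.get_summary())

training = orchestrator.run_training()              # chapters 1-7, stops at the first failing component
for name, result in training.results.items():
    print(name, result.success, result.metrics)     # ComponentResult fields assumed from the create_result() calls above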
customer_retention/core/config/__init__.py
@@ -0,0 +1,39 @@
+from .column_config import ColumnConfig, ColumnType, DatasetGranularity
+from .experiments import (
+    DATA_DIR,
+    EXPERIMENTS_DIR,
+    FEATURE_STORE_DIR,
+    FINDINGS_DIR,
+    MLRUNS_DIR,
+    OUTPUT_DIR,
+    get_data_dir,
+    get_experiments_dir,
+    get_feature_store_dir,
+    get_findings_dir,
+    get_mlruns_dir,
+    get_notebook_experiments_dir,
+    setup_experiments_structure,
+)
+from .pipeline_config import (
+    BronzeConfig,
+    DedupStrategy,
+    GoldConfig,
+    ModelingConfig,
+    PathConfig,
+    PipelineConfig,
+    SilverConfig,
+    ValidationConfig,
+)
+from .source_config import DataSourceConfig, FileFormat, Grain, SourceType
+
+__all__ = [
+    "ColumnType", "ColumnConfig", "DatasetGranularity",
+    "SourceType", "FileFormat", "Grain", "DataSourceConfig",
+    "DedupStrategy", "BronzeConfig", "SilverConfig", "GoldConfig",
+    "ModelingConfig", "ValidationConfig", "PathConfig", "PipelineConfig",
+    "EXPERIMENTS_DIR", "FINDINGS_DIR", "DATA_DIR", "MLRUNS_DIR",
+    "FEATURE_STORE_DIR", "OUTPUT_DIR", "get_experiments_dir",
+    "get_findings_dir", "get_data_dir", "get_mlruns_dir",
+    "get_feature_store_dir", "get_notebook_experiments_dir",
+    "setup_experiments_structure",
+]
customer_retention/core/config/column_config.py
@@ -0,0 +1,95 @@
+from enum import Enum
+from typing import Optional
+
+from pydantic import BaseModel, model_validator
+
+
+class ColumnType(str, Enum):
+    IDENTIFIER = "identifier"
+    TARGET = "target"
+    FEATURE_TIMESTAMP = "feature_timestamp"
+    LABEL_TIMESTAMP = "label_timestamp"
+    NUMERIC_CONTINUOUS = "numeric_continuous"
+    NUMERIC_DISCRETE = "numeric_discrete"
+    CATEGORICAL_NOMINAL = "categorical_nominal"
+    CATEGORICAL_ORDINAL = "categorical_ordinal"
+    CATEGORICAL_CYCLICAL = "categorical_cyclical"
+    DATETIME = "datetime"
+    BINARY = "binary"
+    TEXT = "text"
+    UNKNOWN = "unknown"
+
+
+# Column types that should NEVER be used as features (leakage risk)
+NON_FEATURE_COLUMN_TYPES = frozenset({
+    ColumnType.IDENTIFIER,
+    ColumnType.TARGET,
+    ColumnType.FEATURE_TIMESTAMP,
+    ColumnType.LABEL_TIMESTAMP,
+})
+
+
+class DatasetGranularity(str, Enum):
+    """Describes the grain/granularity of a dataset.
+
+    ENTITY_LEVEL: One row per entity (e.g., one row per customer)
+    EVENT_LEVEL: Multiple rows per entity over time (e.g., transactions, emails)
+    UNKNOWN: Cannot determine granularity
+    """
+    ENTITY_LEVEL = "entity_level"
+    EVENT_LEVEL = "event_level"
+    UNKNOWN = "unknown"
+
+
+class ColumnConfig(BaseModel):
+    name: str
+    column_type: ColumnType
+    nullable: bool = True
+
+    encoding_strategy: Optional[str] = None
+    scaling_strategy: Optional[str] = None
+    missing_strategy: Optional[str] = None
+    ordinal_order: Optional[list[str]] = None
+    cyclical_max: Optional[int] = None
+
+    min_value: Optional[float] = None
+    max_value: Optional[float] = None
+    allowed_values: Optional[list[str]] = None
+    regex_pattern: Optional[str] = None
+
+    description: Optional[str] = None
+    business_name: Optional[str] = None
+    is_feature: Optional[bool] = None
+    exclude_from_model: bool = False
+
+    @model_validator(mode='after')
+    def validate_cyclical_and_ordinal(self):
+        if self.column_type == ColumnType.CATEGORICAL_CYCLICAL and self.cyclical_max is None:
+            raise ValueError("cyclical_max required for CATEGORICAL_CYCLICAL columns")
+        if self.column_type == ColumnType.CATEGORICAL_ORDINAL and self.ordinal_order is None:
+            raise ValueError("ordinal_order required for CATEGORICAL_ORDINAL columns")
+        return self
+
+    def should_be_used_as_feature(self) -> bool:
+        if self.exclude_from_model:
+            return False
+        if self.is_feature is not None:
+            return self.is_feature
+        return self.column_type not in NON_FEATURE_COLUMN_TYPES
+
+    def is_categorical(self) -> bool:
+        return self.column_type in [
+            ColumnType.CATEGORICAL_NOMINAL,
+            ColumnType.CATEGORICAL_ORDINAL,
+            ColumnType.CATEGORICAL_CYCLICAL,
+            ColumnType.BINARY
+        ]
+
+    def is_numeric(self) -> bool:
+        return self.column_type in [
+            ColumnType.NUMERIC_CONTINUOUS,
+            ColumnType.NUMERIC_DISCRETE
+        ]
+
+    def is_temporal(self) -> bool:
+        return self.column_type == ColumnType.DATETIME
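column_config.py keeps the feature-eligibility rules as data rather than scattered conditionals: identifier, target, and timestamp columns sit in NON_FEATURE_COLUMN_TYPES, and the pydantic model validator forces cyclical and ordinal columns to declare their period or ordering. A short usage sketch, assuming the import path that follows from the file layout above:

from customer_retention.core.config.column_config import ColumnConfig, ColumnType

signup_month = ColumnConfig(
    name="signup_month",
    column_type=ColumnType.CATEGORICAL_CYCLICAL,
    cyclical_max=12,  # required for cyclical columns, enforced by validate_cyclical_and_ordinal
)
customer_id = ColumnConfig(name="customer_id", column_type=ColumnType.IDENTIFIER)

assert signup_month.should_be_used_as_feature()       # cyclical columns are features by default
assert not customer_id.should_be_used_as_feature()    # identifiers are excluded as a leakage risk
assert signup_month.is_categorical() and not signup_month.is_numeric()

# Omitting cyclical_max (or ordinal_order for an ordinal column) raises a pydantic ValidationError.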
customer_retention/core/config/experiments.py
@@ -0,0 +1,71 @@
+import os
+from pathlib import Path
+from typing import Optional
+
+
+def _find_project_root() -> Path:
+    path = Path(__file__).parent
+    for _ in range(10):
+        if (path / "pyproject.toml").exists() or (path / ".git").exists():
+            return path
+        path = path.parent
+    return Path.cwd()
+
+
+def get_experiments_dir(default: Optional[str] = None) -> Path:
+    if "CR_EXPERIMENTS_DIR" in os.environ:
+        return Path(os.environ["CR_EXPERIMENTS_DIR"])
+    if default:
+        return Path(default)
+    return _find_project_root() / "experiments"
+
+
+def get_findings_dir(default: Optional[str] = None) -> Path:
+    return get_experiments_dir(default) / "findings"
+
+
+def get_data_dir(default: Optional[str] = None) -> Path:
+    return get_experiments_dir(default) / "data"
+
+
+def get_mlruns_dir(default: Optional[str] = None) -> Path:
+    return get_experiments_dir(default) / "mlruns"
+
+
+def get_feature_store_dir(default: Optional[str] = None) -> Path:
+    return get_experiments_dir(default) / "feature_repo"
+
+
+EXPERIMENTS_DIR = get_experiments_dir()
+FINDINGS_DIR = get_findings_dir()
+DATA_DIR = get_data_dir()
+MLRUNS_DIR = get_mlruns_dir()
+FEATURE_STORE_DIR = get_feature_store_dir()
+OUTPUT_DIR = FINDINGS_DIR
+
+
+def setup_experiments_structure(experiments_dir: Optional[Path] = None) -> None:
+    base = experiments_dir or get_experiments_dir()
+    directories = [
+        base / "findings" / "snapshots",
+        base / "findings" / "unified",
+        base / "data" / "bronze",
+        base / "data" / "silver",
+        base / "data" / "gold",
+        base / "data" / "scoring",
+        base / "mlruns",
+        base / "feature_repo" / "data",
+    ]
+    for directory in directories:
+        directory.mkdir(parents=True, exist_ok=True)
+
+
+def get_notebook_experiments_dir() -> Path:
+    if "CR_EXPERIMENTS_DIR" in os.environ:
+        return Path(os.environ["CR_EXPERIMENTS_DIR"])
+    cwd = Path.cwd()
+    if (cwd.parent / "experiments").exists():
+        return cwd.parent / "experiments"
+    elif (cwd / "experiments").exists():
+        return cwd / "experiments"
+    return get_experiments_dir()
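experiments.py resolves every artifact directory from a single root: the CR_EXPERIMENTS_DIR environment variable wins, then an explicit default, then <project root>/experiments found by walking up to pyproject.toml or .git. Note that the module-level constants (EXPERIMENTS_DIR, DATA_DIR, and so on) are evaluated once at import time, while the getter functions re-read the environment on every call. A brief sketch, assuming the import path implied by the file layout above:

import os

os.environ["CR_EXPERIMENTS_DIR"] = "/tmp/churnkit-experiments"  # set before import if the constants should pick it up

from customer_retention.core.config.experiments import (
    get_data_dir,
    get_experiments_dir,
    setup_experiments_structure,
)

setup_experiments_structure()     # creates findings/, data/{bronze,silver,gold,scoring}, mlruns/, feature_repo/data
print(get_experiments_dir())      # /tmp/churnkit-experiments
print(get_data_dir() / "bronze")  # /tmp/churnkit-experiments/data/bronze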