churnkit 0.75.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
from typing import Dict, List, Optional
|
|
2
|
+
|
|
3
|
+
import nbformat
|
|
4
|
+
|
|
5
|
+
DATABRICKS_SEPARATOR = "\n# COMMAND ----------\n"


class CellBuilder:
    """Static helpers for assembling nbformat v4 cells and notebooks."""

    @staticmethod
    def markdown(content: str) -> nbformat.NotebookNode:
        """Wrap *content* in a markdown cell."""
        return nbformat.v4.new_markdown_cell(content)

    @staticmethod
    def code(source: str, metadata: Optional[Dict] = None) -> nbformat.NotebookNode:
        """Create a code cell, merging *metadata* into ``cell.metadata`` when given."""
        new_cell = nbformat.v4.new_code_cell(source)
        if metadata:
            new_cell.metadata.update(metadata)
        return new_cell

    @staticmethod
    def header(title: str, level: int = 1) -> nbformat.NotebookNode:
        """Markdown heading cell at the given level (H1 by default)."""
        hashes = "#" * level
        return CellBuilder.markdown(f"{hashes} {title}")

    @staticmethod
    def section(title: str, description: str = "") -> nbformat.NotebookNode:
        """H2 section cell; a non-empty *description* follows after a blank line."""
        parts = [f"## {title}"]
        if description:
            parts.append(description)
        return CellBuilder.markdown("\n\n".join(parts))

    @staticmethod
    def databricks_separator() -> str:
        """Separator Databricks uses between cells in an exported source file."""
        return DATABRICKS_SEPARATOR

    @staticmethod
    def create_notebook(cells: List[nbformat.NotebookNode]) -> nbformat.NotebookNode:
        """Build a v4 notebook whose cell list is exactly *cells*."""
        notebook = nbformat.v4.new_notebook()
        notebook.cells = cells
        return notebook

    @staticmethod
    def imports_cell(imports: List[str]) -> nbformat.NotebookNode:
        """Code cell of plain ``import X`` statements, one per entry."""
        statements = "\n".join(f"import {name}" for name in imports)
        return CellBuilder.code(statements)

    @staticmethod
    def from_imports_cell(from_imports: Dict[str, List[str]]) -> nbformat.NotebookNode:
        """Code cell of ``from module import a, b`` lines, one per mapping entry."""
        statements = "\n".join(
            f"from {module} import {', '.join(names)}"
            for module, names in from_imports.items()
        )
        return CellBuilder.code(statements)
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from enum import Enum
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
from customer_retention.core.components.enums import Platform
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class OutputFormat(str, Enum):
    """Output flavor for generated pipelines: notebook files or plain scripts."""

    NOTEBOOK = "notebook"
    SCRIPT = "script"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class MLflowConfig:
    """MLflow tracking and model-registry settings for generated pipelines."""

    tracking_uri: str = "./experiments/mlruns"  # file-based local store by default
    registry_uri: Optional[str] = None  # optional separate model-registry URI
    experiment_name: str = "customer_retention"
    model_name: str = "churn_model"
    # Flags toggling what generated pipelines log to MLflow; names suggest
    # data-quality metrics, transformation records, and per-stage runs —
    # NOTE(review): confirm against the generator code that consumes them.
    track_data_quality: bool = True
    track_transformations: bool = True
    track_pipeline_stages: bool = True
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class FeatureStoreConfig:
    """Feature-store location settings (local path plus catalog/schema/table)."""

    base_path: str = "./experiments/feature_store"  # root of the local store
    catalog: str = "main"  # presumably a Databricks Unity Catalog name — confirm
    schema: str = "default"
    table_name: str = "customer_features"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class NotebookConfig:
    """Top-level settings driving notebook/script pipeline generation."""

    project_name: str = "customer_retention"
    platform: Platform = Platform.LOCAL  # target execution platform
    output_format: OutputFormat = OutputFormat.NOTEBOOK  # notebooks vs scripts
    mlflow: MLflowConfig = field(default_factory=MLflowConfig)
    feature_store: FeatureStoreConfig = field(default_factory=FeatureStoreConfig)
    model_type: str = "xgboost"  # presumably the model family trained — confirm
    test_size: float = 0.2  # presumably the holdout fraction — confirm in trainer
    threshold: float = 0.5  # presumably the classification cutoff — confirm
    # Feature-selection cutoffs; names suggest variance / correlation filters —
    # NOTE(review): confirm against the feature-selection stage.
    variance_threshold: float = 0.01
    correlation_threshold: float = 0.95

    @property
    def use_framework(self) -> bool:
        # True only when targeting the LOCAL platform; other platforms
        # presumably run from exported artifacts instead — confirm.
        return self.platform == Platform.LOCAL
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
from typing import TYPE_CHECKING, Optional
|
|
2
|
+
|
|
3
|
+
import nbformat
|
|
4
|
+
|
|
5
|
+
from .base import NotebookGenerator, NotebookStage
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from customer_retention.analysis.auto_explorer import ExplorationFindings
|
|
9
|
+
from .cell_builder import CellBuilder
|
|
10
|
+
from .config import NotebookConfig, Platform
|
|
11
|
+
from .stages import (
|
|
12
|
+
BatchInferenceStage,
|
|
13
|
+
CleaningStage,
|
|
14
|
+
DeploymentStage,
|
|
15
|
+
FeatureEngineeringStage,
|
|
16
|
+
FeatureSelectionStage,
|
|
17
|
+
IngestionStage,
|
|
18
|
+
ModelTrainingStage,
|
|
19
|
+
MonitoringStage,
|
|
20
|
+
ProfilingStage,
|
|
21
|
+
TransformationStage,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class DatabricksNotebookGenerator(NotebookGenerator):
    """Notebook generator that targets the Databricks platform."""

    def __init__(self, config: NotebookConfig, findings: Optional["ExplorationFindings"]):
        # Pin the platform before the base initializer sees the config.
        config.platform = Platform.DATABRICKS
        super().__init__(config, findings)
        self.stage_generators = self._build_stage_generators(config, findings)

    def _build_stage_generators(self, config: NotebookConfig, findings) -> dict:
        """Map each pipeline stage to a stage generator bound to this config."""
        stage_classes = [
            (NotebookStage.INGESTION, IngestionStage),
            (NotebookStage.PROFILING, ProfilingStage),
            (NotebookStage.CLEANING, CleaningStage),
            (NotebookStage.TRANSFORMATION, TransformationStage),
            (NotebookStage.FEATURE_ENGINEERING, FeatureEngineeringStage),
            (NotebookStage.FEATURE_SELECTION, FeatureSelectionStage),
            (NotebookStage.MODEL_TRAINING, ModelTrainingStage),
            (NotebookStage.DEPLOYMENT, DeploymentStage),
            (NotebookStage.MONITORING, MonitoringStage),
            (NotebookStage.BATCH_INFERENCE, BatchInferenceStage),
        ]
        return {stage: cls(config, findings) for stage, cls in stage_classes}

    def generate_stage(self, stage: NotebookStage) -> nbformat.NotebookNode:
        """Render the cells for *stage* and wrap them in a v4 notebook."""
        cells = self.stage_generators[stage].generate(Platform.DATABRICKS)
        return CellBuilder.create_notebook(cells)
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
from typing import TYPE_CHECKING, Optional
|
|
2
|
+
|
|
3
|
+
import nbformat
|
|
4
|
+
|
|
5
|
+
from .base import NotebookGenerator, NotebookStage
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from customer_retention.analysis.auto_explorer import ExplorationFindings
|
|
9
|
+
from .cell_builder import CellBuilder
|
|
10
|
+
from .config import NotebookConfig, Platform
|
|
11
|
+
from .stages import (
|
|
12
|
+
BatchInferenceStage,
|
|
13
|
+
CleaningStage,
|
|
14
|
+
DeploymentStage,
|
|
15
|
+
FeatureEngineeringStage,
|
|
16
|
+
FeatureSelectionStage,
|
|
17
|
+
IngestionStage,
|
|
18
|
+
ModelTrainingStage,
|
|
19
|
+
MonitoringStage,
|
|
20
|
+
ProfilingStage,
|
|
21
|
+
TransformationStage,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class LocalNotebookGenerator(NotebookGenerator):
    """Notebook generator that targets local (in-process) execution."""

    def __init__(self, config: NotebookConfig, findings: Optional["ExplorationFindings"]):
        # Pin the platform before delegating to the base initializer.
        config.platform = Platform.LOCAL
        super().__init__(config, findings)
        self.stage_generators = self._build_stage_generators(config, findings)

    def _build_stage_generators(self, config: NotebookConfig, findings) -> dict:
        """Map each pipeline stage to a stage generator bound to this config."""
        pairs = [
            (NotebookStage.INGESTION, IngestionStage),
            (NotebookStage.PROFILING, ProfilingStage),
            (NotebookStage.CLEANING, CleaningStage),
            (NotebookStage.TRANSFORMATION, TransformationStage),
            (NotebookStage.FEATURE_ENGINEERING, FeatureEngineeringStage),
            (NotebookStage.FEATURE_SELECTION, FeatureSelectionStage),
            (NotebookStage.MODEL_TRAINING, ModelTrainingStage),
            (NotebookStage.DEPLOYMENT, DeploymentStage),
            (NotebookStage.MONITORING, MonitoringStage),
            (NotebookStage.BATCH_INFERENCE, BatchInferenceStage),
        ]
        return {stage: stage_cls(config, findings) for stage, stage_cls in pairs}

    def generate_stage(self, stage: NotebookStage) -> nbformat.NotebookNode:
        """Render the cells for *stage* and wrap them in a v4 notebook."""
        cells = self.stage_generators[stage].generate(Platform.LOCAL)
        return CellBuilder.create_notebook(cells)
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
import shutil
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Dict, List, Optional
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class ProjectInitializer:
    """Scaffolds a churnkit project directory: folders, README, .gitignore, etc."""

    project_name: str  # used as the README title
    generate_orchestration: bool = False  # also generate orchestration pipelines
    platforms: Optional[List[str]] = None  # target platforms; None presumably selects a default set — confirm
|
|
12
|
+
|
|
13
|
+
def initialize(self, output_dir: str) -> Dict[str, any]:
|
|
14
|
+
project_path = Path(output_dir)
|
|
15
|
+
project_path.mkdir(parents=True, exist_ok=True)
|
|
16
|
+
self._create_directories(project_path)
|
|
17
|
+
readme_path = self._create_readme(project_path)
|
|
18
|
+
gitignore_path = self._create_gitignore(project_path)
|
|
19
|
+
pyproject_path = self._create_pyproject(project_path)
|
|
20
|
+
exploration_notebooks = self._copy_exploration_notebooks(project_path)
|
|
21
|
+
if self.generate_orchestration:
|
|
22
|
+
self._generate_orchestration(project_path)
|
|
23
|
+
return {
|
|
24
|
+
"readme_path": str(readme_path),
|
|
25
|
+
"gitignore_path": str(gitignore_path),
|
|
26
|
+
"pyproject_path": str(pyproject_path),
|
|
27
|
+
"exploration_notebooks": exploration_notebooks,
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
def _create_directories(self, project_path: Path) -> None:
|
|
31
|
+
directories = [
|
|
32
|
+
"exploration_notebooks",
|
|
33
|
+
"generated_pipelines/local",
|
|
34
|
+
"generated_pipelines/databricks",
|
|
35
|
+
"experiments/findings",
|
|
36
|
+
"experiments/data/bronze",
|
|
37
|
+
"experiments/data/silver",
|
|
38
|
+
"experiments/data/gold",
|
|
39
|
+
"experiments/data/models",
|
|
40
|
+
"experiments/data/predictions",
|
|
41
|
+
"experiments/mlruns",
|
|
42
|
+
"experiments/feature_store",
|
|
43
|
+
]
|
|
44
|
+
for directory in directories:
|
|
45
|
+
(project_path / directory).mkdir(parents=True, exist_ok=True)
|
|
46
|
+
|
|
47
|
+
def _create_readme(self, project_path: Path) -> Path:
|
|
48
|
+
readme_path = project_path / "README.md"
|
|
49
|
+
readme_path.write_text(self._readme_content())
|
|
50
|
+
return readme_path
|
|
51
|
+
|
|
52
|
+
def _readme_content(self) -> str:
|
|
53
|
+
return f"""# {self.project_name}
|
|
54
|
+
|
|
55
|
+
Customer retention analysis project using the churnkit framework.
|
|
56
|
+
|
|
57
|
+
## Structure
|
|
58
|
+
|
|
59
|
+
### Code (version controlled)
|
|
60
|
+
- `exploration_notebooks/` - Interactive exploration notebooks
|
|
61
|
+
- `generated_pipelines/` - Auto-generated pipeline notebooks/scripts
|
|
62
|
+
- `local/` - Local platform notebooks
|
|
63
|
+
- `databricks/` - Databricks platform notebooks
|
|
64
|
+
|
|
65
|
+
### Data (gitignored)
|
|
66
|
+
- `experiments/` - All experiment outputs
|
|
67
|
+
- `findings/` - Exploration findings (YAML files)
|
|
68
|
+
- `data/` - Pipeline outputs (bronze/silver/gold layers)
|
|
69
|
+
- `mlruns/` - MLflow experiment tracking
|
|
70
|
+
- `feature_store/` - Feast feature store
|
|
71
|
+
|
|
72
|
+
## Getting Started
|
|
73
|
+
|
|
74
|
+
1. Place your data in `experiments/data/` or configure a data source
|
|
75
|
+
2. Run exploration notebooks to understand your data
|
|
76
|
+
3. Generate orchestration pipelines for production
|
|
77
|
+
|
|
78
|
+
## Usage
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from customer_retention.generators.notebook_generator import generate_orchestration_notebooks, Platform
|
|
82
|
+
|
|
83
|
+
results = generate_orchestration_notebooks(
|
|
84
|
+
findings_path="experiments/findings/your_data_findings.yaml",
|
|
85
|
+
output_dir="generated_pipelines",
|
|
86
|
+
platforms=[Platform.LOCAL, Platform.DATABRICKS]
|
|
87
|
+
)
|
|
88
|
+
```
|
|
89
|
+
"""
|
|
90
|
+
|
|
91
|
+
def _create_gitignore(self, project_path: Path) -> Path:
|
|
92
|
+
gitignore_path = project_path / ".gitignore"
|
|
93
|
+
gitignore_path.write_text(self._gitignore_content())
|
|
94
|
+
return gitignore_path
|
|
95
|
+
|
|
96
|
+
def _gitignore_content(self) -> str:
|
|
97
|
+
return """.venv/
|
|
98
|
+
__pycache__/
|
|
99
|
+
*.pyc
|
|
100
|
+
.ipynb_checkpoints/
|
|
101
|
+
experiments/
|
|
102
|
+
*.egg-info/
|
|
103
|
+
dist/
|
|
104
|
+
build/
|
|
105
|
+
.pytest_cache/
|
|
106
|
+
.coverage
|
|
107
|
+
"""
|
|
108
|
+
|
|
109
|
+
def _create_pyproject(self, project_path: Path) -> Path:
|
|
110
|
+
pyproject_path = project_path / "pyproject.toml"
|
|
111
|
+
pyproject_path.write_text(self._pyproject_content())
|
|
112
|
+
return pyproject_path
|
|
113
|
+
|
|
114
|
+
def _pyproject_content(self) -> str:
|
|
115
|
+
return f"""[project]
|
|
116
|
+
name = "{self.project_name}"
|
|
117
|
+
version = "0.1.0"
|
|
118
|
+
description = "Customer retention analysis using churnkit framework"
|
|
119
|
+
requires-python = ">=3.9"
|
|
120
|
+
|
|
121
|
+
dependencies = [
|
|
122
|
+
"churnkit",
|
|
123
|
+
"pandas>=2.0",
|
|
124
|
+
"jupyter>=1.0",
|
|
125
|
+
]
|
|
126
|
+
|
|
127
|
+
[project.optional-dependencies]
|
|
128
|
+
dev = [
|
|
129
|
+
"pytest>=7.0",
|
|
130
|
+
"ruff>=0.1",
|
|
131
|
+
]
|
|
132
|
+
"""
|
|
133
|
+
|
|
134
|
+
def _copy_exploration_notebooks(self, project_path: Path) -> List[str]:
|
|
135
|
+
source_dir = self._get_exploration_source_dir()
|
|
136
|
+
dest_dir = project_path / "exploration_notebooks"
|
|
137
|
+
copied = []
|
|
138
|
+
if source_dir and source_dir.exists():
|
|
139
|
+
for notebook in source_dir.glob("*.ipynb"):
|
|
140
|
+
dest_path = dest_dir / notebook.name
|
|
141
|
+
shutil.copy2(notebook, dest_path)
|
|
142
|
+
copied.append(str(dest_path))
|
|
143
|
+
return copied
|
|
144
|
+
|
|
145
|
+
def _get_exploration_source_dir(self) -> Optional[Path]:
|
|
146
|
+
possible_paths = [
|
|
147
|
+
Path(__file__).parent.parent.parent.parent / "exploration_notebooks",
|
|
148
|
+
Path("exploration_notebooks"),
|
|
149
|
+
]
|
|
150
|
+
for path in possible_paths:
|
|
151
|
+
if path.exists():
|
|
152
|
+
return path
|
|
153
|
+
return None
|
|
154
|
+
|
|
155
|
+
def _generate_orchestration(self, project_path: Path) -> None:
|
|
156
|
+
from . import Platform, generate_orchestration_notebooks
|
|
157
|
+
platforms = [Platform(p) for p in (self.platforms or ["local", "databricks"])]
|
|
158
|
+
output_dir = project_path / "generated_pipelines"
|
|
159
|
+
generate_orchestration_notebooks(
|
|
160
|
+
output_dir=str(output_dir),
|
|
161
|
+
platforms=platforms,
|
|
162
|
+
)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def initialize_project(
    output_dir: str,
    project_name: str,
    generate_orchestration: bool = False,
    platforms: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """Scaffold a new churn-analysis project (convenience wrapper).

    Args:
        output_dir: Directory in which to create the project (created if missing).
        project_name: Name used in the generated README and pyproject.toml.
        generate_orchestration: When True, also generate orchestration pipelines.
        platforms: Platform names to generate orchestration for; None uses the
            initializer's defaults. (New, backward-compatible parameter —
            previously ProjectInitializer.platforms could not be set here.)

    Returns:
        The dict produced by ProjectInitializer.initialize: generated file
        paths plus the list of copied exploration notebooks.
    """
    initializer = ProjectInitializer(
        project_name=project_name,
        generate_orchestration=generate_orchestration,
        platforms=platforms,
    )
    return initializer.initialize(output_dir)
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
import time
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import List, Optional
|
|
6
|
+
|
|
7
|
+
import nbformat
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class NotebookValidationResult:
    """Outcome of validating a single notebook (or script) file."""

    # Stem of the validated file (filename without extension).
    notebook_name: str
    # True when syntax validation succeeded.
    success: bool
    # Wall-clock time spent validating this file.
    duration_seconds: float
    # Human-readable failure reason; None on success.
    error: Optional[str] = None
    # Reserved for per-cell error details; not populated by the runners
    # in this module.
    cell_errors: List[str] = field(default_factory=list)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class ValidationReport:
    """Aggregated validation results for one platform's notebooks/scripts."""

    # Individual per-file validation outcomes, in validation order.
    results: List[NotebookValidationResult]
    # Platform the validated files target (e.g. "local", "databricks").
    platform: str
    # When the report was created (local time).
    timestamp: datetime = field(default_factory=datetime.now)

    @property
    def all_passed(self) -> bool:
        """True when every file validated successfully (vacuously true if empty)."""
        return all(r.success for r in self.results)

    @property
    def total_notebooks(self) -> int:
        """Number of files covered by this report."""
        return len(self.results)

    @property
    def passed_count(self) -> int:
        """Number of files that validated successfully."""
        return sum(1 for r in self.results if r.success)

    @property
    def failed_count(self) -> int:
        """Number of files that failed validation."""
        return sum(1 for r in self.results if not r.success)

    @property
    def total_duration_seconds(self) -> float:
        """Sum of per-file validation durations."""
        return sum(r.duration_seconds for r in self.results)

    def to_markdown(self) -> str:
        """Render the report as a Markdown document with a results table.

        Error messages are truncated to 50 characters, and characters that
        would corrupt the Markdown table (pipes, newlines) are escaped —
        previously a ``|`` or newline in an error broke the table layout.
        """
        lines = [
            f"# Notebook Validation Report - {self.platform.upper()}",
            f"**Timestamp:** {self.timestamp.strftime('%Y-%m-%d %H:%M:%S')}",
            f"**Total Duration:** {self.total_duration_seconds:.2f}s",
            "",
            "## Summary",
            f"- **Total Notebooks:** {self.total_notebooks}",
            f"- **Passed:** {self.passed_count}",
            f"- **Failed:** {self.failed_count}",
            f"- **Status:** {'PASSED' if self.all_passed else 'FAILED'}",
            "",
            "## Results",
            "| Notebook | Status | Duration | Error |",
            "|----------|--------|----------|-------|",
        ]
        for r in self.results:
            status = "PASS" if r.success else "FAIL"
            error = r.error[:50] + "..." if r.error and len(r.error) > 50 else (r.error or "-")
            # Keep the table cell well-formed regardless of error content.
            error = error.replace("|", "\\|").replace("\n", " ")
            lines.append(f"| {r.notebook_name} | {status} | {r.duration_seconds:.2f}s | {error} |")
        return "\n".join(lines)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class NotebookRunner:
    """Validates generated notebooks by syntax-checking their code cells."""

    def __init__(self, dry_run: bool = False, stop_on_failure: bool = False):
        # dry_run is kept for interface parity; syntax checks never execute code.
        self.dry_run = dry_run
        self.stop_on_failure = stop_on_failure

    def validate_syntax(self, code: str) -> bool:
        """Return True when ``code`` compiles as Python."""
        try:
            compile(code, "<notebook>", "exec")
        except SyntaxError:
            return False
        return True

    def extract_code(self, notebook_path: str) -> str:
        """Concatenate the sources of all code cells in the notebook."""
        with open(notebook_path, "r", encoding="utf-8") as handle:
            parsed = nbformat.read(handle, as_version=4)
        code_sources = (cell.source for cell in parsed.cells if cell.cell_type == "code")
        return "\n".join(code_sources)

    def validate_notebook(self, notebook_path: str) -> NotebookValidationResult:
        """Syntax-check a single notebook, timing the validation."""
        name = Path(notebook_path).stem
        started = time.time()
        try:
            combined = self.extract_code(notebook_path)
            passed = self.validate_syntax(combined)
        except Exception as exc:  # unreadable file, malformed notebook, ...
            return NotebookValidationResult(name, False, time.time() - started, error=str(exc))
        elapsed = time.time() - started
        if passed:
            return NotebookValidationResult(name, True, elapsed)
        return NotebookValidationResult(name, False, elapsed, error="Syntax validation failed")

    def validate_sequence(self, notebooks_dir: str, platform: str) -> ValidationReport:
        """Validate every ``*.ipynb`` under ``notebooks_dir`` in sorted order.

        Stops early after the first failure when ``stop_on_failure`` is set.
        """
        outcomes = []
        for path in sorted(Path(notebooks_dir).glob("*.ipynb")):
            outcome = self.validate_notebook(str(path))
            outcomes.append(outcome)
            if not outcome.success and self.stop_on_failure:
                break
        return ValidationReport(results=outcomes, platform=platform)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def validate_generated_notebooks(output_dir: str, platforms: Optional[List[str]] = None) -> dict:
    """Syntax-validate generated notebooks for each platform subdirectory.

    Args:
        output_dir: Root directory containing one subdirectory per platform.
        platforms: Platform names to check; defaults to local and databricks.

    Returns:
        Mapping of platform name -> ValidationReport. Platforms whose
        subdirectory does not exist are skipped.
    """
    selected = ["local", "databricks"] if platforms is None else platforms
    runner = NotebookRunner(dry_run=True)
    base = Path(output_dir)
    return {
        name: runner.validate_sequence(str(base / name), name)
        for name in selected
        if (base / name).exists()
    }
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
class ScriptRunner:
    """Validates generated Python scripts by compiling their source."""

    def __init__(self, dry_run: bool = False, stop_on_failure: bool = False):
        # dry_run mirrors NotebookRunner's interface; syntax checks never run code.
        self.dry_run = dry_run
        self.stop_on_failure = stop_on_failure

    def validate_syntax(self, code: str) -> bool:
        """Return True when ``code`` compiles as Python."""
        try:
            compile(code, "<script>", "exec")
        except SyntaxError:
            return False
        return True

    def validate_script(self, script_path: str) -> NotebookValidationResult:
        """Compile a single script file, timing the validation."""
        name = Path(script_path).stem
        started = time.time()
        try:
            source = Path(script_path).read_text(encoding="utf-8")
            compiled_ok = self.validate_syntax(source)
        except Exception as exc:  # unreadable file, bad encoding, ...
            return NotebookValidationResult(name, False, time.time() - started, error=str(exc))
        elapsed = time.time() - started
        if compiled_ok:
            return NotebookValidationResult(name, True, elapsed)
        return NotebookValidationResult(name, False, elapsed, error="Syntax validation failed")

    def validate_sequence(self, scripts_dir: str, platform: str) -> ValidationReport:
        """Validate every ``*.py`` under ``scripts_dir`` in sorted order.

        Stops early after the first failure when ``stop_on_failure`` is set.
        """
        outcomes = []
        for path in sorted(Path(scripts_dir).glob("*.py")):
            outcome = self.validate_script(str(path))
            outcomes.append(outcome)
            if not outcome.success and self.stop_on_failure:
                break
        return ValidationReport(results=outcomes, platform=platform)
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import TYPE_CHECKING, Dict, List, Optional
|
|
4
|
+
|
|
5
|
+
from .base import NotebookStage
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from customer_retention.analysis.auto_explorer import ExplorationFindings
|
|
9
|
+
|
|
10
|
+
from .stages.base_stage import StageGenerator
|
|
11
|
+
from .config import NotebookConfig, Platform
|
|
12
|
+
from .stages import (
|
|
13
|
+
BatchInferenceStage,
|
|
14
|
+
CleaningStage,
|
|
15
|
+
DeploymentStage,
|
|
16
|
+
FeatureEngineeringStage,
|
|
17
|
+
FeatureSelectionStage,
|
|
18
|
+
IngestionStage,
|
|
19
|
+
ModelTrainingStage,
|
|
20
|
+
MonitoringStage,
|
|
21
|
+
ProfilingStage,
|
|
22
|
+
TransformationStage,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class ScriptGenerator(ABC):
    """Base for platform-specific generators that emit pipeline stage scripts."""

    def __init__(self, config: NotebookConfig, findings: Optional["ExplorationFindings"]):
        self.config = config
        self.findings = findings
        # Built once by the subclass; maps each stage to its generator.
        self.stage_generators = self._create_stage_generators()

    @abstractmethod
    def _create_stage_generators(self) -> Dict[NotebookStage, "StageGenerator"]:
        """Map each pipeline stage to a configured stage generator."""

    @property
    @abstractmethod
    def platform(self) -> Platform:
        """Target platform for the generated scripts."""

    def generate_stage_code(self, stage: NotebookStage) -> str:
        """Render the script text for a single pipeline stage."""
        stage_gen = self.stage_generators[stage]
        rendered_cells = stage_gen.generate(self.platform)
        return self._cells_to_script(rendered_cells, stage_gen.title, stage_gen.description)

    def _cells_to_script(self, cells: list, title: str, description: str) -> str:
        """Flatten notebook-style cells into a runnable ``.py`` script body.

        Only code cells are kept; the script gets a module docstring from
        ``title``/``description`` and a no-op ``__main__`` guard at the end.
        """
        parts = [f'"""{title}', "", description, '"""', ""]
        for cell in cells:
            if cell.cell_type != "code":
                continue
            parts.extend((cell.source, ""))
        parts.extend(("", 'if __name__ == "__main__":', "    pass"))
        return "\n".join(parts)

    def generate_all(self) -> Dict[NotebookStage, str]:
        """Render scripts for every configured stage."""
        return {stage: self.generate_stage_code(stage) for stage in self.stage_generators}

    def save_all(self, output_dir: str) -> List[str]:
        """Write each stage's script to ``output_dir`` and return the paths."""
        target = Path(output_dir)
        target.mkdir(parents=True, exist_ok=True)
        written: List[str] = []
        for stage, script in self.generate_all().items():
            destination = target / f"{stage.value}.py"
            destination.write_text(script, encoding="utf-8")
            written.append(str(destination))
        return written
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class LocalScriptGenerator(ScriptGenerator):
    """Script generator targeting local (non-Databricks) execution."""

    @property
    def platform(self) -> Platform:
        return Platform.LOCAL

    def _create_stage_generators(self) -> Dict[NotebookStage, "StageGenerator"]:
        # Side effect preserved from the original: pin the config's platform.
        self.config.platform = Platform.LOCAL
        stage_classes = {
            NotebookStage.INGESTION: IngestionStage,
            NotebookStage.PROFILING: ProfilingStage,
            NotebookStage.CLEANING: CleaningStage,
            NotebookStage.TRANSFORMATION: TransformationStage,
            NotebookStage.FEATURE_ENGINEERING: FeatureEngineeringStage,
            NotebookStage.FEATURE_SELECTION: FeatureSelectionStage,
            NotebookStage.MODEL_TRAINING: ModelTrainingStage,
            NotebookStage.DEPLOYMENT: DeploymentStage,
            NotebookStage.MONITORING: MonitoringStage,
            NotebookStage.BATCH_INFERENCE: BatchInferenceStage,
        }
        return {stage: cls(self.config, self.findings) for stage, cls in stage_classes.items()}
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class DatabricksScriptGenerator(ScriptGenerator):
    """Script generator targeting Databricks execution."""

    @property
    def platform(self) -> Platform:
        return Platform.DATABRICKS

    def _create_stage_generators(self) -> Dict[NotebookStage, "StageGenerator"]:
        # Side effect preserved from the original: pin the config's platform.
        self.config.platform = Platform.DATABRICKS
        stage_classes = {
            NotebookStage.INGESTION: IngestionStage,
            NotebookStage.PROFILING: ProfilingStage,
            NotebookStage.CLEANING: CleaningStage,
            NotebookStage.TRANSFORMATION: TransformationStage,
            NotebookStage.FEATURE_ENGINEERING: FeatureEngineeringStage,
            NotebookStage.FEATURE_SELECTION: FeatureSelectionStage,
            NotebookStage.MODEL_TRAINING: ModelTrainingStage,
            NotebookStage.DEPLOYMENT: DeploymentStage,
            NotebookStage.MONITORING: MonitoringStage,
            NotebookStage.BATCH_INFERENCE: BatchInferenceStage,
        }
        return {stage: cls(self.config, self.findings) for stage, cls in stage_classes.items()}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from .base_stage import StageGenerator
|
|
2
|
+
from .s01_ingestion import IngestionStage
|
|
3
|
+
from .s02_profiling import ProfilingStage
|
|
4
|
+
from .s03_cleaning import CleaningStage
|
|
5
|
+
from .s04_transformation import TransformationStage
|
|
6
|
+
from .s05_feature_engineering import FeatureEngineeringStage
|
|
7
|
+
from .s06_feature_selection import FeatureSelectionStage
|
|
8
|
+
from .s07_model_training import ModelTrainingStage
|
|
9
|
+
from .s08_deployment import DeploymentStage
|
|
10
|
+
from .s09_monitoring import MonitoringStage
|
|
11
|
+
from .s10_batch_inference import BatchInferenceStage
|
|
12
|
+
from .s11_feature_store import FeatureStoreStage
|
|
13
|
+
|
|
14
|
+
# Public API of the stages package: the abstract base plus one generator
# class per pipeline stage.
__all__ = [
    "StageGenerator",
    "IngestionStage", "ProfilingStage", "CleaningStage", "TransformationStage",
    "FeatureEngineeringStage", "FeatureSelectionStage", "ModelTrainingStage",
    "DeploymentStage", "MonitoringStage", "BatchInferenceStage", "FeatureStoreStage",
]
|