churnkit 0.75.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
from customer_retention.analysis.auto_explorer.findings import ExplorationFindings
|
|
2
|
+
from customer_retention.core.config.column_config import ColumnType
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class LLMContextBuilder:
    """Render exploration findings into Markdown context strings for LLM prompts.

    Each ``build_*`` method returns a self-contained Markdown document built
    from an :class:`ExplorationFindings` object; ``build_full_context`` stitches
    the individual sections together with ``---`` separators.
    """

    def __init__(self,
                 include_databricks: bool = False,
                 include_framework_docs: bool = True,
                 max_sample_values: int = 10) -> None:
        """Configure which optional sections ``build_full_context`` emits.

        Args:
            include_databricks: When True, append the Databricks integration
                section in ``build_full_context``.
            include_framework_docs: When True, append the framework
                documentation section in ``build_full_context``.
            max_sample_values: Stored for callers; NOTE(review): not read
                anywhere in this class — presumably consumed elsewhere, confirm.
        """
        self.include_databricks = include_databricks
        self.include_framework_docs = include_framework_docs
        self.max_sample_values = max_sample_values

    def build_exploration_context(self, findings: ExplorationFindings) -> str:
        """Return a Markdown summary of the dataset exploration.

        Sections: dataset overview, optional target info, a per-column summary
        table, detailed per-column stats, and (when present) critical issues
        and warnings.
        """
        lines = [
            "# Data Exploration Context",
            "",
            "## Dataset Overview",
            f"- **Source:** {findings.source_path}",
            f"- **Format:** {findings.source_format}",
            f"- **Rows:** {findings.row_count:,}",
            f"- **Columns:** {findings.column_count}",
            f"- **Overall Quality Score:** {findings.overall_quality_score:.1f}/100",
            ""
        ]
        # Target section only when a target column was identified.
        if findings.target_column:
            lines.extend([
                "## Target Information",
                f"- **Target Column:** {findings.target_column}",
                f"- **Target Type:** {findings.target_type}",
                ""
            ])
        lines.extend([
            "## Column Details",
            "",
            "| Column | Type | Confidence | Nulls | Notes |",
            "|--------|------|------------|-------|-------|"
        ])
        for name, col in findings.columns.items():
            null_pct = col.universal_metrics.get("null_percentage", 0)
            # At most the first two evidence strings, truncated to 50 chars,
            # so the table row stays readable.
            notes = "; ".join(col.evidence[:2]) if col.evidence else ""
            lines.append(
                f"| {name} | {col.inferred_type.value} | {col.confidence:.0%} | {null_pct:.1f}% | {notes[:50]} |"
            )
        lines.append("")
        lines.extend(self._build_column_details(findings))
        if findings.critical_issues:
            lines.extend([
                "## Critical Issues",
                ""
            ])
            for issue in findings.critical_issues:
                lines.append(f"- {issue}")
            lines.append("")
        if findings.warnings:
            lines.extend([
                "## Warnings",
                ""
            ])
            for warning in findings.warnings:
                lines.append(f"- {warning}")
            lines.append("")
        return "\n".join(lines)

    def _build_column_details(self, findings: ExplorationFindings) -> list[str]:
        """Return Markdown lines with one detailed subsection per column.

        Emits type/confidence for every column, plus whichever universal and
        type-specific metrics are present (nulls, distinct count, mean, std,
        value range, top categories).
        """
        lines = ["## Detailed Column Information", ""]
        for name, col in findings.columns.items():
            lines.append(f"### {name}")
            lines.append(f"- **Type:** {col.inferred_type.value}")
            lines.append(f"- **Confidence:** {col.confidence:.0%}")
            if col.universal_metrics:
                metrics = col.universal_metrics
                lines.append(f"- **Null Count:** {metrics.get('null_count', 0)}")
                lines.append(f"- **Distinct Count:** {metrics.get('distinct_count', 'N/A')}")
            if col.type_metrics:
                metrics = col.type_metrics
                # Each metric is optional; only emit the lines whose keys exist.
                if "mean" in metrics:
                    lines.append(f"- **Mean:** {metrics['mean']:.2f}")
                if "std" in metrics:
                    lines.append(f"- **Std:** {metrics['std']:.2f}")
                if "min_value" in metrics:
                    lines.append(f"- **Range:** {metrics['min_value']} to {metrics.get('max_value', 'N/A')}")
                if "top_categories" in metrics:
                    top = metrics["top_categories"][:3]
                    lines.append(f"- **Top Categories:** {top}")
            lines.append("")
        return lines

    def build_configuration_context(self, findings: ExplorationFindings, user_goal: str) -> str:
        """Return Markdown context for pipeline configuration.

        Embeds the user's goal, the full exploration context, and a
        per-column transformation suggestion keyed off the inferred type.
        """
        lines = [
            "# Pipeline Configuration Context",
            "",
            "## User Goal",
            f"{user_goal}",
            "",
            self.build_exploration_context(findings),
            "",
            "## Recommendations Summary",
            ""
        ]
        lines.append("### Suggested Transformations")
        # Simple type-based heuristics; columns of other types get no line.
        for name, col in findings.columns.items():
            if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]:
                lines.append(f"- **{name}:** Apply standard scaling")
            elif col.inferred_type in [ColumnType.CATEGORICAL_NOMINAL, ColumnType.CATEGORICAL_ORDINAL]:
                lines.append(f"- **{name}:** Apply encoding (one-hot or target)")
            elif col.inferred_type == ColumnType.DATETIME:
                lines.append(f"- **{name}:** Extract temporal features")
        lines.append("")
        return "\n".join(lines)

    def build_databricks_context(self, findings: ExplorationFindings) -> str:
        """Return static Markdown describing Databricks features.

        Only the row count (and a partitioning hint above one million rows)
        varies with ``findings``; everything else is fixed text.
        """
        lines = [
            "# Databricks Integration Context",
            "",
            "## Available Databricks Features",
            "",
            "### Delta Lake",
            "- ACID transactions for data reliability",
            "- Schema enforcement and evolution",
            "- Time travel for data versioning",
            "",
            "### Delta Live Tables (DLT)",
            "- Declarative pipeline definitions",
            "- Automatic dependency management",
            "- Built-in expectations for quality",
            "",
            "### Unity Catalog",
            "- Centralized data governance",
            "- Fine-grained access control",
            "- Data lineage tracking",
            "",
            "### Feature Store",
            "- Centralized feature repository",
            "- Point-in-time feature lookups",
            "- Online/offline feature serving",
            "",
            "### Spark Considerations",
            f"- Dataset has {findings.row_count:,} rows",
        ]
        if findings.row_count > 1_000_000:
            lines.append("- Consider partitioning for large dataset")
        lines.extend([
            "- Use DataFrame API for transformations",
            "- Leverage Spark ML for scalable modeling",
            ""
        ])
        return "\n".join(lines)

    def build_framework_docs_context(self) -> str:
        """Return Markdown documenting the framework's column types and modules.

        The ColumnType list is generated from the enum at call time, so it
        stays in sync with the framework; the module descriptions are static.
        """
        lines = [
            "# Customer Retention Framework Documentation",
            "",
            "## ColumnType Reference",
            "",
            "Available column types in the framework:",
            ""
        ]
        for col_type in ColumnType:
            lines.append(f"- **{col_type.name}:** {col_type.value}")
        lines.extend([
            "",
            "## Key Modules",
            "",
            "### Profiling",
            "- TypeDetector: Automatic type inference",
            "- ColumnProfiler: Statistical profiling per type",
            "- QualityChecks: Data quality validation",
            "",
            "### Transformation",
            "- NumericTransformer: Scaling, log transforms, binning",
            "- CategoricalEncoder: One-hot, target, ordinal encoding",
            "- DatetimeTransformer: Temporal feature extraction",
            "",
            "### Modeling",
            "- BaselineTrainer: Quick baseline models",
            "- CrossValidator: Robust cross-validation",
            "- HyperparameterTuner: Automated tuning",
            "",
            "### Validation",
            "- DataQualityGate: Data quality checks",
            "- LeakageGate: Feature leakage detection",
            "- ModelValidityGate: Model performance validation",
            ""
        ])
        return "\n".join(lines)

    def build_full_context(self, findings: ExplorationFindings, user_goal: str = "") -> str:
        """Return the combined context document.

        Always includes the exploration section; the user-goal, framework-docs
        and Databricks sections are appended conditionally (the latter two
        gated by the constructor flags). Sections are joined with blank lines
        and separated by ``---`` horizontal rules.
        """
        sections = [
            self.build_exploration_context(findings),
            "---",
        ]
        if user_goal:
            sections.append(f"## User Goal\n{user_goal}\n")
            sections.append("---")
        if self.include_framework_docs:
            sections.append(self.build_framework_docs_context())
            sections.append("---")
        if self.include_databricks:
            sections.append(self.build_databricks_context(findings))
        return "\n\n".join(sections)
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
class PromptTemplates:
|
|
2
|
+
INFER_COLUMN_TYPES = """Given the following column information, infer the semantic type for each column.
|
|
3
|
+
|
|
4
|
+
Available column types:
|
|
5
|
+
- IDENTIFIER: Unique keys, IDs, codes
|
|
6
|
+
- TARGET: The prediction target (binary or multiclass)
|
|
7
|
+
- BINARY: Two-value columns (yes/no, true/false, 0/1)
|
|
8
|
+
- NUMERIC_CONTINUOUS: Continuous numeric values (amounts, measurements)
|
|
9
|
+
- NUMERIC_DISCRETE: Discrete numeric values (counts, ratings)
|
|
10
|
+
- CATEGORICAL_NOMINAL: Categories without order (colors, types)
|
|
11
|
+
- CATEGORICAL_ORDINAL: Categories with order (ratings, levels)
|
|
12
|
+
- CATEGORICAL_CYCLICAL: Cyclical categories (days, months)
|
|
13
|
+
- DATETIME: Date/time values
|
|
14
|
+
- TEXT: Free-form text
|
|
15
|
+
|
|
16
|
+
For each column, provide:
|
|
17
|
+
1. Inferred type
|
|
18
|
+
2. Confidence (0-100%)
|
|
19
|
+
3. Evidence supporting your inference
|
|
20
|
+
|
|
21
|
+
{context}
|
|
22
|
+
|
|
23
|
+
Please analyze each column and provide your type inference."""
|
|
24
|
+
|
|
25
|
+
SUGGEST_TARGET_COLUMN = """Based on the data exploration findings below, suggest the most appropriate target column for a machine learning model.
|
|
26
|
+
|
|
27
|
+
Consider:
|
|
28
|
+
- Column names that suggest outcomes (churn, target, label, outcome, class)
|
|
29
|
+
- Binary or low-cardinality categorical columns
|
|
30
|
+
- Columns that seem to represent what we want to predict
|
|
31
|
+
|
|
32
|
+
{context}
|
|
33
|
+
|
|
34
|
+
Provide:
|
|
35
|
+
1. Recommended target column
|
|
36
|
+
2. Confidence level
|
|
37
|
+
3. Rationale for your choice
|
|
38
|
+
4. Alternative candidates (if any)"""
|
|
39
|
+
|
|
40
|
+
RECOMMEND_FEATURES = """Based on the data exploration findings, recommend feature engineering opportunities.
|
|
41
|
+
|
|
42
|
+
Consider:
|
|
43
|
+
- Datetime columns: temporal features (year, month, day, day of week, days since)
|
|
44
|
+
- Numeric columns: binning, scaling, log transforms for skewed data
|
|
45
|
+
- Categorical columns: encoding strategies, interaction features
|
|
46
|
+
- Cross-column features: ratios, differences, combinations
|
|
47
|
+
|
|
48
|
+
{context}
|
|
49
|
+
|
|
50
|
+
For each recommendation, provide:
|
|
51
|
+
1. Source column(s)
|
|
52
|
+
2. Proposed feature name
|
|
53
|
+
3. Feature type and computation
|
|
54
|
+
4. Priority (high/medium/low)
|
|
55
|
+
5. Implementation hint"""
|
|
56
|
+
|
|
57
|
+
GENERATE_PIPELINE_CONFIG = """Generate a production pipeline configuration based on the exploration findings.
|
|
58
|
+
|
|
59
|
+
The configuration should include:
|
|
60
|
+
- Data source specifications
|
|
61
|
+
- Schema definitions
|
|
62
|
+
- Bronze layer transforms (raw data ingestion)
|
|
63
|
+
- Silver layer transforms (cleaning and standardization)
|
|
64
|
+
- Gold layer transforms (feature engineering)
|
|
65
|
+
- Model configuration
|
|
66
|
+
- Quality gates
|
|
67
|
+
|
|
68
|
+
{context}
|
|
69
|
+
|
|
70
|
+
User Goal: {user_goal}
|
|
71
|
+
|
|
72
|
+
Generate a complete pipeline specification in YAML format."""
|
|
73
|
+
|
|
74
|
+
EXPLAIN_QUALITY_ISSUES = """Explain the data quality issues found in the exploration and provide remediation recommendations.
|
|
75
|
+
|
|
76
|
+
For each issue:
|
|
77
|
+
1. Describe the problem in business terms
|
|
78
|
+
2. Explain the potential impact on model performance
|
|
79
|
+
3. Recommend specific remediation steps
|
|
80
|
+
4. Prioritize by severity
|
|
81
|
+
|
|
82
|
+
{context}
|
|
83
|
+
|
|
84
|
+
Provide a clear, actionable quality improvement plan."""
|
|
85
|
+
|
|
86
|
+
GENERATE_DLT_CODE = """Generate Databricks Delta Live Tables (DLT) code based on the pipeline specification.
|
|
87
|
+
|
|
88
|
+
Requirements:
|
|
89
|
+
- Use @dlt.table decorators
|
|
90
|
+
- Include expectations for quality checks
|
|
91
|
+
- Follow medallion architecture (bronze/silver/gold)
|
|
92
|
+
- Include proper schema definitions
|
|
93
|
+
|
|
94
|
+
{context}
|
|
95
|
+
|
|
96
|
+
Generate production-ready DLT Python code."""
|
|
97
|
+
|
|
98
|
+
@classmethod
def format_prompt(cls, template: str, **kwargs) -> str:
    """Fill a prompt template's ``{placeholder}`` fields from keyword args.

    Raises KeyError if the template references a placeholder that was not
    supplied, exactly as ``str.format`` would.
    """
    return template.format_map(kwargs)
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
from .batch_integration import (
|
|
2
|
+
BatchStreamingBridge,
|
|
3
|
+
ProcessingConfig,
|
|
4
|
+
ProcessingMetrics,
|
|
5
|
+
ProcessingResult,
|
|
6
|
+
ScoreCombinationStrategy,
|
|
7
|
+
ScoreResult,
|
|
8
|
+
StreamProcessor,
|
|
9
|
+
)
|
|
10
|
+
from .early_warning_model import (
|
|
11
|
+
EarlyWarningConfig,
|
|
12
|
+
EarlyWarningModel,
|
|
13
|
+
SignalDetector,
|
|
14
|
+
SignalType,
|
|
15
|
+
WarningLevel,
|
|
16
|
+
WarningResult,
|
|
17
|
+
)
|
|
18
|
+
from .event_schema import (
|
|
19
|
+
BatchValidationResult,
|
|
20
|
+
Event,
|
|
21
|
+
EventSchema,
|
|
22
|
+
EventSource,
|
|
23
|
+
EventType,
|
|
24
|
+
EventValidator,
|
|
25
|
+
SchemaRegistry,
|
|
26
|
+
ValidationResult,
|
|
27
|
+
)
|
|
28
|
+
from .online_store_writer import (
|
|
29
|
+
BatchSyncResult,
|
|
30
|
+
FeatureLookup,
|
|
31
|
+
FeatureRecord,
|
|
32
|
+
FeatureStoreConfig,
|
|
33
|
+
FeatureStoreMetrics,
|
|
34
|
+
FeatureStoreSchema,
|
|
35
|
+
FeatureWriteResult,
|
|
36
|
+
FreshnessMetrics,
|
|
37
|
+
OnlineFeatureStore,
|
|
38
|
+
TTLConfig,
|
|
39
|
+
)
|
|
40
|
+
from .realtime_scorer import (
|
|
41
|
+
AutoScaler,
|
|
42
|
+
EndpointHealth,
|
|
43
|
+
RealtimeScorer,
|
|
44
|
+
RiskFactor,
|
|
45
|
+
ScalingDecision,
|
|
46
|
+
ScalingMetrics,
|
|
47
|
+
ScorerMetrics,
|
|
48
|
+
ScoringConfig,
|
|
49
|
+
ScoringRequest,
|
|
50
|
+
ScoringResponse,
|
|
51
|
+
SLAMetrics,
|
|
52
|
+
)
|
|
53
|
+
from .trigger_engine import (
|
|
54
|
+
ActionType,
|
|
55
|
+
AnomalyTrigger,
|
|
56
|
+
CompositeTrigger,
|
|
57
|
+
PatternTrigger,
|
|
58
|
+
StreamTriggerType,
|
|
59
|
+
ThresholdTrigger,
|
|
60
|
+
TriggerConfig,
|
|
61
|
+
TriggerContext,
|
|
62
|
+
TriggerDefinition,
|
|
63
|
+
TriggerEngine,
|
|
64
|
+
TriggerResult,
|
|
65
|
+
)
|
|
66
|
+
from .window_aggregator import (
|
|
67
|
+
AggregationResult,
|
|
68
|
+
FeatureComputer,
|
|
69
|
+
FeatureComputeResult,
|
|
70
|
+
SessionMetrics,
|
|
71
|
+
SessionWindow,
|
|
72
|
+
SlidingWindow,
|
|
73
|
+
StreamingFeature,
|
|
74
|
+
StreamState,
|
|
75
|
+
TumblingWindow,
|
|
76
|
+
WatermarkConfig,
|
|
77
|
+
Window,
|
|
78
|
+
WindowAggregator,
|
|
79
|
+
WindowType,
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
# Explicit public surface of the streaming subpackage.  Every name below is
# re-exported from one of the sibling modules imported above; entries are
# grouped by their source module.
__all__ = [
    # event_schema
    "Event", "EventType", "EventSource", "EventSchema",
    "EventValidator", "ValidationResult", "BatchValidationResult",
    "SchemaRegistry",
    # window_aggregator
    "WindowType", "Window", "TumblingWindow", "SlidingWindow", "SessionWindow",
    "WatermarkConfig", "AggregationResult", "SessionMetrics",
    "WindowAggregator", "StreamState", "StreamingFeature",
    "FeatureComputer", "FeatureComputeResult",
    # online_store_writer
    "FeatureStoreConfig", "TTLConfig", "FeatureRecord", "FeatureWriteResult",
    "BatchSyncResult", "FeatureStoreMetrics", "FreshnessMetrics",
    "FeatureStoreSchema", "OnlineFeatureStore", "FeatureLookup",
    # early_warning_model
    "WarningLevel", "SignalType", "EarlyWarningConfig", "WarningResult",
    "SignalDetector", "EarlyWarningModel",
    # trigger_engine
    "StreamTriggerType", "ActionType", "TriggerConfig", "TriggerContext",
    "TriggerResult", "TriggerDefinition", "ThresholdTrigger",
    "PatternTrigger", "AnomalyTrigger", "CompositeTrigger", "TriggerEngine",
    # realtime_scorer
    "ScoringConfig", "ScoringRequest", "ScoringResponse", "RiskFactor",
    "EndpointHealth", "ScalingMetrics", "ScalingDecision", "SLAMetrics",
    "ScorerMetrics", "AutoScaler", "RealtimeScorer",
    # batch_integration
    "ScoreCombinationStrategy", "ScoreResult", "BatchStreamingBridge",
    "ProcessingConfig", "ProcessingResult", "ProcessingMetrics", "StreamProcessor"
]
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from enum import Enum
|
|
4
|
+
from typing import Dict, List, Optional
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class ScoreCombinationStrategy(Enum):
    """How to reconcile a batch-computed score with a streaming-computed one."""
    BATCH_ONLY = "batch_only"                   # always trust the batch score
    STREAMING_OVERRIDE = "streaming_override"   # prefer streaming while fresh
    ENSEMBLE = "ensemble"                       # weighted average of both
    MAXIMUM = "maximum"                         # most pessimistic score wins
    SIGNAL_BOOST = "signal_boost"               # batch score nudged up by streaming


@dataclass
class ScoreResult:
    """A score paired with the provenance tier it was obtained from."""
    score: float                            # the selected score value
    source: str                             # "realtime" | "streaming" | "batch" | "cached" | "default"
    timestamp: Optional[datetime] = None    # when the score was produced, if known


class BatchStreamingBridge:
    """Reconciles batch and streaming scores/features for the same customer."""

    def __init__(self):
        # Maps a batch feature name to the streaming feature(s) that can stand
        # in for it; a tuple means the batch value is derived from several
        # streaming counters.  NOTE(review): currently only consulted by
        # callers, not used internally — confirm before removing.
        self._feature_mapping = {
            "days_since_last_order": "minutes_since_last_order",
            "email_engagement_score": ("email_opens_7d", "emails_sent_7d"),
            "order_frequency": "orders_7d"
        }

    def combine_scores(self, batch_score: Optional[float], streaming_score: Optional[float],
                       strategy: ScoreCombinationStrategy = ScoreCombinationStrategy.MAXIMUM,
                       weights: Optional[Dict[str, float]] = None,
                       batch_timestamp: Optional[datetime] = None,
                       streaming_timestamp: Optional[datetime] = None,
                       freshness_threshold_hours: int = 1) -> float:
        """Combine the two scores according to *strategy*.

        If either score is missing, the other is returned as-is; if both are
        missing, 0.0 is returned.  *weights* applies only to ENSEMBLE and
        defaults to a 50/50 split; the timestamps and
        *freshness_threshold_hours* apply only to STREAMING_OVERRIDE.
        """
        # Degenerate cases: fall back to whichever score exists.
        if streaming_score is None and batch_score is None:
            return 0.0
        if streaming_score is None:
            return batch_score
        if batch_score is None:
            return streaming_score
        if strategy == ScoreCombinationStrategy.BATCH_ONLY:
            return batch_score
        elif strategy == ScoreCombinationStrategy.STREAMING_OVERRIDE:
            # Prefer the streaming score only while it is fresh; a stale
            # streaming score must NOT override the batch score.
            # BUG FIX: the original returned streaming_score unconditionally,
            # making the freshness check dead code.
            if streaming_timestamp and batch_timestamp:
                streaming_age_hours = (datetime.now() - streaming_timestamp).total_seconds() / 3600
                if streaming_age_hours >= freshness_threshold_hours:
                    return batch_score
            return streaming_score
        elif strategy == ScoreCombinationStrategy.ENSEMBLE:
            # Weighted average; missing weight keys default to 0.5 each.
            w = weights or {"batch": 0.5, "streaming": 0.5}
            return batch_score * w.get("batch", 0.5) + streaming_score * w.get("streaming", 0.5)
        elif strategy == ScoreCombinationStrategy.MAXIMUM:
            return max(batch_score, streaming_score)
        elif strategy == ScoreCombinationStrategy.SIGNAL_BOOST:
            # Fixed +0.1 boost when streaming signals higher risk, capped at 1.0.
            boost = 0.1 if streaming_score > batch_score else 0.0
            return min(batch_score + boost, 1.0)
        return batch_score

    def map_features(self, batch_features: Dict[str, float], streaming_features: Dict[str, float],
                     prefer_streaming_recency: bool = False) -> Dict[str, float]:
        """Merge the two feature dicts; streaming values win on key conflicts.

        When *prefer_streaming_recency* is set and a streaming
        ``minutes_since_last_order`` is present, it is converted to days and
        overwrites the batch ``days_since_last_order``.
        """
        result = batch_features.copy()
        result.update(streaming_features)
        if prefer_streaming_recency and "minutes_since_last_order" in streaming_features:
            result["days_since_last_order"] = streaming_features["minutes_since_last_order"] / (24 * 60)
        return result

    def get_best_available_score(self, realtime_score: Optional[float] = None,
                                 streaming_score: Optional[float] = None,
                                 batch_score: Optional[float] = None,
                                 cached_score: Optional[float] = None) -> ScoreResult:
        """Return the freshest available score: realtime > streaming > batch > cached.

        Falls back to a 0.0 score tagged "default" when nothing is available.
        """
        if realtime_score is not None:
            return ScoreResult(score=realtime_score, source="realtime")
        if streaming_score is not None:
            return ScoreResult(score=streaming_score, source="streaming")
        if batch_score is not None:
            return ScoreResult(score=batch_score, source="batch")
        if cached_score is not None:
            return ScoreResult(score=cached_score, source="cached")
        return ScoreResult(score=0.0, source="default")
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@dataclass
class ProcessingConfig:
    """Tunables for the micro-batch stream processor."""
    checkpoint_interval_seconds: int = 60   # cadence of state checkpointing
    watermark_delay_minutes: int = 10       # tolerated event lateness
    trigger_interval_seconds: int = 60      # micro-batch trigger cadence


@dataclass
class ProcessingResult:
    """Outcome of processing one micro-batch of events."""
    events_processed: int = 0
    features_computed: int = 0
    errors: int = 0
    processing_time_ms: float = 0.0


@dataclass
class ProcessingMetrics:
    """Rolling latency/throughput metrics for the processor."""
    avg_processing_latency_ms: float = 0.0
    events_per_second: float = 0.0


class StreamProcessor:
    """Groups events per customer, computes features and folds them into state."""

    def __init__(self, config: Optional[ProcessingConfig] = None):
        self._config = config or ProcessingConfig()
        # Per-customer accumulated feature values: customer_id -> {feature: value}.
        self._state: Dict[str, Dict[str, float]] = {}
        self._processing_times: List[float] = []
        self._events_processed = 0
        self._start_time = datetime.now()

    def process_batch(self, events: List) -> ProcessingResult:
        """Process one micro-batch: group events by customer, compute features
        via FeatureComputer, and add each feature value into running state."""
        import time
        # Imported lazily to avoid a circular import at module load time.
        from .window_aggregator import FeatureComputer

        began = time.time()
        feature_engine = FeatureComputer()
        total_features = 0

        grouped: Dict[str, List] = {}
        for evt in events:
            grouped.setdefault(evt.customer_id, []).append(evt)

        for cust, cust_events in grouped.items():
            outcome = feature_engine.compute_all_features(cust_events, cust)
            bucket = self._state.setdefault(cust, {})
            for feat, value in outcome.features.items():
                bucket[feat] = bucket.get(feat, 0) + value
            total_features += len(outcome.features)

        took_ms = (time.time() - began) * 1000
        self._processing_times.append(took_ms)
        self._events_processed += len(events)
        return ProcessingResult(
            events_processed=len(events),
            features_computed=total_features,
            processing_time_ms=took_ms,
        )

    def get_state(self, customer_id: str) -> Dict[str, float]:
        """Return a copy of the accumulated features for one customer
        (empty dict if the customer has never been seen)."""
        return dict(self._state.get(customer_id, {}))

    def get_metrics(self) -> ProcessingMetrics:
        """Return mean per-batch latency and overall event throughput."""
        import statistics
        # Floor uptime at 1s so a freshly constructed processor never divides by ~0.
        uptime_seconds = max((datetime.now() - self._start_time).total_seconds(), 1)
        mean_latency = statistics.mean(self._processing_times) if self._processing_times else 0.0
        return ProcessingMetrics(
            avg_processing_latency_ms=mean_latency,
            events_per_second=self._events_processed / uptime_seconds,
        )