churnkit 0.76.1a1__py3-none-any.whl → 0.76.1a2__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +10 -5
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +6 -6
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +52 -46
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +68 -65
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +12 -27
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +216 -221
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +88 -81
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +111 -108
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +44 -38
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +89 -85
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +81 -80
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +83 -89
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +102 -98
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +32 -31
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +33 -29
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +6 -5
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +67 -63
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +38 -23
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +3 -1
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/METADATA +1 -1
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/RECORD +30 -30
- customer_retention/__init__.py +1 -1
- customer_retention/analysis/auto_explorer/explorer.py +2 -2
- customer_retention/analysis/notebook_progress.py +4 -1
- customer_retention/core/compat/__init__.py +10 -0
- customer_retention/integrations/databricks_init.py +13 -0
- customer_retention/stages/profiling/column_profiler.py +9 -2
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/WHEEL +0 -0
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/entry_points.txt +0 -0
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/licenses/LICENSE +0 -0
{churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb

@@ -82,18 +82,19 @@
 "outputs": [],
 "source": [
 "from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
+"\n",
 "track_and_export_previous(\"07_modeling_readiness.ipynb\")\n",
 "\n",
-"from customer_retention.analysis.auto_explorer import ExplorationFindings\n",
-"from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table\n",
-"from customer_retention.core.config.column_config import ColumnType\n",
-"from customer_retention.stages.modeling import ImbalanceRecommender, ImbalanceHandler, ImbalanceStrategy\n",
 "import pandas as pd\n",
-"import numpy as np\n",
 "import plotly.graph_objects as go\n",
-"
-"from
-"from customer_retention.
+"\n",
+"from customer_retention.analysis.auto_explorer import ExplorationFindings\n",
+"from customer_retention.analysis.visualization import ChartBuilder, display_figure\n",
+"from customer_retention.core.config.column_config import ColumnType\n",
+"from customer_retention.core.config.experiments import (\n",
+" FINDINGS_DIR,\n",
+")\n",
+"from customer_retention.stages.modeling import ImbalanceRecommender"
 ]
 },
 {
@@ -150,7 +151,7 @@
 "findings = ExplorationFindings.load(FINDINGS_PATH)\n",
 "\n",
 "# Load data - handle aggregated vs standard paths\n",
-"from customer_retention.stages.temporal import
+"from customer_retention.stages.temporal import TEMPORAL_METADATA_COLS, load_data_with_snapshot_preference\n",
 "\n",
 "# For aggregated data, load directly from the parquet source\n",
 "if \"_aggregated\" in FINDINGS_PATH and findings.source_path.endswith('.parquet'):\n",
@@ -217,11 +218,11 @@
 "has_target = findings.target_column is not None\n",
 "checklist.append({\"Check\": \"Target column identified\", \"Status\": \"Pass\" if has_target else \"Fail\"})\n",
 "\n",
-"has_features = len([c for c in findings.columns.values()
+"has_features = len([c for c in findings.columns.values()\n",
 " if c.inferred_type not in [ColumnType.IDENTIFIER, ColumnType.TARGET]]) > 0\n",
 "checklist.append({\"Check\": \"Feature columns available\", \"Status\": \"Pass\" if has_features else \"Fail\"})\n",
 "\n",
-"high_missing = any(c.universal_metrics.get(\"null_percentage\", 0) > 50
+"high_missing = any(c.universal_metrics.get(\"null_percentage\", 0) > 50\n",
 " for c in findings.columns.values())\n",
 "checklist.append({\"Check\": \"No columns with >50% missing\", \"Status\": \"Fail\" if high_missing else \"Pass\"})\n",
 "\n",
@@ -289,31 +290,31 @@
 "if findings.target_column:\n",
 " target = findings.target_column\n",
 " target_series = df[target]\n",
-"
+"\n",
 " print(\"=\" * 70)\n",
 " print(\"CLASS IMBALANCE ANALYSIS\")\n",
 " print(\"=\" * 70)\n",
-"
+"\n",
 " print(f\"\\nTarget Column: {target}\")\n",
 " print(f\"Target Type: {findings.target_type}\")\n",
 " print(f\"Missing Values: {target_series.isnull().sum()}\")\n",
-"
+"\n",
 " if findings.target_type == \"binary\":\n",
 " value_counts = target_series.value_counts()\n",
 " majority_class = value_counts.idxmax()\n",
 " minority_class = value_counts.idxmin()\n",
 " majority_count = value_counts.max()\n",
 " minority_count = value_counts.min()\n",
-"
-" print(
+"\n",
+" print(\"\\n📊 CLASS DISTRIBUTION:\")\n",
 " print(f\" Majority Class ({majority_class}): {majority_count:,} ({majority_count/len(df)*100:.1f}%)\")\n",
 " print(f\" Minority Class ({minority_class}): {minority_count:,} ({minority_count/len(df)*100:.1f}%)\")\n",
-"
+"\n",
 " # Use framework recommender for strategy recommendations\n",
 " recommender = ImbalanceRecommender()\n",
 " rec = recommender.recommend(target_series, n_samples=len(df))\n",
 " rec.print_recommendation()\n",
-"
+"\n",
 " # Visualize\n",
 " severity_colors = {\"low\": \"#2ca02c\", \"moderate\": \"#ffbb00\", \"high\": \"#ff7f0e\", \"severe\": \"#d62728\"}\n",
 " fig = go.Figure(go.Bar(\n",
@@ -330,14 +331,14 @@
 " template='plotly_white', height=400\n",
 " )\n",
 " display_figure(fig)\n",
-"
+"\n",
 " # Show sklearn class weights\n",
-" print(
+" print(\"\\n💡 SKLEARN CLASS WEIGHTS:\")\n",
 " weight_minority = len(df) / (2 * minority_count)\n",
 " weight_majority = len(df) / (2 * majority_count)\n",
 " print(f\" class_weight={{0: {weight_majority:.3f}, 1: {weight_minority:.3f}}}\")\n",
-" print(
-"
+" print(\" Or use class_weight='balanced'\")\n",
+"\n",
 " # Store recommendation for later use\n",
 " imbalance_recommendation = rec\n",
 "else:\n",
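A note on the two completed print( lines above: the weights the notebook prints follow sklearn's 'balanced' heuristic, n_samples / (n_classes * count_per_class). A minimal standalone sketch (toy data, not taken from the package) showing that the notebook's manual formula and sklearn's compute_class_weight agree:

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Toy imbalanced binary target: 90 negatives, 10 positives.
y = np.array([0] * 90 + [1] * 10)

# The notebook's manual formula: len(df) / (2 * class_count).
manual = {cls: len(y) / (2 * (y == cls).sum()) for cls in (0, 1)}
print(manual)  # {0: 0.555..., 1: 5.0}

# class_weight='balanced' computes the same n_samples / (n_classes * count).
balanced = compute_class_weight("balanced", classes=np.array([0, 1]), y=y)
print(dict(zip([0, 1], balanced)))  # matches the manual weights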
@@ -397,11 +398,11 @@
 "\n",
 "if findings.target_column:\n",
 " target = findings.target_column\n",
-"
+"\n",
 " for col_name, col_info in findings.columns.items():\n",
 " if col_name == target or col_info.inferred_type == ColumnType.IDENTIFIER or col_name in TEMPORAL_METADATA_COLS:\n",
 " continue\n",
-"
+"\n",
 " if col_info.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]:\n",
 " corr = df[[col_name, target]].corr().iloc[0, 1]\n",
 " if abs(corr) > 0.9:\n",
@@ -410,7 +411,7 @@
 " \"Risk\": \"High\",\n",
 " \"Reason\": f\"Very high correlation ({corr:.3f}) - potential leakage\"\n",
 " })\n",
-"
+"\n",
 " if any(kw in col_name.lower() for kw in ['future', 'outcome', 'result', 'after']):\n",
 " leakage_risks.append({\n",
 " \"Column\": col_name,\n",
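The two leakage heuristics in this hunk are self-contained: flag numeric columns whose |correlation| with the target exceeds 0.9, and flag columns whose names contain post-outcome keywords. A runnable toy sketch of the same checks (the DataFrame and column names are hypothetical, not from the package):

import pandas as pd

df = pd.DataFrame({
    "tenure_months": [5, 2, 8, 3, 6],
    "refund_after_churn": [0, 1, 1, 0, 1],  # mirrors the target exactly
    "churned": [0, 1, 1, 0, 1],
})
target = "churned"

leakage_risks = []
for col in df.columns.drop(target):
    # Heuristic 1: near-perfect correlation with the target.
    corr = df[[col, target]].corr().iloc[0, 1]
    if abs(corr) > 0.9:
        leakage_risks.append({"Column": col, "Reason": f"corr={corr:.3f}"})
    # Heuristic 2: name suggests the value is measured after the outcome.
    if any(kw in col.lower() for kw in ["future", "outcome", "result", "after"]):
        leakage_risks.append({"Column": col, "Reason": "post-outcome keyword"})

# refund_after_churn is flagged by both heuristics; tenure_months by neither.
print(leakage_risks)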
@@ -473,7 +474,7 @@
 "for col_type, count in sorted(type_summary.items()):\n",
 " print(f\" {col_type}: {count}\")\n",
 "\n",
-"usable_features = sum(1 for c in findings.columns.values()
+"usable_features = sum(1 for c in findings.columns.values()\n",
 " if c.inferred_type not in [ColumnType.IDENTIFIER, ColumnType.TARGET])\n",
 "print(f\"\\nUsable features for modeling: {usable_features}\")"
 ]
@@ -593,10 +594,10 @@
 " print(f\" • {col} ({info.availability_type}, {info.coverage_pct:.0f}% coverage)\")\n",
 " if len(findings.problematic_availability_columns) > 10:\n",
 " print(f\" ... and {len(findings.problematic_availability_columns) - 10} more\")\n",
-"
+"\n",
 " action = findings.metadata.get(\"availability_action\", \"exclude\")\n",
 " print(f\"\\n📋 Action: {action.upper()}\")\n",
-" print(
+" print(\" These features will be excluded in notebook 08.\")\n",
 "else:\n",
 " print(\"\\n✅ All features have full temporal coverage.\")"
 ]
@@ -700,15 +701,15 @@
 "if 'X' in dir() and 'y' in dir():\n",
 " detector = LeakageDetector()\n",
 " result = detector.run_all_checks(X, y, include_pit=False)\n",
-"
+"\n",
 " print(\"=\" * 70)\n",
 " print(\"FINAL LEAKAGE VALIDATION\")\n",
 " print(\"=\" * 70)\n",
-"
+"\n",
 " if result.passed:\n",
 " print(\"\\n✅ PASSED: No critical leakage issues\")\n",
 " print(f\" Checks run: {len(result.checks)}\")\n",
-"
+"\n",
 " # Show warnings for HIGH severity issues\n",
 " high_issues = [c for c in result.checks if c.severity.value == 'high']\n",
 " if high_issues:\n",
{churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb

@@ -83,25 +83,29 @@
 "outputs": [],
 "source": [
 "from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
+"\n",
 "track_and_export_previous(\"08_baseline_experiments.ipynb\")\n",
 "\n",
-"from customer_retention.analysis.auto_explorer import ExplorationFindings\n",
-"from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table\n",
-"from customer_retention.core.config.column_config import ColumnType\n",
 "import pandas as pd\n",
-"import
-"\n",
-"from sklearn.model_selection import train_test_split, cross_val_score\n",
-"from sklearn.preprocessing import StandardScaler, LabelEncoder\n",
-"from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
+"from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier\n",
 "from sklearn.linear_model import LogisticRegression\n",
-"from sklearn.metrics import (
-"
-"
+"from sklearn.metrics import (\n",
+" average_precision_score,\n",
+" classification_report,\n",
+" f1_score,\n",
+" precision_score,\n",
+" recall_score,\n",
+" roc_auc_score,\n",
+")\n",
+"from sklearn.model_selection import cross_val_score, train_test_split\n",
+"from sklearn.preprocessing import LabelEncoder, StandardScaler\n",
 "\n",
-"
-"from
-"from customer_retention.core.config.
+"from customer_retention.analysis.auto_explorer import ExplorationFindings\n",
+"from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table\n",
+"from customer_retention.core.config.column_config import ColumnType\n",
+"from customer_retention.core.config.experiments import (\n",
+" FINDINGS_DIR,\n",
+")"
 ]
 },
 {
@@ -158,7 +162,7 @@
 "findings = ExplorationFindings.load(FINDINGS_PATH)\n",
 "\n",
 "# Load data - handle aggregated vs standard paths\n",
-"from customer_retention.stages.temporal import
+"from customer_retention.stages.temporal import TEMPORAL_METADATA_COLS, load_data_with_snapshot_preference\n",
 "\n",
 "# For aggregated data, load directly from the parquet source\n",
 "if \"_aggregated\" in FINDINGS_PATH and findings.source_path.endswith('.parquet'):\n",
@@ -269,7 +273,7 @@
 " print(f\" {col_type}: {count}\")\n",
 "\n",
 "# Show excluded columns\n",
-"excluded = [name for name, col in findings.columns.items()
+"excluded = [name for name, col in findings.columns.items()\n",
 " if col.inferred_type in [ColumnType.IDENTIFIER, ColumnType.TARGET, ColumnType.TEXT]]\n",
 "if excluded:\n",
 " print(f\"\\n⛔ Excluded Columns: {', '.join(excluded)}\")"
@@ -309,19 +313,19 @@
 " selector = FeatureSelector(target_column=findings.target_column)\n",
 " availability_recs = selector.get_availability_recommendations(findings.feature_availability)\n",
 " unavailable_features = [rec.column for rec in availability_recs]\n",
-"
+"\n",
 " print(f\"\\n⚠️ {len(availability_recs)} feature(s) have availability issues:\\n\")\n",
 " for rec in availability_recs:\n",
 " print(f\" • {rec.column} ({rec.issue_type}, {rec.coverage_pct:.0f}% coverage)\")\n",
-"
+"\n",
 " print(\"\\n📋 Alternative approaches (for investigation):\")\n",
 " print(\" • segment_by_cohort: Train separate models per availability period\")\n",
 " print(\" • add_indicator: Create availability flags and impute missing\")\n",
 " print(\" • filter_window: Restrict data to feature's available period\")\n",
-"
+"\n",
 " original_count = len(feature_cols)\n",
 " feature_cols = [f for f in feature_cols if f not in unavailable_features]\n",
-"
+"\n",
 " print(f\"\\n🗑️ Removed {original_count - len(feature_cols)} unavailable features\")\n",
 " print(f\"📊 Features remaining: {len(feature_cols)}\")\n",
 "else:\n",
@@ -377,7 +381,7 @@
 "\n",
 "print(f\"Train size: {len(X_train):,} ({len(X_train)/len(X)*100:.0f}%)\")\n",
 "print(f\"Test size: {len(X_test):,} ({len(X_test)/len(X)*100:.0f}%)\")\n",
-"print(
+"print(\"\\nTrain class distribution:\")\n",
 "print(f\" Retained (1): {(y_train == 1).sum():,} ({(y_train == 1).sum()/len(y_train)*100:.1f}%)\")\n",
 "print(f\" Churned (0): {(y_train == 0).sum():,} ({(y_train == 0).sum()/len(y_train)*100:.1f}%)\")"
 ]
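The completed print( line reports the train-side class distribution. This hunk doesn't show whether the split is stratified; if it isn't, stratify=y in train_test_split is the standard way to keep the printed train/test percentages in line with the full dataset. A toy sketch:

import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(1000).reshape(-1, 1)
y = np.array([0] * 800 + [1] * 200)  # 20% positive class

# stratify=y preserves the 80/20 class ratio in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print((y_train == 1).mean(), (y_test == 1).mean())  # both 0.20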
@@ -438,7 +442,7 @@
 "\n",
 "for name, model in models.items():\n",
 " print(f\"Training {name}...\")\n",
-"
+"\n",
 " # Use scaled data for Logistic Regression, unscaled for tree-based\n",
 " if \"Logistic\" in name:\n",
 " model.fit(X_train_scaled, y_train)\n",
@@ -448,20 +452,20 @@
 " model.fit(X_train, y_train)\n",
 " y_pred = model.predict(X_test)\n",
 " y_pred_proba = model.predict_proba(X_test)[:, 1]\n",
-"
+"\n",
 " # Calculate metrics\n",
 " auc = roc_auc_score(y_test, y_pred_proba)\n",
 " pr_auc = average_precision_score(y_test, y_pred_proba)\n",
 " f1 = f1_score(y_test, y_pred)\n",
 " precision = precision_score(y_test, y_pred)\n",
 " recall = recall_score(y_test, y_pred)\n",
-"
+"\n",
 " # Cross-validation\n",
 " if \"Logistic\" in name:\n",
 " cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='roc_auc')\n",
 " else:\n",
 " cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')\n",
-"
+"\n",
 " results.append({\n",
 " \"Model\": name,\n",
 " \"Test AUC\": auc,\n",
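One observation on this hunk: cross_val_score is given X_train_scaled, which was scaled once before the CV loop, so every fold's validation rows already influenced the scaler's mean and variance. Wrapping scaler and model in a Pipeline refits the scaler inside each fold instead. A self-contained sketch of that variant (synthetic data; not the package's code):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X_train, y_train = make_classification(n_samples=500, random_state=0)

# The Pipeline fits StandardScaler on each fold's training portion only,
# so fold-validation rows never leak into the scaling statistics.
pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring="roc_auc")
print(f"CV AUC: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")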
@@ -472,7 +476,7 @@
 " \"CV AUC Mean\": cv_scores.mean(),\n",
 " \"CV AUC Std\": cv_scores.std()\n",
 " })\n",
-"
+"\n",
 " model_predictions[name] = {\n",
 " 'y_pred': y_pred,\n",
 " 'y_pred_proba': y_pred_proba,\n",
@@ -799,12 +803,12 @@
 "print(f\" PR-AUC: {best_model['PR-AUC']:.4f}\")\n",
 "print(f\" F1-Score: {best_model['F1-Score']:.4f}\")\n",
 "\n",
-"print(
+"print(\"\\n📊 TOP 3 IMPORTANT FEATURES:\")\n",
 "for i, feat in enumerate(importance_df.head(3)['Feature'].tolist(), 1):\n",
 " imp = importance_df[importance_df['Feature'] == feat]['Importance'].values[0]\n",
 " print(f\" {i}. {feat} ({imp:.3f})\")\n",
 "\n",
-"print(
+"print(\"\\n📈 MODEL PERFORMANCE ASSESSMENT:\")\n",
 "if best_model['Test AUC'] > 0.90:\n",
 " print(\" Excellent predictive signal - likely production-ready with tuning\")\n",
 "elif best_model['Test AUC'] > 0.80:\n",
@@ -814,7 +818,7 @@
 "else:\n",
 " print(\" Weak signal - may need more data or different features\")\n",
 "\n",
-"print(
+"print(\"\\n💡 NEXT STEPS:\")\n",
 "print(\" 1. Feature engineering with derived features (notebook 05)\")\n",
 "print(\" 2. Hyperparameter tuning (GridSearchCV)\")\n",
 "print(\" 3. Threshold optimization for business metrics\")\n",
{churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb

@@ -66,13 +66,15 @@
 "outputs": [],
 "source": [
 "from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
+"\n",
 "track_and_export_previous(\"09_business_alignment.ipynb\")\n",
 "\n",
-"from customer_retention.analysis.auto_explorer import ExplorationFindings\n",
-"from customer_retention.analysis.visualization import display_table\n",
 "import pandas as pd\n",
-"
-"from customer_retention.
+"\n",
+"from customer_retention.analysis.auto_explorer import ExplorationFindings\n",
+"from customer_retention.core.config.experiments import (\n",
+" FINDINGS_DIR,\n",
+")\n"
 ]
 },
 {
@@ -98,7 +100,6 @@
 "outputs": [],
 "source": [
 "# === CONFIGURATION ===\n",
-"from pathlib import Path\n",
 "\n",
 "# FINDINGS_DIR imported from customer_retention.core.config.experiments\n",
 "\n",