churnkit 0.76.1a1-py3-none-any.whl → 0.76.1a2-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +10 -5
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +6 -6
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +52 -46
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +68 -65
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +12 -27
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +216 -221
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +88 -81
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +111 -108
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +44 -38
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +89 -85
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +81 -80
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +83 -89
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +102 -98
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +32 -31
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +33 -29
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +6 -5
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +67 -63
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +38 -23
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +3 -1
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/METADATA +1 -1
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/RECORD +30 -30
- customer_retention/__init__.py +1 -1
- customer_retention/analysis/auto_explorer/explorer.py +2 -2
- customer_retention/analysis/notebook_progress.py +4 -1
- customer_retention/core/compat/__init__.py +10 -0
- customer_retention/integrations/databricks_init.py +13 -0
- customer_retention/stages/profiling/column_profiler.py +9 -2
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/WHEEL +0 -0
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/entry_points.txt +0 -0
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/licenses/LICENSE +0 -0
```diff
@@ -82,21 +82,22 @@
   "outputs": [],
   "source": [
    "from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
+   "\n",
    "track_and_export_previous(\"04_relationship_analysis.ipynb\")\n",
    "\n",
-   "from customer_retention.analysis.auto_explorer import ExplorationFindings, RecommendationRegistry\n",
-   "from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table\n",
-   "from customer_retention.core.config.column_config import ColumnType\n",
-   "from customer_retention.stages.profiling import (\n",
-   "    RelationshipRecommender, RecommendationCategory\n",
-   ")\n",
-   "import yaml\n",
-   "import pandas as pd\n",
    "import numpy as np\n",
+   "import pandas as pd\n",
    "import plotly.graph_objects as go\n",
-   "import
+   "import yaml\n",
    "from plotly.subplots import make_subplots\n",
-   "
+   "\n",
+   "from customer_retention.analysis.auto_explorer import ExplorationFindings, RecommendationRegistry\n",
+   "from customer_retention.analysis.visualization import ChartBuilder, display_figure\n",
+   "from customer_retention.core.config.column_config import ColumnType\n",
+   "from customer_retention.core.config.experiments import (\n",
+   "    FINDINGS_DIR,\n",
+   ")\n",
+   "from customer_retention.stages.profiling import RecommendationCategory, RelationshipRecommender\n"
   ]
  },
  {
@@ -159,7 +160,7 @@
    "findings = ExplorationFindings.load(FINDINGS_PATH)\n",
    "\n",
    "# Load data - handle aggregated vs standard paths\n",
-   "from customer_retention.stages.temporal import
+   "from customer_retention.stages.temporal import TEMPORAL_METADATA_COLS, load_data_with_snapshot_preference\n",
    "\n",
    "# For aggregated data, load directly from the parquet source\n",
    "if \"_aggregated\" in FINDINGS_PATH and findings.source_path.endswith('.parquet'):\n",
@@ -376,19 +377,19 @@
    "# Feature Distributions by Retention Status\n",
    "if findings.target_column and findings.target_column in df.columns:\n",
    "    target = findings.target_column\n",
-   "
+   "\n",
    "    feature_cols = [\n",
    "        name for name, col in findings.columns.items()\n",
    "        if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]\n",
    "        and name != target\n",
    "        and name not in TEMPORAL_METADATA_COLS\n",
    "    ]\n",
-   "
+   "\n",
    "    if feature_cols:\n",
    "        print(\"=\" * 80)\n",
    "        print(f\"FEATURE DISTRIBUTIONS BY TARGET: {target}\")\n",
    "        print(\"=\" * 80)\n",
-   "
+   "\n",
    "        # Calculate summary statistics by target\n",
    "        summary_by_target = []\n",
    "        for col in feature_cols:\n",
@@ -403,16 +404,16 @@
    "                \"Median\": subset.median(),\n",
    "                \"Std\": subset.std()\n",
    "            })\n",
-   "
+   "\n",
    "        if summary_by_target:\n",
    "            summary_df = pd.DataFrame(summary_by_target)\n",
-   "
+   "\n",
    "            # Display summary table\n",
    "            print(\"\\n📊 Summary Statistics by Retention Status:\")\n",
    "            display_summary = summary_df.pivot(index=\"Feature\", columns=\"Group\", values=[\"Mean\", \"Median\"])\n",
    "            display_summary.columns = [f\"{stat} ({group})\" for stat, group in display_summary.columns]\n",
    "            display(display_summary.round(3))\n",
-   "
+   "\n",
    "            # Calculate effect size (Cohen's d) for each feature\n",
    "            print(\"\\n📈 Feature Importance Indicators (Effect Size - Cohen's d):\")\n",
    "            print(\"-\" * 70)\n",
@@ -420,16 +421,16 @@
    "        for col in feature_cols:\n",
    "            churned = df[df[target] == 0][col].dropna()\n",
    "            retained = df[df[target] == 1][col].dropna()\n",
-   "
+   "\n",
    "            if len(churned) > 0 and len(retained) > 0:\n",
    "                # Cohen's d\n",
-   "                pooled_std = np.sqrt(((len(churned)-1)*churned.std()**2 + (len(retained)-1)*retained.std()**2)
+   "                pooled_std = np.sqrt(((len(churned)-1)*churned.std()**2 + (len(retained)-1)*retained.std()**2) /\n",
    "                                     (len(churned) + len(retained) - 2))\n",
    "                if pooled_std > 0:\n",
    "                    d = (retained.mean() - churned.mean()) / pooled_std\n",
    "                else:\n",
    "                    d = 0\n",
-   "
+   "\n",
    "                # Interpret effect size\n",
    "                abs_d = abs(d)\n",
    "                if abs_d >= 0.8:\n",
@@ -444,17 +445,17 @@
    "                else:\n",
    "                    interpretation = \"Negligible\"\n",
    "                    emoji = \"⚪\"\n",
-   "
+   "\n",
    "                effect_sizes.append({\n",
    "                    \"feature\": col,\n",
    "                    \"cohens_d\": d,\n",
    "                    \"abs_d\": abs_d,\n",
    "                    \"interpretation\": interpretation\n",
    "                })\n",
-   "
+   "\n",
    "                direction = \"↑ Higher in retained\" if d > 0 else \"↓ Lower in retained\"\n",
    "                print(f\"  {emoji} {col}: d={d:+.3f} ({interpretation}) {direction}\")\n",
-   "
+   "\n",
    "        # Sort by effect size for identifying important features\n",
    "        if effect_sizes:\n",
    "            effect_df = pd.DataFrame(effect_sizes).sort_values(\"abs_d\", ascending=False)\n",
```
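An aside on the hunk above: the line the new version un-truncates is the standard pooled-standard-deviation form of Cohen's d. A minimal, self-contained sketch of the same computation, with hypothetical `retained`/`churned` series standing in for the notebook's groups:

```python
import numpy as np
import pandas as pd

def cohens_d(retained: pd.Series, churned: pd.Series) -> float:
    """Effect size between two groups, using the pooled standard deviation."""
    n_r, n_c = len(retained), len(churned)
    # Pooled std: sample-size-weighted combination of the two variances (ddof=1)
    pooled_std = np.sqrt(((n_c - 1) * churned.std() ** 2 +
                          (n_r - 1) * retained.std() ** 2) / (n_c + n_r - 2))
    if pooled_std == 0:
        return 0.0
    return (retained.mean() - churned.mean()) / pooled_std

# Conventional reading, matching the notebook's thresholds:
# |d| >= 0.8 large, >= 0.5 medium, >= 0.2 small, else negligible
d = cohens_d(pd.Series([5.0, 6.0, 7.0]), pd.Series([3.0, 4.0, 5.0]))
print(f"d={d:+.3f}")  # d=+2.000
```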
```diff
@@ -535,27 +536,27 @@
    "# Box Plots: Visual comparison of distributions\n",
    "if findings.target_column and findings.target_column in df.columns:\n",
    "    target = findings.target_column\n",
-   "
+   "\n",
    "    feature_cols = [\n",
    "        name for name, col in findings.columns.items()\n",
    "        if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]\n",
    "        and name != target\n",
    "        and name not in TEMPORAL_METADATA_COLS\n",
    "    ]\n",
-   "
+   "\n",
    "    if feature_cols:\n",
    "        # Create box plots - one subplot per feature for better control\n",
    "        n_features = min(len(feature_cols), 6)\n",
-   "
+   "\n",
    "        fig = make_subplots(\n",
    "            rows=1, cols=n_features,\n",
    "            subplot_titles=feature_cols[:n_features],\n",
    "            horizontal_spacing=0.05\n",
    "        )\n",
-   "
+   "\n",
    "        for i, col in enumerate(feature_cols[:n_features]):\n",
    "            col_num = i + 1\n",
-   "
+   "\n",
    "            # Retained (1) - Green\n",
    "            retained_data = df[df[target] == 1][col].dropna()\n",
    "            fig.add_trace(\n",
@@ -577,7 +578,7 @@
    "                ),\n",
    "                row=1, col=col_num\n",
    "            )\n",
-   "
+   "\n",
    "            # Churned (0) - Red\n",
    "            churned_data = df[df[target] == 0][col].dropna()\n",
    "            fig.add_trace(\n",
@@ -599,7 +600,7 @@
    "                ),\n",
    "                row=1, col=col_num\n",
    "            )\n",
-   "
+   "\n",
    "        fig.update_layout(\n",
    "            height=450,\n",
    "            title_text=\"Feature Distributions: Retained (Green) vs Churned (Red)\",\n",
@@ -610,12 +611,12 @@
    "            boxgap=0.3,\n",
    "            boxgroupgap=0.1\n",
    "        )\n",
-   "
+   "\n",
    "        # Center the boxes by removing x-axis tick labels (title is above each subplot)\n",
    "        fig.update_xaxes(showticklabels=False)\n",
-   "
+   "\n",
    "        display_figure(fig)\n",
-   "
+   "\n",
    "        # Print mean comparison\n",
    "        print(\"\\n📊 MEAN COMPARISON BY RETENTION STATUS:\")\n",
    "        print(\"-\" * 70)\n",
@@ -682,15 +683,15 @@
    "        and name != target\n",
    "        and name not in TEMPORAL_METADATA_COLS\n",
    "    ]\n",
-   "
+   "\n",
    "    if feature_cols:\n",
    "        correlations = []\n",
    "        for col in feature_cols:\n",
    "            corr = df[[col, target]].corr().iloc[0, 1]\n",
    "            correlations.append({\"Feature\": col, \"Correlation\": corr})\n",
-   "
+   "\n",
    "        corr_df = pd.DataFrame(correlations).sort_values(\"Correlation\", key=abs, ascending=False)\n",
-   "
+   "\n",
    "        fig = charts.bar_chart(\n",
    "            corr_df[\"Feature\"].tolist(),\n",
    "            corr_df[\"Correlation\"].tolist(),\n",
```
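On the correlation hunk above: `df[[col, target]].corr().iloc[0, 1]` is plain Pearson correlation, which against a 0/1 target is mathematically the point-biserial correlation. A minimal sketch with hypothetical data (the column names are illustrative, not from the package):

```python
import pandas as pd
from scipy import stats

df = pd.DataFrame({
    "tenure_months": [1, 3, 24, 36, 2, 48],  # hypothetical numeric feature
    "retained":      [0, 0, 1,  1,  0, 1],   # hypothetical 0/1 target
})

# Pearson correlation against a binary target == point-biserial correlation
pearson = df[["tenure_months", "retained"]].corr().iloc[0, 1]
pb, _p = stats.pointbiserialr(df["retained"], df["tenure_months"])
assert abs(pearson - pb) < 1e-12  # the two agree up to floating-point noise
print(f"r = {pearson:+.3f}")
```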
```diff
@@ -757,23 +758,23 @@
    "if findings.target_column:\n",
    "    target = findings.target_column\n",
    "    overall_retention = df[target].mean()\n",
-   "
+   "\n",
    "    categorical_cols = [\n",
    "        name for name, col in findings.columns.items()\n",
    "        if col.inferred_type in [ColumnType.CATEGORICAL_NOMINAL, ColumnType.CATEGORICAL_ORDINAL]\n",
    "        and name not in TEMPORAL_METADATA_COLS\n",
    "    ]\n",
-   "
+   "\n",
    "    print(\"=\" * 80)\n",
    "    print(\"CATEGORICAL FEATURE ANALYSIS\")\n",
    "    print(\"=\" * 80)\n",
    "    print(f\"Overall retention rate: {overall_retention:.1%}\")\n",
-   "
+   "\n",
    "    if categorical_cols:\n",
    "        # Use framework analyzer for summary\n",
    "        cat_analyzer = CategoricalTargetAnalyzer(min_samples_per_category=10)\n",
    "        summary_df = cat_analyzer.analyze_multiple(df, categorical_cols, target)\n",
-   "
+   "\n",
    "        print(\"\\n📈 Categorical Feature Strength (Cramér's V):\")\n",
    "        print(\"-\" * 60)\n",
    "        for _, row in summary_df.iterrows():\n",
@@ -788,15 +789,15 @@
    "                emoji = \"🟢\"\n",
    "            sig = \"***\" if row[\"p_value\"] < 0.001 else \"**\" if row[\"p_value\"] < 0.01 else \"*\" if row[\"p_value\"] < 0.05 else \"\"\n",
    "            print(f\"  {emoji} {row['feature']}: V={row['cramers_v']:.3f} ({strength}) {sig}\")\n",
-   "
+   "\n",
    "        # Detailed analysis for each categorical feature\n",
    "        for col_name in categorical_cols[:5]:\n",
    "            result = cat_analyzer.analyze(df, col_name, target)\n",
-   "
+   "\n",
    "            print(f\"\\n{'='*60}\")\n",
    "            print(f\"📊 {col_name.upper()}\")\n",
    "            print(\"=\"*60)\n",
-   "
+   "\n",
    "            # Display stats table\n",
    "            if len(result.category_stats) > 0:\n",
    "                display_stats = result.category_stats[['category', 'total_count', 'retention_rate', 'lift', 'pct_of_total']].copy()\n",
```
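The Cramér's V values and significance stars above come from the package's `CategoricalTargetAnalyzer`, whose internals this diff does not show; the statistic itself is standard. A minimal sketch of the uncorrected form, assuming a hypothetical `plan` column:

```python
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

def cramers_v(x: pd.Series, y: pd.Series) -> tuple[float, float]:
    """Uncorrected Cramér's V and the chi-square p-value for two categoricals."""
    table = pd.crosstab(x, y)
    chi2, p, _dof, _expected = chi2_contingency(table)
    n = table.to_numpy().sum()
    min_dim = min(table.shape) - 1  # min(rows, cols) - 1
    return np.sqrt(chi2 / (n * min_dim)), p

df = pd.DataFrame({
    "plan":     ["basic", "basic", "pro", "pro", "basic", "pro"],  # hypothetical
    "retained": [0, 0, 1, 1, 0, 1],
})
v, p = cramers_v(df["plan"], df["retained"])
print(f"V={v:.3f}, p={p:.3f}")
```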
```diff
@@ -805,15 +806,15 @@
    "                display_stats['pct_of_total'] = display_stats['pct_of_total'].apply(lambda x: f\"{x:.1%}\")\n",
    "                display_stats.columns = [col_name, 'Count', 'Retention Rate', 'Lift', '% of Data']\n",
    "                display(display_stats)\n",
-   "
+   "\n",
    "            # Stacked bar chart\n",
    "            cat_stats = result.category_stats\n",
    "            categories = cat_stats['category'].tolist()\n",
    "            retained_counts = cat_stats['retained_count'].tolist()\n",
    "            churned_counts = cat_stats['churned_count'].tolist()\n",
-   "
+   "\n",
    "            fig = go.Figure()\n",
-   "
+   "\n",
    "            fig.add_trace(go.Bar(\n",
    "                name='Retained',\n",
    "                x=categories,\n",
@@ -823,7 +824,7 @@
    "                textposition='inside',\n",
    "                textfont=dict(color='white', size=12)\n",
    "            ))\n",
-   "
+   "\n",
    "            fig.add_trace(go.Bar(\n",
    "                name='Churned',\n",
    "                x=categories,\n",
@@ -833,7 +834,7 @@
    "                textposition='inside',\n",
    "                textfont=dict(color='white', size=12)\n",
    "            ))\n",
-   "
+   "\n",
    "            fig.update_layout(\n",
    "                barmode='stack',\n",
    "                title=f\"Retention by {col_name}\",\n",
@@ -844,10 +845,10 @@
    "                legend=dict(orientation=\"h\", yanchor=\"bottom\", y=1.02, xanchor=\"center\", x=0.5)\n",
    "            )\n",
    "            display_figure(fig)\n",
-   "
+   "\n",
    "            # Flag high-risk categories from framework result\n",
    "            if result.high_risk_categories:\n",
-   "                print(
+   "                print(\"\\n  ⚠️ High-risk categories (lift < 0.9x):\")\n",
    "                for cat in result.high_risk_categories:\n",
    "                    cat_row = cat_stats[cat_stats['category'] == cat].iloc[0]\n",
    "                    print(f\"    • {cat}: {cat_row['retention_rate']:.1%} retention ({cat_row['lift']:.2f}x lift)\")\n",
```
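The "lift < 0.9x" cutoff above flags categories whose retention rate sits well below the overall rate; as the printed output implies, lift here is segment retention divided by overall retention. A minimal sketch, assuming a hypothetical `channel` column:

```python
import pandas as pd

df = pd.DataFrame({
    "channel":  ["web", "web", "store", "store", "partner", "partner"],  # hypothetical
    "retained": [1, 1, 1, 0, 0, 0],
})

overall = df["retained"].mean()
by_cat = df.groupby("channel")["retained"].agg(retention_rate="mean", count="size")
by_cat["lift"] = by_cat["retention_rate"] / overall

# Flag categories retaining noticeably worse than average (the diff's 0.9x cutoff)
high_risk = by_cat[by_cat["lift"] < 0.9]
print(high_risk)
```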
```diff
@@ -1033,33 +1034,33 @@
    "if datetime_cols and findings.target_column:\n",
    "    target = findings.target_column\n",
    "    overall_retention = df[target].mean()\n",
-   "
+   "\n",
    "    # Use framework analyzer\n",
    "    temporal_analyzer = TemporalTargetAnalyzer(min_samples_per_period=10)\n",
-   "
+   "\n",
    "    for col_name in datetime_cols[:3]:\n",
    "        result = temporal_analyzer.analyze(df, col_name, target)\n",
-   "
+   "\n",
    "        print(f\"\\n{'='*60}\")\n",
    "        print(f\"📅 {col_name.upper()}\")\n",
    "        print(\"=\"*60)\n",
-   "
+   "\n",
    "        if result.n_valid_dates == 0:\n",
    "            print(\"  No valid dates found\")\n",
    "            continue\n",
-   "
+   "\n",
    "        print(f\"  Date range: {result.min_date} to {result.max_date}\")\n",
    "        print(f\"  Valid dates: {result.n_valid_dates:,}\")\n",
-   "
+   "\n",
    "        # 1. Retention by Year (from framework result)\n",
    "        if len(result.yearly_stats) > 1:\n",
    "            print(f\"\\n  📊 Retention by Year: Trend is {result.yearly_trend}\")\n",
-   "
+   "\n",
    "            year_stats = result.yearly_stats\n",
-   "
+   "\n",
    "            fig = make_subplots(rows=1, cols=2, subplot_titles=[\"Retention Rate by Year\", \"Customer Count by Year\"],\n",
    "                                column_widths=[0.6, 0.4])\n",
-   "
+   "\n",
    "            fig.add_trace(\n",
    "                go.Scatter(\n",
    "                    x=year_stats['period'].astype(str),\n",
@@ -1073,7 +1074,7 @@
    "            )\n",
    "            fig.add_hline(y=overall_retention, line_dash=\"dash\", line_color=\"gray\",\n",
    "                          annotation_text=f\"Overall: {overall_retention:.1%}\", row=1, col=1)\n",
-   "
+   "\n",
    "            fig.add_trace(\n",
    "                go.Bar(\n",
    "                    x=year_stats['period'].astype(str),\n",
@@ -1083,19 +1084,19 @@
    "                ),\n",
    "                row=1, col=2\n",
    "            )\n",
-   "
+   "\n",
    "            fig.update_layout(height=350, template='plotly_white', showlegend=False)\n",
    "            fig.update_yaxes(tickformat='.0%', row=1, col=1)\n",
    "            display_figure(fig)\n",
-   "
+   "\n",
    "        # 2. Retention by Month (from framework result)\n",
    "        if len(result.monthly_stats) > 1:\n",
-   "            print(
-   "
+   "            print(\"\\n  📊 Retention by Month (Seasonality):\")\n",
+   "\n",
    "            month_stats = result.monthly_stats\n",
-   "            colors = ['rgba(46, 204, 113, 0.7)' if r >= overall_retention else 'rgba(231, 76, 60, 0.7)'
+   "            colors = ['rgba(46, 204, 113, 0.7)' if r >= overall_retention else 'rgba(231, 76, 60, 0.7)'\n",
    "                      for r in month_stats['retention_rate']]\n",
-   "
+   "\n",
    "            fig = go.Figure()\n",
    "            fig.add_trace(go.Bar(\n",
    "                x=month_stats['month_name'],\n",
@@ -1106,7 +1107,7 @@
    "            ))\n",
    "            fig.add_hline(y=overall_retention, line_dash=\"dash\", line_color=\"gray\",\n",
    "                          annotation_text=f\"Overall: {overall_retention:.1%}\")\n",
-   "
+   "\n",
    "            fig.update_layout(\n",
    "                title=f\"Monthly Retention Pattern ({col_name})\",\n",
    "                xaxis_title=\"Month\",\n",
@@ -1116,21 +1117,21 @@
    "                yaxis_tickformat='.0%'\n",
    "            )\n",
    "            display_figure(fig)\n",
-   "
+   "\n",
    "            # Seasonal insights from framework\n",
    "            if result.seasonal_spread > 0.05:\n",
    "                print(f\"  📈 Seasonal spread: {result.seasonal_spread:.1%}\")\n",
    "                print(f\"     Best month: {result.best_month}\")\n",
    "                print(f\"     Worst month: {result.worst_month}\")\n",
-   "
+   "\n",
    "        # 3. Retention by Day of Week (from framework result)\n",
    "        if len(result.dow_stats) > 1:\n",
-   "            print(
-   "
+   "            print(\"\\n  📊 Retention by Day of Week:\")\n",
+   "\n",
    "            dow_stats = result.dow_stats\n",
-   "            colors = ['rgba(46, 204, 113, 0.7)' if r >= overall_retention else 'rgba(231, 76, 60, 0.7)'
+   "            colors = ['rgba(46, 204, 113, 0.7)' if r >= overall_retention else 'rgba(231, 76, 60, 0.7)'\n",
    "                      for r in dow_stats['retention_rate']]\n",
-   "
+   "\n",
    "            fig = go.Figure()\n",
    "            fig.add_trace(go.Bar(\n",
    "                x=dow_stats['day_name'],\n",
@@ -1140,7 +1141,7 @@
    "                textposition='outside'\n",
    "            ))\n",
    "            fig.add_hline(y=overall_retention, line_dash=\"dash\", line_color=\"gray\")\n",
-   "
+   "\n",
    "            fig.update_layout(\n",
    "                title=f\"Day of Week Pattern ({col_name})\",\n",
    "                xaxis_title=\"Day of Week\",\n",
```
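The yearly, monthly, and day-of-week stats above come from the package's `TemporalTargetAnalyzer`, which this diff does not show; the underlying aggregation is ordinary datetime grouping. A minimal sketch with hypothetical dates and column names:

```python
import pandas as pd

df = pd.DataFrame({
    "signup_date": pd.to_datetime(
        ["2023-01-15", "2023-06-01", "2024-02-20", "2024-07-04"]),  # hypothetical
    "retained": [1, 0, 1, 1],
})

dates = df["signup_date"]
# Retention rate per calendar period; comparing each against the overall mean
# surfaces trend (by year), seasonality (by month), and weekday effects.
yearly  = df.groupby(dates.dt.year)["retained"].mean()
monthly = df.groupby(dates.dt.month_name())["retained"].mean()
dow     = df.groupby(dates.dt.day_name())["retained"].mean()
print(yearly, monthly, dow, sep="\n\n")
```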
```diff
@@ -1371,7 +1372,7 @@
    "    strong_df[\"correlation\"] = strong_df[\"correlation\"].apply(lambda x: f\"{x:+.3f}\")\n",
    "    strong_df = strong_df.sort_values(\"effect_size\", key=lambda x: x.str.replace(\"+\", \"\").astype(float).abs(), ascending=False)\n",
    "    display(strong_df)\n",
-   "
+   "\n",
    "    print(\"\\n  💡 These features show strong discrimination between retained/churned customers.\")\n",
    "    print(\"     → Ensure they're included in your model\")\n",
    "    print(\"     → Check for data quality issues that could inflate their importance\")\n",
@@ -1478,7 +1479,7 @@
    "    risk_df[\"retention_rate\"] = risk_df[\"retention_rate\"].apply(lambda x: f\"{x:.1%}\")\n",
    "    risk_df[\"lift\"] = risk_df[\"lift\"].apply(lambda x: f\"{x:.2f}x\")\n",
    "    display(risk_df[[\"feature\", \"segment\", \"count\", \"retention_rate\", \"lift\"]])\n",
-   "
+   "\n",
    "    print(\"\\n  💡 These segments have below-average retention.\")\n",
    "    print(\"     → Ensure they're adequately represented in both train and test sets\")\n",
    "    print(\"     → Consider oversampling or class weights in modeling\")\n",
@@ -1674,7 +1675,7 @@
    "    print(\"POTENTIAL INTERACTION FEATURES:\")\n",
    "    strong_features = [p[\"feature\"] for p in analysis_summary.strong_predictors[:5]]\n",
    "    if len(strong_features) >= 2:\n",
-   "        print(
+   "        print(\"\\n  Based on strong predictors, consider interactions between:\")\n",
    "        for i, f1 in enumerate(strong_features[:3]):\n",
    "            for f2 in strong_features[i+1:4]:\n",
    "                print(f\"    • {f1} × {f2}\")\n",
@@ -1733,12 +1734,12 @@
    "\n",
    "if all_recs_data:\n",
    "    recs_df = pd.DataFrame(all_recs_data)\n",
-   "
+   "\n",
    "    # Sort by priority\n",
    "    priority_order = {\"HIGH\": 0, \"MEDIUM\": 1, \"LOW\": 2}\n",
    "    recs_df[\"_sort\"] = recs_df[\"Priority\"].map(priority_order)\n",
    "    recs_df = recs_df.sort_values(\"_sort\").drop(\"_sort\", axis=1)\n",
-   "
+   "\n",
    "    print(\"=\" * 80)\n",
    "    print(\"ALL RECOMMENDATIONS SUMMARY\")\n",
    "    print(\"=\" * 80)\n",
@@ -1746,7 +1747,7 @@
    "    print(f\"  🔴 High priority: {len(recs_df[recs_df['Priority'] == 'HIGH'])}\")\n",
    "    print(f\"  🟡 Medium priority: {len(recs_df[recs_df['Priority'] == 'MEDIUM'])}\")\n",
    "    print(f\"  🟢 Low priority: {len(recs_df[recs_df['Priority'] == 'LOW'])}\")\n",
-   "
+   "\n",
    "    display(recs_df)\n",
    "\n",
    "# Save updated findings and recommendations registry\n",
```
|