churnkit 0.76.0a3__py3-none-any.whl → 0.76.1a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +10 -5
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +11 -9
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +52 -46
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +68 -65
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +12 -27
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +216 -221
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +88 -81
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +111 -108
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +44 -38
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +89 -85
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +81 -80
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +83 -89
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +102 -98
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +32 -31
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +33 -29
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +6 -5
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +67 -63
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +38 -23
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +3 -1
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/METADATA +1 -1
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/RECORD +31 -31
- customer_retention/__init__.py +1 -1
- customer_retention/analysis/auto_explorer/explorer.py +2 -2
- customer_retention/analysis/notebook_progress.py +14 -2
- customer_retention/core/compat/__init__.py +10 -0
- customer_retention/core/config/experiments.py +45 -0
- customer_retention/integrations/databricks_init.py +41 -1
- customer_retention/stages/profiling/column_profiler.py +9 -2
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/WHEEL +0 -0
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/entry_points.txt +0 -0
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/licenses/LICENSE +0 -0
@@ -84,17 +84,20 @@
 "outputs": [],
 "source": [
 "from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
+"\n",
 "track_and_export_previous(\"03_quality_assessment.ipynb\")\n",
 "\n",
-"from customer_retention.analysis.auto_explorer import ExplorationFindings, RecommendationRegistry\n",
-"from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table\n",
-"from customer_retention.core.config.column_config import ColumnType\n",
 "import pandas as pd\n",
-"import numpy as np\n",
-"import plotly.graph_objects as go\n",
 "import plotly.express as px\n",
+"import plotly.graph_objects as go\n",
 "from plotly.subplots import make_subplots\n",
-"
+"\n",
+"from customer_retention.analysis.auto_explorer import ExplorationFindings, RecommendationRegistry\n",
+"from customer_retention.analysis.visualization import ChartBuilder, display_figure\n",
+"from customer_retention.core.config.column_config import ColumnType\n",
+"from customer_retention.core.config.experiments import (\n",
+" FINDINGS_DIR,\n",
+")"
 ]
 },
 {
@@ -125,6 +128,7 @@
 "\n",
 "# Option 2: Auto-discover the most recent findings file\n",
 "from pathlib import Path\n",
+"\n",
 "import yaml\n",
 "\n",
 "# FINDINGS_DIR imported from customer_retention.core.config.experiments\n",
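
Aside: the cell above leans on FINDINGS_DIR (newly imported from customer_retention.core.config.experiments) to auto-discover the latest findings file. A minimal sketch of that pattern, assuming findings are stored as YAML files under that directory (the exact file-naming scheme is not shown in this diff):

    from pathlib import Path

    import yaml

    # Hypothetical findings directory; in the notebook this comes from
    # customer_retention.core.config.experiments.FINDINGS_DIR.
    FINDINGS_DIR = Path("exploration_findings")

    # Treat the most recently modified YAML file as the latest findings.
    candidates = sorted(FINDINGS_DIR.glob("*.yaml"), key=lambda p: p.stat().st_mtime)
    if candidates:
        latest = candidates[-1]
        with latest.open() as fh:
            findings_doc = yaml.safe_load(fh)
        print(f"Auto-discovered findings file: {latest}")
    else:
        print(f"No findings files found under {FINDINGS_DIR}")
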
@@ -156,7 +160,7 @@
 "findings = ExplorationFindings.load(FINDINGS_PATH)\n",
 "\n",
 "# Load data - handle aggregated vs standard paths\n",
-"from customer_retention.stages.temporal import
+"from customer_retention.stages.temporal import TEMPORAL_METADATA_COLS, load_data_with_snapshot_preference\n",
 "\n",
 "# For aggregated data, load directly from the parquet source\n",
 "if \"_aggregated\" in FINDINGS_PATH and findings.source_path.endswith('.parquet'):\n",
@@ -193,7 +197,7 @@
 " entity_col = next((name for name, col in findings.columns.items() if col.inferred_type == ColumnType.IDENTIFIER), None)\n",
 " if entity_col:\n",
 " registry.init_silver(entity_col)\n",
-" print(
+" print(\"Initialized new recommendation registry\")\n",
 "\n",
 "print(f\"\\nLoaded findings for {findings.column_count} columns\")"
 ]
@@ -252,7 +256,7 @@
 "validator = DataValidator()\n",
 "\n",
 "# Auto-detect potential key columns\n",
-"potential_keys = [name for name, col in findings.columns.items()
+"potential_keys = [name for name, col in findings.columns.items()\n",
 " if col.inferred_type.value in ('identifier', 'id') or 'id' in name.lower()]\n",
 "KEY_COLUMN = potential_keys[0] if potential_keys else None\n",
 "\n",
@@ -266,7 +270,7 @@
 " print(f\"Total Rows: {dup_result.total_rows:,}\")\n",
 " print(f\"Unique Keys: {dup_result.unique_keys:,}\")\n",
 " print(f\"Duplicate Keys: {dup_result.duplicate_keys:,} ({dup_result.duplicate_percentage:.2f}%)\")\n",
-"
+"\n",
 " # Exact duplicates\n",
 " if dup_result.exact_duplicate_rows > 0:\n",
 " print(f\"\\n⚠️ Exact duplicate rows: {dup_result.exact_duplicate_rows:,}\")\n",
@@ -275,7 +279,7 @@
 " if len(dup_examples) > 0:\n",
 " print(\"\\nExample duplicate rows:\")\n",
 " display(dup_examples)\n",
-"
+"\n",
 " # Add deduplication recommendation for exact duplicates\n",
 " registry.add_bronze_deduplication(\n",
 " key_column=KEY_COLUMN, strategy=\"drop_exact_duplicates\",\n",
@@ -284,13 +288,13 @@
 " )\n",
 " else:\n",
 " print(\"\\n✓ No exact duplicate rows\")\n",
-"
+"\n",
 " # Value conflicts\n",
 " if dup_result.has_value_conflicts:\n",
 " print(f\"\\n⚠️ Value conflicts detected in: {', '.join(dup_result.conflict_columns[:5])}\")\n",
 " if findings.target_column and findings.target_column in dup_result.conflict_columns:\n",
 " print(f\" 🔴 CRITICAL: Target '{findings.target_column}' has conflicting values!\")\n",
-"
+"\n",
 " # Show examples of conflicting records\n",
 " key_counts = df[KEY_COLUMN].value_counts()\n",
 " dup_keys = key_counts[key_counts > 1].head(3).index.tolist()\n",
@@ -298,7 +302,7 @@
 " print(\"\\nExample records with duplicate keys:\")\n",
 " conflict_examples = df[df[KEY_COLUMN].isin(dup_keys)].sort_values(KEY_COLUMN).head(10)\n",
 " display(conflict_examples)\n",
-"
+"\n",
 " # Add deduplication recommendation for value conflicts\n",
 " registry.add_bronze_deduplication(\n",
 " key_column=KEY_COLUMN, strategy=\"keep_first\",\n",
@@ -308,7 +312,7 @@
 " )\n",
 " else:\n",
 " print(\"\\n✓ No value conflicts\")\n",
-"
+"\n",
 " # Duplicate frequency distribution\n",
 " if dup_result.duplicate_keys > 0:\n",
 " key_counts = df[KEY_COLUMN].value_counts()\n",
@@ -317,7 +321,7 @@
 " print(\"\\nDuplicate frequency distribution:\")\n",
 " for count, num_keys in dup_distribution.head(5).items():\n",
 " print(f\" Keys appearing {count}x: {num_keys:,}\")\n",
-"
+"\n",
 " # Recommendations\n",
 " print(\"\\n💡 RECOMMENDATIONS:\")\n",
 " if dup_result.exact_duplicate_rows > 0:\n",
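
The duplicate-key metrics printed in the hunks above come from the notebook's DataValidator result object, whose API is not part of this diff; a rough plain-pandas equivalent of the same counts (data, column names, and the percentage definition are illustrative) might look like:

    import pandas as pd

    # Illustrative frame; KEY_COLUMN stands in for the auto-detected identifier column.
    df = pd.DataFrame({"customer_id": [1, 1, 2, 3, 3, 3],
                       "plan": ["a", "a", "b", "c", "d", "d"]})
    KEY_COLUMN = "customer_id"

    total_rows = len(df)
    unique_keys = df[KEY_COLUMN].nunique()
    key_counts = df[KEY_COLUMN].value_counts()
    duplicate_keys = int((key_counts > 1).sum())
    exact_duplicate_rows = int(df.duplicated().sum())

    print(f"Total Rows: {total_rows:,}")
    print(f"Unique Keys: {unique_keys:,}")
    # Percentage shown relative to unique keys; the validator's exact definition may differ.
    print(f"Duplicate Keys: {duplicate_keys:,} ({duplicate_keys / unique_keys * 100:.2f}%)")
    print(f"Exact duplicate rows: {exact_duplicate_rows:,}")

    # Duplicate frequency distribution, as in the notebook cell.
    dup_distribution = key_counts[key_counts > 1].value_counts().sort_index()
    for count, num_keys in dup_distribution.items():
        print(f"  Keys appearing {count}x: {num_keys:,}")
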
@@ -435,7 +439,7 @@
 "if findings.target_column and findings.target_column in df.columns:\n",
 " target_series = df[findings.target_column]\n",
 " target_counts = target_series.value_counts().sort_index()\n",
-"
+"\n",
 " # Create distribution table\n",
 " dist_data = []\n",
 " for val, count in target_counts.items():\n",
@@ -445,10 +449,10 @@
 " \"count\": count,\n",
 " \"percentage\": f\"{pct:.3f}\"\n",
 " })\n",
-"
+"\n",
 " dist_df = pd.DataFrame(dist_data)\n",
 " display(dist_df)\n",
-"
+"\n",
 " # Calculate imbalance metrics\n",
 " if len(target_counts) == 2:\n",
 " majority = target_counts.max()\n",
@@ -456,10 +460,10 @@
 " minority_class = target_counts.idxmin()\n",
 " imbalance_ratio = majority / minority\n",
 " retention_rate = target_counts.get(1, 0) / len(df) * 100\n",
-"
+"\n",
 " print(f\"\\nImbalance ratio: {imbalance_ratio:.2f}:1 (minority class: {minority_class})\")\n",
 " print(f\"Retention rate: {retention_rate:.1f}%\")\n",
-"
+"\n",
 " # Business context\n",
 " if retention_rate > 70:\n",
 " print(f\"\\n📊 Business Context: {retention_rate:.0f}% retention is healthy!\")\n",
@@ -470,14 +474,14 @@
 " else:\n",
 " print(f\"\\n⚠️ Business Context: {retention_rate:.0f}% retention is concerning!\")\n",
 " print(\" High churn rate requires urgent attention.\")\n",
-"
+"\n",
 " # Modeling recommendations based on imbalance\n",
 " print(\"\\n⚠️ Class imbalance considerations for modeling:\")\n",
 " print(\" - Use stratified sampling for train/test splits\")\n",
 " print(\" - Consider class weights in model training\")\n",
 " print(\" - Evaluate with Precision-Recall AUC (not just ROC-AUC)\")\n",
 " print(\" - Focus on recall for churned class (catch at-risk customers)\")\n",
-"
+"\n",
 " # Add imbalance strategy recommendation\n",
 " if imbalance_ratio < 3:\n",
 " strategy = \"stratified_sampling\"\n",
@@ -491,7 +495,7 @@
 " strategy = \"smote\"\n",
 " rationale = f\"Severe imbalance ({imbalance_ratio:.2f}:1) - consider SMOTE\"\n",
 " print(\" - Consider SMOTE or undersampling (imbalance is severe)\")\n",
-"
+"\n",
 " registry.add_bronze_imbalance_strategy(\n",
 " target_column=findings.target_column,\n",
 " imbalance_ratio=imbalance_ratio,\n",
@@ -500,23 +504,23 @@
 " rationale=rationale,\n",
 " source_notebook=\"03_quality_assessment\"\n",
 " )\n",
-"
+"\n",
 " # Visualization\n",
 " fig = make_subplots(rows=1, cols=2, specs=[[{\"type\": \"pie\"}, {\"type\": \"bar\"}]],\n",
 " subplot_titles=[\"Class Distribution\", \"Count Comparison\"])\n",
-"
+"\n",
 " labels = [f\"{'Retained' if v == 1 else 'Churned'} ({v})\" for v in target_counts.index]\n",
 " fig.add_trace(go.Pie(labels=labels, values=target_counts.values, hole=0.4,\n",
 " marker_colors=[\"#2ecc71\", \"#e74c3c\"]), row=1, col=1)\n",
 " fig.add_trace(go.Bar(x=labels, y=target_counts.values,\n",
 " marker_color=[\"#e74c3c\", \"#2ecc71\"]), row=1, col=2)\n",
-"
+"\n",
 " fig.update_layout(height=350, title_text=\"Target Variable Distribution\",\n",
 " showlegend=False, template=\"plotly_white\")\n",
 " display_figure(fig)\n",
 " else:\n",
 " print(f\"\\nMulticlass target with {len(target_counts)} classes\")\n",
-"
+"\n",
 " fig = go.Figure(go.Bar(x=[str(v) for v in target_counts.index], y=target_counts.values,\n",
 " marker_color=px.colors.qualitative.Set2[:len(target_counts)]))\n",
 " fig.update_layout(height=350, title_text=\"Target Variable Distribution\",\n",
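
For reference, the imbalance metrics and strategy selection in the hunks above can be reproduced standalone. This sketch uses a made-up 0/1 target; the middle (class-weights) branch and its threshold are assumptions, since only the "< 3" and SMOTE branches appear in this diff:

    import pandas as pd

    # Made-up binary target: 1 = retained, 0 = churned.
    target = pd.Series([1] * 800 + [0] * 200)
    target_counts = target.value_counts().sort_index()

    majority = target_counts.max()
    minority = target_counts.min()
    minority_class = target_counts.idxmin()
    imbalance_ratio = majority / minority
    retention_rate = target_counts.get(1, 0) / len(target) * 100

    print(f"Imbalance ratio: {imbalance_ratio:.2f}:1 (minority class: {minority_class})")
    print(f"Retention rate: {retention_rate:.1f}%")

    if imbalance_ratio < 3:
        strategy = "stratified_sampling"
    elif imbalance_ratio < 10:      # assumed middle branch, not shown in this hunk
        strategy = "class_weights"
    else:
        strategy = "smote"
    print(f"Suggested strategy: {strategy}")
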
@@ -593,7 +597,7 @@
 " missing_df = pd.DataFrame(missing_data).sort_values(\"Missing Count\", ascending=False)\n",
 " print(\"Columns with Missing Values:\")\n",
 " display(missing_df)\n",
-"
+"\n",
 " fig = charts.bar_chart(\n",
 " missing_df[\"Column\"].tolist(),\n",
 " [float(x.replace(\"%\", \"\")) for x in missing_df[\"Missing %\"].tolist()],\n",
@@ -742,7 +746,7 @@
 "\n",
 "if numeric_cols:\n",
 " analyzer = SegmentAwareOutlierAnalyzer(max_segments=5)\n",
-"
+"\n",
 " # Run segment-aware analysis\n",
 " segment_result = analyzer.analyze(\n",
 " df,\n",
@@ -750,31 +754,31 @@
 " segment_col=SEGMENT_COL,\n",
 " target_col=findings.target_column\n",
 " )\n",
-"
-" print(
+"\n",
+" print(\"\\n📊 SEGMENTATION RESULTS:\")\n",
 " print(f\" Segments detected: {segment_result.n_segments}\")\n",
-"
+"\n",
 " if segment_result.n_segments > 1:\n",
-" print(
+" print(\"\\n📈 GLOBAL VS SEGMENT OUTLIER COMPARISON:\")\n",
 " print(\"-\" * 60)\n",
-"
+"\n",
 " comparison_data = []\n",
 " for col in numeric_cols:\n",
 " global_outliers = segment_result.global_analysis[col].outliers_detected\n",
 " segment_outliers = sum(\n",
-" seg[col].outliers_detected
+" seg[col].outliers_detected\n",
 " for seg in segment_result.segment_analysis.values()\n",
 " if col in seg\n",
 " )\n",
 " false_outliers = segment_result.false_outliers.get(col, 0)\n",
-"
+"\n",
 " if global_outliers > 0:\n",
 " reduction_pct = (global_outliers - segment_outliers) / global_outliers * 100\n",
 " false_pct = false_outliers / global_outliers * 100\n",
 " else:\n",
 " reduction_pct = 0\n",
 " false_pct = 0\n",
-"
+"\n",
 " comparison_data.append({\n",
 " \"Feature\": col,\n",
 " \"Global Outliers\": global_outliers,\n",
@@ -782,13 +786,13 @@
 " \"False Outliers\": false_outliers,\n",
 " \"Reduction\": f\"{reduction_pct:.1f}%\"\n",
 " })\n",
-"
+"\n",
 " comparison_df = pd.DataFrame(comparison_data)\n",
 " display(comparison_df)\n",
-"
+"\n",
 " # Show false outlier analysis\n",
 " has_false_outliers = any(segment_result.false_outliers.get(col, 0) > 0 for col in numeric_cols)\n",
-"
+"\n",
 " if has_false_outliers:\n",
 " print(\"\\n⚠️ FALSE OUTLIERS DETECTED:\")\n",
 " print(\" (Global outliers that are normal within their segment)\")\n",
@@ -797,14 +801,14 @@
 " global_count = segment_result.global_analysis[col].outliers_detected\n",
 " pct = count / global_count * 100 if global_count > 0 else 0\n",
 " print(f\" • {col}: {count} false outliers ({pct:.1f}% of global)\")\n",
-"
+"\n",
 " # Recommendations\n",
 " print(\"\\n💡 RECOMMENDATIONS:\")\n",
 " if segment_result.segmentation_recommended:\n",
 " print(\" ✅ SEGMENT-SPECIFIC OUTLIER TREATMENT RECOMMENDED\")\n",
 " for rec in segment_result.recommendations:\n",
 " print(f\" • {rec}\")\n",
-"
+"\n",
 " # Add outlier recommendations for columns with high false outlier rate\n",
 " for col, count in segment_result.false_outliers.items():\n",
 " if count > 0:\n",
@@ -819,27 +823,27 @@
 " )\n",
 " else:\n",
 " print(\" ℹ️ Global outlier treatment is appropriate for this data\")\n",
-"
+"\n",
 " # Rationale\n",
 " print(\"\\n📋 RATIONALE:\")\n",
 " for rationale in segment_result.rationale:\n",
 " print(f\" • {rationale}\")\n",
-"
+"\n",
 " # Visualization: Compare outlier counts\n",
 " cols_with_diff = [\n",
 " row[\"Feature\"] for _, row in comparison_df.iterrows()\n",
 " if row[\"Global Outliers\"] > 0 and row[\"Global Outliers\"] != row[\"Segment Outliers\"]\n",
 " ]\n",
-"
+"\n",
 " if cols_with_diff and len(cols_with_diff) <= 8:\n",
 " fig = go.Figure()\n",
-"
+"\n",
 " global_counts = [comparison_df[comparison_df[\"Feature\"] == c][\"Global Outliers\"].values[0] for c in cols_with_diff]\n",
 " segment_counts = [comparison_df[comparison_df[\"Feature\"] == c][\"Segment Outliers\"].values[0] for c in cols_with_diff]\n",
-"
+"\n",
 " fig.add_trace(go.Bar(name=\"Global Outliers\", x=cols_with_diff, y=global_counts, marker_color=\"#e74c3c\"))\n",
 " fig.add_trace(go.Bar(name=\"Segment Outliers\", x=cols_with_diff, y=segment_counts, marker_color=\"#2ecc71\"))\n",
-"
+"\n",
 " fig.update_layout(\n",
 " barmode=\"group\",\n",
 " title=\"Global vs Segment-Specific Outlier Detection\",\n",
@@ -852,7 +856,7 @@
 " else:\n",
 " print(\"\\n ℹ️ Data appears homogeneous (single segment)\")\n",
 " print(\" → Proceeding with standard global outlier detection\")\n",
-"
+"\n",
 " # Store result in findings metadata for use in later notebooks\n",
 " findings.metadata[\"segment_aware_analysis\"] = {\n",
 " \"n_segments\": segment_result.n_segments,\n",
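
SegmentAwareOutlierAnalyzer itself is not shown in this diff, but the global-versus-segment comparison it reports can be approximated with plain pandas. A sketch with invented segments, where values that are normal within a small segment look like outliers against the global distribution:

    import numpy as np
    import pandas as pd

    def iqr_outlier_count(series: pd.Series) -> int:
        """Count values outside Q1 - 1.5*IQR .. Q3 + 1.5*IQR."""
        q1, q3 = series.quantile(0.25), series.quantile(0.75)
        iqr = q3 - q1
        return int(((series < q1 - 1.5 * iqr) | (series > q3 + 1.5 * iqr)).sum())

    rng = np.random.default_rng(0)
    # Two segments with very different scales.
    df = pd.DataFrame({
        "segment": ["a"] * 500 + ["b"] * 100,
        "spend": np.concatenate([rng.normal(50, 5, 500), rng.normal(200, 10, 100)]),
    })

    global_outliers = iqr_outlier_count(df["spend"])
    segment_outliers = sum(iqr_outlier_count(g["spend"]) for _, g in df.groupby("segment"))
    reduction_pct = (global_outliers - segment_outliers) / global_outliers * 100 if global_outliers else 0

    print(f"Global outliers: {global_outliers}")
    print(f"Segment outliers: {segment_outliers}")
    print(f"Reduction: {reduction_pct:.1f}%")
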
@@ -934,11 +938,11 @@
 " iqr = q3 - q1\n",
 " lower_bound = q1 - 1.5 * iqr\n",
 " upper_bound = q3 + 1.5 * iqr\n",
-"
+"\n",
 " outliers_low = (series < lower_bound).sum()\n",
 " outliers_high = (series > upper_bound).sum()\n",
 " total_outliers = outliers_low + outliers_high\n",
-"
+"\n",
 " outlier_data.append({\n",
 " \"feature\": col_name,\n",
 " \"Q1\": q1,\n",
@@ -956,7 +960,7 @@
 "\n",
 "# Display IQR bounds table\n",
 "print(\"\\n📊 IQR BOUNDS TABLE:\")\n",
-"bounds_display = outlier_df[[\"feature\", \"Q1\", \"Q3\", \"IQR\", \"lower_bound\", \"upper_bound\"
+"bounds_display = outlier_df[[\"feature\", \"Q1\", \"Q3\", \"IQR\", \"lower_bound\", \"upper_bound\",\n",
 " \"outliers_low\", \"outliers_high\"]].copy()\n",
 "for col in [\"Q1\", \"Q3\", \"IQR\", \"lower_bound\", \"upper_bound\"]:\n",
 " bounds_display[col] = bounds_display[col].apply(lambda x: f\"{x:.2f}\")\n",
@@ -974,11 +978,11 @@
 " print(f\" Below lower: {row['outliers_low']:,}\")\n",
 " if row[\"outliers_high\"] > 0:\n",
 " print(f\" Above upper: {row['outliers_high']:,}\")\n",
-"
+"\n",
 " # Determine action and add recommendation (skip if segment-aware already added)\n",
 " col_name = row['feature']\n",
 " existing_outlier_recs = [r for r in registry.bronze.outlier_handling if r.target_column == col_name]\n",
-"
+"\n",
 " if not existing_outlier_recs and row[\"outlier_pct\"] > 5: # Only add if significant and not already handled\n",
 " if row[\"outlier_pct\"] > 10:\n",
 " action = \"log_transform\"\n",
@@ -988,7 +992,7 @@
 " action = \"winsorize\"\n",
 " rationale = f\"{row['outlier_pct']:.1f}% outliers - winsorize to 1st/99th percentile\"\n",
 " print(\" → Consider Winsorization (clip to 1st/99th percentile)\")\n",
-"
+"\n",
 " registry.add_bronze_outlier(\n",
 " column=col_name, action=action,\n",
 " parameters={\"method\": \"iqr\", \"lower_bound\": row[\"lower_bound\"], \"upper_bound\": row[\"upper_bound\"]},\n",
@@ -1003,16 +1007,16 @@
 "# Box plots for columns with outliers\n",
 "if len(cols_with_outliers) > 0 and len(cols_with_outliers) <= 6:\n",
 " outlier_cols = cols_with_outliers[\"feature\"].tolist()\n",
-"
+"\n",
 " fig = make_subplots(rows=1, cols=len(outlier_cols), subplot_titles=outlier_cols)\n",
-"
+"\n",
 " for i, col in enumerate(outlier_cols, 1):\n",
 " fig.add_trace(\n",
 " go.Box(y=df[col].dropna(), name=col, boxpoints=\"outliers\",\n",
 " marker_color=\"#3498db\", showlegend=False),\n",
 " row=1, col=i\n",
 " )\n",
-"
+"\n",
 " fig.update_layout(height=400, title_text=\"Outlier Distribution (Box Plots)\",\n",
 " template=\"plotly_white\")\n",
 " display_figure(fig)"
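
The IQR bounds and the action thresholds applied above (act only above 5% outliers, prefer a log transform above 10%, otherwise winsorize to the 1st/99th percentile) can be sketched standalone on synthetic data:

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(1)
    # ~8% of values drawn from a much higher distribution.
    series = pd.Series(np.append(rng.normal(100, 10, 920), rng.normal(400, 50, 80)))

    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    outliers_low = int((series < lower_bound).sum())
    outliers_high = int((series > upper_bound).sum())
    outlier_pct = (outliers_low + outliers_high) / len(series) * 100
    print(f"Bounds: [{lower_bound:.2f}, {upper_bound:.2f}]  outliers: {outlier_pct:.1f}%")

    # Mirrors the notebook's decision rule.
    if outlier_pct > 10:
        action = "log_transform"
    elif outlier_pct > 5:
        action = "winsorize"
    else:
        action = "none"
    print(f"Recommended action: {action}")

    if action == "winsorize":
        clipped = series.clip(series.quantile(0.01), series.quantile(0.99))
        print(f"Max after winsorizing: {clipped.max():.2f}")
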
@@ -1082,7 +1086,7 @@
 " print(f\"Loaded date sequence from findings: {DATE_SEQUENCE}\")\n",
 "\n",
 "# Detect date columns from findings\n",
-"date_cols = [name for name, col in findings.columns.items()
+"date_cols = [name for name, col in findings.columns.items()\n",
 " if col.inferred_type == ColumnType.DATETIME]\n",
 "\n",
 "print(\"=\" * 60)\n",
@@ -1094,12 +1098,12 @@
 " df_dates = df.copy()\n",
 " for col in date_cols:\n",
 " df_dates[col] = pd.to_datetime(df_dates[col], errors='coerce', format='mixed')\n",
-"
+"\n",
 " # Date ranges\n",
 " print(\"\\n📅 DATE RANGES:\")\n",
 " for col in date_cols:\n",
 " print(f\" {col}: {df_dates[col].min()} to {df_dates[col].max()}\")\n",
-"
+"\n",
 " # Placeholder detection\n",
 " print(\"\\n🕵️ PLACEHOLDER DATE DETECTION:\")\n",
 " for col in date_cols:\n",
@@ -1108,14 +1112,14 @@
 " print(f\" {col}: {old_dates:,} dates before 2005 (possible placeholders)\")\n",
 " else:\n",
 " print(f\" {col}: No suspicious early dates\")\n",
-"
+"\n",
 " # Sequence validation\n",
 " if DATE_SEQUENCE and len(DATE_SEQUENCE) >= 2:\n",
 " valid_sequence_cols = [c for c in DATE_SEQUENCE if c in date_cols]\n",
 " if len(valid_sequence_cols) >= 2:\n",
-" print(
+" print(\"\\n🔗 DATE SEQUENCE VALIDATION:\")\n",
 " print(f\" Expected order: {' ≤ '.join(valid_sequence_cols)}\")\n",
-"
+"\n",
 " total_violations = 0\n",
 " for i in range(len(valid_sequence_cols) - 1):\n",
 " col1, col2 = valid_sequence_cols[i], valid_sequence_cols[i + 1]\n",
@@ -1123,13 +1127,13 @@
 " mask = df_dates[col1].notna() & df_dates[col2].notna()\n",
 " violations = (df_dates.loc[mask, col2] < df_dates.loc[mask, col1]).sum()\n",
 " total_violations += violations\n",
-"
+"\n",
 " if violations > 0:\n",
 " pct = violations / mask.sum() * 100\n",
 " print(f\" ⚠️ {col2} < {col1}: {violations:,} violations ({pct:.2f}%)\")\n",
 " else:\n",
 " print(f\" ✓ {col1} ≤ {col2}: No violations\")\n",
-"
+"\n",
 " if total_violations == 0:\n",
 " print(\"\\n ✅ All date sequences valid\")\n",
 " else:\n",
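
The pairwise sequence check above works for any ordered list of date columns; a compact standalone version with invented column names and data:

    import pandas as pd

    # Invented example: signup should precede first_order, which should precede churn.
    df_dates = pd.DataFrame({
        "signup_date": pd.to_datetime(["2021-01-01", "2021-02-01", "2021-03-01"]),
        "first_order_date": pd.to_datetime(["2021-01-05", "2021-01-15", None]),
        "churn_date": pd.to_datetime(["2021-06-01", "2021-01-10", "2021-09-01"]),
    })
    DATE_SEQUENCE = ["signup_date", "first_order_date", "churn_date"]

    total_violations = 0
    for col1, col2 in zip(DATE_SEQUENCE, DATE_SEQUENCE[1:]):
        mask = df_dates[col1].notna() & df_dates[col2].notna()
        violations = int((df_dates.loc[mask, col2] < df_dates.loc[mask, col1]).sum())
        total_violations += violations
        print(f"{col1} <= {col2}: {violations} violation(s)")

    print("All date sequences valid" if total_violations == 0
          else f"{total_violations} total violation(s)")
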
@@ -1186,7 +1190,7 @@
 },
 "outputs": [],
 "source": [
-"binary_cols = [name for name, col in findings.columns.items()
+"binary_cols = [name for name, col in findings.columns.items()\n",
 " if col.inferred_type == ColumnType.BINARY\n",
 " and name not in TEMPORAL_METADATA_COLS]\n",
 "\n",
@@ -1203,7 +1207,7 @@
 " count_0 = (df[col] == 0).sum()\n",
 " count_1 = (df[col] == 1).sum()\n",
 " total = count_0 + count_1\n",
-"
+"\n",
 " binary_results.append({\n",
 " 'column': col,\n",
 " 'unique_values': unique_vals,\n",
@@ -1212,13 +1216,13 @@
 " 'count_1': count_1,\n",
 " 'pct_1': count_1 / total * 100 if total > 0 else 0\n",
 " })\n",
-"
+"\n",
 " status = \"✓\" if is_valid else \"⚠️\"\n",
 " print(f\"\\n{status} {col}:\")\n",
 " print(f\" Unique values: {unique_vals}\")\n",
 " print(f\" 0 (No): {count_0:,} ({count_0/total*100:.1f}%)\")\n",
 " print(f\" 1 (Yes): {count_1:,} ({count_1/total*100:.1f}%)\")\n",
-"
+"\n",
 " if not is_valid:\n",
 " invalid_vals = [v for v in unique_vals if v not in [0, 1, 0.0, 1.0]]\n",
 " print(f\" ⚠️ Invalid values found: {invalid_vals}\")\n",
@@ -1226,7 +1230,7 @@
 " if len(binary_cols) <= 6:\n",
 " n_cols = len(binary_cols)\n",
 " fig = make_subplots(rows=1, cols=n_cols, subplot_titles=binary_cols)\n",
-"
+"\n",
 " for i, col in enumerate(binary_cols, 1):\n",
 " counts = df[col].value_counts().sort_index()\n",
 " fig.add_trace(\n",
@@ -1234,7 +1238,7 @@
 " marker_color=['#d62728', '#2ca02c'], showlegend=False),\n",
 " row=1, col=i\n",
 " )\n",
-"
+"\n",
 " fig.update_layout(height=350, title_text=\"Binary Field Distributions\",\n",
 " template='plotly_white')\n",
 " display_figure(fig)\n",
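
The binary-field check above boils down to asserting that the observed values are a subset of {0, 1}; a self-contained version on invented data:

    import pandas as pd

    df = pd.DataFrame({"has_discount": [0, 1, 1, 0, 1], "auto_renew": [0, 1, 2, 1, 0]})

    for col in ["has_discount", "auto_renew"]:
        unique_vals = df[col].dropna().unique().tolist()
        is_valid = set(unique_vals) <= {0, 1, 0.0, 1.0}
        count_0 = int((df[col] == 0).sum())
        count_1 = int((df[col] == 1).sum())
        total = count_0 + count_1
        status = "OK" if is_valid else "WARN"
        print(f"[{status}] {col}: 0 -> {count_0} ({count_0 / total * 100:.1f}%), "
              f"1 -> {count_1} ({count_1 / total * 100:.1f}%)")
        if not is_valid:
            invalid_vals = [v for v in unique_vals if v not in (0, 1, 0.0, 1.0)]
            print(f"  Invalid values found: {invalid_vals}")
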
@@ -1295,7 +1299,7 @@
 " if lower_val not in case_variants:\n",
 " case_variants[lower_val] = []\n",
 " case_variants[lower_val].append(val)\n",
-"
+"\n",
 " for lower_val, variants in case_variants.items():\n",
 " if len(variants) > 1:\n",
 " consistency_issues.append({\n",
@@ -1308,7 +1312,7 @@
 "if consistency_issues:\n",
 " print(\"Data Consistency Issues:\")\n",
 " display(pd.DataFrame([{k: v for k, v in issue.items() if k != \"variants\"} for issue in consistency_issues]))\n",
-"
+"\n",
 " # Add consistency recommendations\n",
 " for issue in consistency_issues:\n",
 " registry.add_bronze_consistency(\n",
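
The case-consistency scan above groups each categorical value by its lowercased form and flags values that appear under more than one spelling; roughly, on invented data:

    import pandas as pd

    df = pd.DataFrame({"plan_type": ["Basic", "basic", "BASIC", "Premium", "premium", "Trial"]})

    consistency_issues = []
    for col in ["plan_type"]:
        case_variants = {}
        for val in df[col].dropna().unique():
            case_variants.setdefault(str(val).lower(), []).append(val)
        for lower_val, variants in case_variants.items():
            if len(variants) > 1:
                consistency_issues.append({"column": col, "value": lower_val,
                                           "n_variants": len(variants)})

    if consistency_issues:
        print("Data Consistency Issues:")
        print(pd.DataFrame(consistency_issues))
    else:
        print("No case-variant issues found")
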
@@ -1378,7 +1382,7 @@
 " high_severity = [r for r in cleaning_recs if r.severity == \"high\"]\n",
 " medium_severity = [r for r in cleaning_recs if r.severity == \"medium\"]\n",
 " low_severity = [r for r in cleaning_recs if r.severity == \"low\"]\n",
-"
+"\n",
 " if high_severity:\n",
 " print(\"\\n🔴 HIGH PRIORITY (must fix before modeling):\")\n",
 " print(\"-\" * 60)\n",
@@ -1391,7 +1395,7 @@
 " print(\" Action Steps:\")\n",
 " for step in rec.action_steps:\n",
 " print(f\" • {step}\")\n",
-"
+"\n",
 " if medium_severity:\n",
 " print(\"\\n🟡 MEDIUM PRIORITY (recommended fixes):\")\n",
 " print(\"-\" * 60)\n",
@@ -1404,7 +1408,7 @@
 " print(\" Action Steps:\")\n",
 " for step in rec.action_steps:\n",
 " print(f\" • {step}\")\n",
-"
+"\n",
 " if low_severity:\n",
 " print(\"\\n🟢 LOW PRIORITY (nice to have):\")\n",
 " print(\"-\" * 60)\n",
@@ -1413,13 +1417,13 @@
 " print(f\" Issue: {rec.description}\")\n",
 " print(f\" Strategy: {rec.strategy_label}\")\n",
 " print(f\" Impact: {rec.problem_impact}\")\n",
-"
+"\n",
 " # Persist cleaning recommendations to registry\n",
 " for rec in cleaning_recs:\n",
 " # Check if not already added by previous sections\n",
 " existing_null = [r for r in registry.bronze.null_handling if r.target_column == rec.column_name]\n",
 " existing_outlier = [r for r in registry.bronze.outlier_handling if r.target_column == rec.column_name]\n",
-"
+"\n",
 " if rec.issue_type in [\"null_values\", \"missing_values\"] and not existing_null:\n",
 " strategy = \"median\" if \"median\" in rec.strategy.lower() else \"mode\" if \"mode\" in rec.strategy.lower() else \"drop\"\n",
 " registry.add_bronze_null(\n",
@@ -1436,12 +1440,12 @@
 " rationale=rec.description,\n",
 " source_notebook=\"03_quality_assessment\"\n",
 " )\n",
-"
+"\n",
 " # Summary table\n",
 " print(\"\\n\" + \"=\" * 80)\n",
 " print(\"CLEANUP SUMMARY\")\n",
 " print(\"=\" * 80)\n",
-"
+"\n",
 " summary_data = []\n",
 " for rec in cleaning_recs:\n",
 " summary_data.append({\n",
@@ -1451,10 +1455,10 @@
 " \"Recommended Action\": rec.strategy_label,\n",
 " \"Affected Rows\": f\"{rec.affected_rows:,}\"\n",
 " })\n",
-"
+"\n",
 " summary_df = pd.DataFrame(summary_data)\n",
 " display(summary_df)\n",
-"
+"\n",
 " # Total impact\n",
 " total_affected = sum(r.affected_rows for r in cleaning_recs)\n",
 " unique_affected = min(total_affected, len(df)) # Can't exceed total rows\n",
@@ -1513,7 +1517,7 @@
 "\n",
 "# Summary of recommendations\n",
 "all_recs = registry.all_recommendations\n",
-"print(
+"print(\"\\n📋 Recommendations Summary:\")\n",
 "print(f\" Bronze layer: {len(registry.get_by_layer('bronze'))} recommendations\")\n",
 "if registry.silver:\n",
 " print(f\" Silver layer: {len(registry.get_by_layer('silver'))} recommendations\")\n",