churnkit 0.76.0a3__py3-none-any.whl → 0.76.1a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +10 -5
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +11 -9
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +52 -46
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +68 -65
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +12 -27
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +216 -221
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +88 -81
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +111 -108
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +44 -38
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +89 -85
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +81 -80
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +83 -89
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +102 -98
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +32 -31
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +33 -29
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +6 -5
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +67 -63
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +38 -23
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +3 -1
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/METADATA +1 -1
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/RECORD +31 -31
- customer_retention/__init__.py +1 -1
- customer_retention/analysis/auto_explorer/explorer.py +2 -2
- customer_retention/analysis/notebook_progress.py +14 -2
- customer_retention/core/compat/__init__.py +10 -0
- customer_retention/core/config/experiments.py +45 -0
- customer_retention/integrations/databricks_init.py +41 -1
- customer_retention/stages/profiling/column_profiler.py +9 -2
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/WHEEL +0 -0
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/entry_points.txt +0 -0
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/licenses/LICENSE +0 -0
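The recurring change across the exploration notebooks is setup-cell consolidation: standard-library and plotting imports are grouped at the top of the cell, per-notebook path handling (`from pathlib import Path`) is dropped, and a shared `FINDINGS_DIR` is imported from the new `customer_retention.core.config.experiments` module (+45 lines) in place of the old `FINDINGS_DIR, EXPERIMENTS_DIR, OUTPUT_DIR, setup_experiments_structure` import. A minimal sketch of the resulting setup-cell shape, assembled only from import and loader lines visible in the hunks below (the exact cell composition varies per notebook and is an assumption here):

    # Sketch of the consolidated setup cell in 0.76.1a2, composed from lines
    # shown in the diff hunks below; not copied verbatim from any one cell.
    import numpy as np
    import pandas as pd
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots

    from customer_retention.analysis.auto_explorer import ExplorationFindings
    from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table
    from customer_retention.core.config.experiments import FINDINGS_DIR
    from customer_retention.stages.temporal import TEMPORAL_METADATA_COLS, load_data_with_snapshot_preference

    # `findings` is an ExplorationFindings object loaded earlier in each notebook;
    # the loader call itself appears unchanged in this release.
    # df, data_source = load_data_with_snapshot_preference(findings, output_dir=str(FINDINGS_DIR))
    # charts = ChartBuilder()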
@@ -96,22 +96,26 @@
 "outputs": [],
 "source": [
 "from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
+"\n",
 "track_and_export_previous(\"01a_temporal_deep_dive.ipynb\")\n",
 "\n",
+"import numpy as np\n",
+"import pandas as pd\n",
+"import plotly.graph_objects as go\n",
+"from plotly.subplots import make_subplots\n",
+"\n",
 "from customer_retention.analysis.auto_explorer import ExplorationFindings\n",
 "from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table\n",
 "from customer_retention.core.config.column_config import ColumnType, DatasetGranularity\n",
+"from customer_retention.core.config.experiments import (\n",
+" FINDINGS_DIR,\n",
+")\n",
 "from customer_retention.stages.profiling import (\n",
-"
+" DistributionAnalyzer,\n",
+" TimeSeriesProfiler,\n",
+" TransformationType,\n",
 " TypeDetector,\n",
-"
-")\n",
-"import pandas as pd\n",
-"import numpy as np\n",
-"import plotly.graph_objects as go\n",
-"import plotly.express as px\n",
-"from plotly.subplots import make_subplots\n",
-"from customer_retention.core.config.experiments import FINDINGS_DIR, EXPERIMENTS_DIR, OUTPUT_DIR, setup_experiments_structure\n"
+")\n"
 ]
 },
 {
@@ -141,7 +145,6 @@
 "# FINDINGS_PATH = \"../experiments/findings/transactions_abc123_findings.yaml\"\n",
 "\n",
 "# Option 2: Auto-discover findings files\n",
-"from pathlib import Path\n",
 "\n",
 "# FINDINGS_DIR imported from customer_retention.core.config.experiments\n",
 "\n",
@@ -238,7 +241,7 @@
 },
 "outputs": [],
 "source": [
-"from customer_retention.stages.temporal import
+"from customer_retention.stages.temporal import TEMPORAL_METADATA_COLS, load_data_with_snapshot_preference\n",
 "\n",
 "df, data_source = load_data_with_snapshot_preference(findings, output_dir=str(FINDINGS_DIR))\n",
 "charts = ChartBuilder()\n",
@@ -280,7 +283,7 @@
 " # Manual configuration - uncomment and set if auto-detection failed\n",
 " # ENTITY_COLUMN = \"customer_id\"\n",
 " # TIME_COLUMN = \"event_date\"\n",
-"
+"\n",
 " # Try auto-detection\n",
 " detector = TypeDetector()\n",
 " granularity = detector.detect_granularity(df)\n",
@@ -346,17 +349,17 @@
 "print(\"=\"*70)\n",
 "print(\"TIME SERIES PROFILE SUMMARY\")\n",
 "print(\"=\"*70)\n",
-"print(
+"print(\"\\n\\U0001f4ca Dataset Overview:\")\n",
 "print(f\" Total Events: {ts_profile.total_events:,}\")\n",
 "print(f\" Unique Entities: {ts_profile.unique_entities:,}\")\n",
 "print(f\" Avg Events/Entity: {ts_profile.events_per_entity.mean:.1f}\")\n",
 "print(f\" Time Span: {ts_profile.time_span_days:,} days ({ts_profile.time_span_days/365:.1f} years)\")\n",
 "\n",
-"print(
+"print(\"\\n\\U0001f4c5 Date Range:\")\n",
 "print(f\" First Event: {ts_profile.first_event_date}\")\n",
 "print(f\" Last Event: {ts_profile.last_event_date}\")\n",
 "\n",
-"print(
+"print(\"\\n\\u23f1\\ufe0f Inter-Event Timing:\")\n",
 "if ts_profile.avg_inter_event_days is not None:\n",
 " print(f\" Avg Days Between Events: {ts_profile.avg_inter_event_days:.1f}\")\n",
 "else:\n",
@@ -791,7 +794,7 @@
 },
 "outputs": [],
 "source": [
-"print(
+"print(\"Coverage Summary:\")\n",
 "print(f\" Time span: {coverage_result.time_span_days:,} days \"\n",
 " f\"({coverage_result.first_event.strftime('%Y-%m-%d')} to {coverage_result.last_event.strftime('%Y-%m-%d')})\")\n",
 "print(f\" Volume trend: {coverage_result.volume_trend} ({coverage_result.volume_change_pct:+.0%})\")\n",
@@ -799,11 +802,11 @@
 " + (f\" ({sum(g.duration_days for g in coverage_result.gaps):.0f} total days)\" if coverage_result.gaps else \"\"))\n",
 "\n",
 "if coverage_result.recommendations:\n",
-" print(
+" print(\"\\nRecommendations:\")\n",
 " for rec in coverage_result.recommendations:\n",
 " print(f\" -> {rec}\")\n",
 "else:\n",
-" print(
+" print(\"\\nNo coverage issues detected — data is suitable for all candidate windows.\")"
 ]
 },
 {
@@ -843,7 +846,7 @@
 "if drift.recommended_training_start:\n",
 " print(f\" Recommended training start: {drift.recommended_training_start.strftime('%Y-%m-%d')}\")\n",
 "\n",
-"print(
+"print(\"\\nRationale:\")\n",
 "for r in drift.rationale:\n",
 " print(f\" -> {r}\")"
 ]
@@ -904,7 +907,7 @@
 "\n",
 "if inter_event_times:\n",
 " inter_event_series = pd.Series(inter_event_times)\n",
-"
+"\n",
 " print(\"\\u23f1\\ufe0f Inter-Event Time Distribution (days):\")\n",
 " print(f\" Min: {inter_event_series.min():.2f}\")\n",
 " print(f\" 25th percentile: {inter_event_series.quantile(0.25):.2f}\")\n",
@@ -912,14 +915,14 @@
 " print(f\" Mean: {inter_event_series.mean():.2f}\")\n",
 " print(f\" 75th percentile: {inter_event_series.quantile(0.75):.2f}\")\n",
 " print(f\" Max: {inter_event_series.max():.2f}\")\n",
-"
+"\n",
 " # Histogram\n",
 " fig = go.Figure()\n",
-"
+"\n",
 " # Cap at 99th percentile for visualization\n",
 " cap = inter_event_series.quantile(0.99)\n",
 " display_data = inter_event_series[inter_event_series <= cap]\n",
-"
+"\n",
 " fig.add_trace(go.Histogram(\n",
 " x=display_data,\n",
 " nbinsx=50,\n",
@@ -927,11 +930,11 @@
 " marker_color=\"coral\",\n",
 " opacity=0.7\n",
 " ))\n",
-"
+"\n",
 " fig.add_vline(x=inter_event_series.median(), line_dash=\"solid\", line_color=\"green\",\n",
 " annotation_text=f\"Median: {inter_event_series.median():.1f} days\",\n",
 " annotation_position=\"top right\")\n",
-"
+"\n",
 " fig.update_layout(\n",
 " title=f\"Inter-Event Time Distribution (capped at {cap:.0f} days = 99th percentile)\",\n",
 " xaxis_title=\"Days Between Events\",\n",
@@ -978,7 +981,7 @@
 " if skew_ratio > 1.5:\n",
 " print(f\" Distribution is heavily right-skewed (mean/median = {skew_ratio:.2f})\")\n",
 " print(f\" -> Most entities engage frequently (median {median_iet:.0f}d between events)\")\n",
-" print(
+" print(\" -> A long tail of entities has very infrequent engagement\")\n",
 " elif skew_ratio > 1.2:\n",
 " print(f\" Distribution is moderately right-skewed (mean/median = {skew_ratio:.2f})\")\n",
 " print(f\" -> Typical engagement every {median_iet:.0f} days, with some long gaps\")\n",
@@ -988,11 +991,11 @@
 "\n",
 " print(f\"\\n Spread: IQR = {iqr:.0f} days (Q25={q25:.0f}d to Q75={q75:.0f}d)\")\n",
 " if iqr > median_iet:\n",
-" print(
+" print(\" -> High variability (IQR > median) — entities have inconsistent timing\")\n",
 " else:\n",
-" print(
+" print(\" -> Moderate variability — most entities follow a similar cadence\")\n",
 "\n",
-" print(
+" print(\"\\nRecommendations:\")\n",
 " # Window alignment\n",
 " window_map = [(1, \"24h\"), (7, \"7d\"), (14, \"14d\"), (30, \"30d\"),\n",
 " (90, \"90d\"), (180, \"180d\"), (365, \"365d\")]\n",
@@ -1000,7 +1003,7 @@
 " if aligned:\n",
 " aligned_str = \", \".join(w for _, w in aligned)\n",
 " print(f\" -> Windows aligned with median inter-event time: {aligned_str}\")\n",
-" print(
+" print(\" These capture ~2 events per entity on average\")\n",
 " else:\n",
 " print(f\" -> Median inter-event ({median_iet:.0f}d) does not align with standard windows\")\n",
 "\n",
@@ -1010,11 +1013,11 @@
 " print(f\" -> 30d window captures only ~{events_in_30d:.1f} events/entity — \"\n",
 " f\"consider longer windows (90d+) for meaningful aggregations\")\n",
 " if median_iet < 7:\n",
-" print(
+" print(\" -> High frequency engagement — 7d and 24h windows will be rich with signal\")\n",
 "\n",
 " if skew_ratio > 1.5:\n",
-" print(
-"
+" print(\" -> Consider log-transforming inter-event time as a feature \"\n",
+" \"(reduces right-skew impact on models)\")\n"
 ]
 },
 {
@@ -1061,13 +1064,13 @@
 "# Use framework's DistributionAnalyzer for comprehensive analysis\n",
 "analyzer = DistributionAnalyzer()\n",
 "\n",
-"numeric_cols = [n for n, c in findings.columns.items()
+"numeric_cols = [n for n, c in findings.columns.items()\n",
 " if c.inferred_type.value in ('numeric_continuous', 'numeric_discrete')\n",
 " and n not in [ENTITY_COLUMN, TIME_COLUMN] and n not in TEMPORAL_METADATA_COLS]\n",
 "\n",
 "# Analyze all numeric columns using the framework\n",
 "analyses = analyzer.analyze_dataframe(df, numeric_cols)\n",
-"recommendations = {col: analyzer.recommend_transformation(analysis)
+"recommendations = {col: analyzer.recommend_transformation(analysis)\n",
 " for col, analysis in analyses.items()}\n",
 "\n",
 "# Human-readable transformation names\n",
@@ -1092,25 +1095,25 @@
 " col_info = findings.columns[col_name]\n",
 " analysis = analyses.get(col_name)\n",
 " rec = recommendations.get(col_name)\n",
-"
+"\n",
 " print(f\"\\n{'='*70}\")\n",
 " print(f\"Column: {col_name}\")\n",
 " print(f\"Type: {col_info.inferred_type.value} (Confidence: {col_info.confidence:.0%})\")\n",
-" print(
-"
+" print(\"-\" * 70)\n",
+"\n",
 " if analysis:\n",
-" print(
+" print(\"📊 Distribution Statistics:\")\n",
 " print(f\" Mean: {analysis.mean:.3f} | Median: {analysis.median:.3f} | Std: {analysis.std:.3f}\")\n",
 " print(f\" Range: [{analysis.min_value:.3f}, {analysis.max_value:.3f}]\")\n",
 " print(f\" Percentiles: 1%={analysis.percentiles['p1']:.3f}, 25%={analysis.q1:.3f}, 75%={analysis.q3:.3f}, 99%={analysis.percentiles['p99']:.3f}\")\n",
-" print(
+" print(\"\\n📈 Shape Analysis:\")\n",
 " skew_label = '(Right-skewed)' if analysis.skewness > 0.5 else '(Left-skewed)' if analysis.skewness < -0.5 else '(Symmetric)'\n",
 " print(f\" Skewness: {analysis.skewness:.2f} {skew_label}\")\n",
 " kurt_label = '(Heavy tails/outliers)' if analysis.kurtosis > 3 else '(Light tails)'\n",
 " print(f\" Kurtosis: {analysis.kurtosis:.2f} {kurt_label}\")\n",
 " print(f\" Zeros: {analysis.zero_count:,} ({analysis.zero_percentage:.1f}%)\")\n",
 " print(f\" Outliers (IQR): {analysis.outlier_count_iqr:,} ({analysis.outlier_percentage:.1f}%)\")\n",
-"
+"\n",
 " if rec:\n",
 " transform_display = TRANSFORM_DISPLAY_NAMES.get(rec.recommended_transform.value, rec.recommended_transform.value)\n",
 " print(f\"\\n🔧 Recommended Transformation: {transform_display}\")\n",
@@ -1149,20 +1152,20 @@
 " rec = recommendations.get(col_name)\n",
 " if not analysis:\n",
 " continue\n",
-"
+"\n",
 " data = df[col_name].dropna()\n",
 " fig = go.Figure()\n",
-"
+"\n",
 " fig.add_trace(go.Histogram(x=data, nbinsx=50, name='Distribution',\n",
 " marker_color='steelblue', opacity=0.7))\n",
-"
+"\n",
 " mean_val = data.mean()\n",
 " median_val = data.median()\n",
-"
+"\n",
 " # Position labels on opposite sides to avoid overlap\n",
 " mean_position = \"top right\" if mean_val >= median_val else \"top left\"\n",
 " median_position = \"top left\" if mean_val >= median_val else \"top right\"\n",
-"
+"\n",
 " fig.add_vline(\n",
 " x=mean_val, line_dash=\"dash\", line_color=\"red\",\n",
 " annotation_text=f\"Mean: {mean_val:.2f}\",\n",
@@ -1170,7 +1173,7 @@
 " annotation_font_color=\"red\",\n",
 " annotation_bgcolor=\"rgba(255,255,255,0.8)\"\n",
 " )\n",
-"
+"\n",
 " fig.add_vline(\n",
 " x=median_val, line_dash=\"solid\", line_color=\"green\",\n",
 " annotation_text=f\"Median: {median_val:.2f}\",\n",
@@ -1178,7 +1181,7 @@
 " annotation_font_color=\"green\",\n",
 " annotation_bgcolor=\"rgba(255,255,255,0.8)\"\n",
 " )\n",
-"
+"\n",
 " # Add 99th percentile marker if there are outliers\n",
 " if analysis.outlier_percentage > 5:\n",
 " fig.add_vline(x=analysis.percentiles['p99'], line_dash=\"dot\", line_color=\"orange\",\n",
@@ -1186,7 +1189,7 @@
 " annotation_position=\"top right\",\n",
 " annotation_font_color=\"orange\",\n",
 " annotation_bgcolor=\"rgba(255,255,255,0.8)\")\n",
-"
+"\n",
 " transform_key = rec.recommended_transform.value if rec else \"none\"\n",
 " transform_label = TRANSFORM_DISPLAY_NAMES.get(transform_key, transform_key)\n",
 " fig.update_layout(\n",
@@ -1233,12 +1236,12 @@
 "for col_name in categorical_cols:\n",
 " col_info = findings.columns[col_name]\n",
 " cardinality = col_info.universal_metrics.get('distinct_count', df[col_name].nunique())\n",
-"
+"\n",
 " print(f\"\\n{'='*50}\")\n",
 " print(f\"Column: {col_name}\")\n",
 " print(f\"Type: {col_info.inferred_type.value} (Confidence: {col_info.confidence:.0%})\")\n",
 " print(f\"Distinct Values: {cardinality}\")\n",
-"
+"\n",
 " # Encoding recommendation based on type and cardinality\n",
 " if col_info.inferred_type.value == 'categorical_cyclical':\n",
 " encoding_rec = \"Sin/Cos encoding (cyclical)\"\n",
@@ -1249,7 +1252,7 @@
 " else:\n",
 " encoding_rec = \"Target encoding or Frequency encoding (high cardinality)\"\n",
 " print(f\"Recommended Encoding: {encoding_rec}\")\n",
-"
+"\n",
 " # Value counts visualization\n",
 " value_counts = df[col_name].value_counts().head(10)\n",
 " fig = charts.bar_chart(value_counts.index.tolist(), value_counts.values.tolist(),\n",
@@ -1314,7 +1317,7 @@
 " # Sort by priority\n",
 " priority_order = {'high': 0, 'medium': 1, 'low': 2}\n",
 " transformations.sort(key=lambda x: priority_order.get(x['priority'], 3))\n",
-"
+"\n",
 " for t in transformations:\n",
 " priority_marker = \"🔴\" if t['priority'] == 'high' else \"🟡\" if t['priority'] == 'medium' else \"🟢\"\n",
 " print(f\"\\n {priority_marker} {t['column']}: {t['transform']}\")\n",
@@ -1352,7 +1355,7 @@
 " print(\"TEMPORAL AGGREGATION PERSPECTIVE\")\n",
 " print(\"=\"*70)\n",
 " print(f\"\\nMedian inter-event time: {median_iet:.0f} days\")\n",
-" print(
+" print(\"Expected events per window (at median cadence):\")\n",
 " windows_days = [(\"7d\", 7), (\"30d\", 30), (\"90d\", 90), (\"180d\", 180), (\"365d\", 365)]\n",
 " for label, days in windows_days:\n",
 " expected = days / median_iet if median_iet > 0 else 0\n",
@@ -1360,7 +1363,7 @@
 " print(f\" {marker} {label}: ~{expected:.1f} events/entity\")\n",
 "\n",
 " # Within-entity vs between-entity variance per column\n",
-" print(
+" print(\"\\nColumn Temporal Variability (within-entity CV vs between-entity CV):\")\n",
 " print(f\"{'Column':<25} {'Within-CV':<12} {'Between-CV':<12} {'Ratio':<8} {'Aggregation Guidance'}\")\n",
 " print(\"-\" * 90)\n",
 "\n",
@@ -1390,11 +1393,11 @@
 " ratio_str = f\"{ratio:.2f}\" if not np.isinf(ratio) else \">10\"\n",
 " print(f\"{col:<25} {within_str:<12} {between_cv:<12.2f} {ratio_str:<8} {guidance}\")\n",
 "\n",
-" print(
-" print(
-" print(
-" print(
-" print(
+" print(\"\\nInterpretation:\")\n",
+" print(\" Within-CV: how much each entity\\'s values vary across their events\")\n",
+" print(\" Between-CV: how much entity averages differ from each other\")\n",
+" print(\" Ratio > 1: temporal variation dominates -> shorter windows capture dynamics\")\n",
+" print(\" Ratio < 1: entity identity dominates -> longer windows (or all_time) sufficient\")\n"
 ]
 },
 {
@@ -1466,8 +1469,8 @@
 "explanation[\"meaningful_pct\"] = (explanation[\"meaningful_pct\"] * 100).round(1).astype(str) + \"%\"\n",
 "display_table(explanation)\n",
 "\n",
-"print(
-"print(
+"print(\"\\nCoverage: % of entities with enough tenure AND expected >=2 events in that window\")\n",
+"print(\"Meaningful: among entities with enough tenure, % that have sufficient event density\")"
 ]
 },
 {
@@ -1495,9 +1498,9 @@
 "h = window_result.heterogeneity\n",
 "\n",
 "print(\"Temporal Heterogeneity (eta-squared):\")\n",
-"print(
-"print(
-"print(
+"print(\" eta² measures the fraction of variance in a metric explained by lifecycle quadrant grouping.\")\n",
+"print(\" Scale: 0 = no group differences, 1 = all variance is between groups.\")\n",
+"print(\" Thresholds: <0.06 = low | 0.06-0.14 = moderate | >0.14 = high effect size\\n\")\n",
 "\n",
 "eta_max = max(h.eta_squared_intensity, h.eta_squared_event_count)\n",
 "print(f\" Intensity eta²: {h.eta_squared_intensity:.3f} {'<-- dominant' if h.eta_squared_intensity >= h.eta_squared_event_count else ''}\")\n",
@@ -71,22 +71,25 @@
 "outputs": [],
 "source": [
 "from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
+"\n",
 "track_and_export_previous(\"01b_temporal_quality.ipynb\")\n",
 "\n",
-"from pathlib import Path\n",
 "import pandas as pd\n",
 "import plotly.graph_objects as go\n",
-"from plotly.subplots import make_subplots\n",
 "\n",
 "from customer_retention.analysis.auto_explorer import ExplorationFindings, RecommendationEngine\n",
 "from customer_retention.analysis.visualization import ChartBuilder, display_figure\n",
 "from customer_retention.core.config.column_config import ColumnType\n",
+"from customer_retention.core.config.experiments import FINDINGS_DIR\n",
 "from customer_retention.stages.profiling import (\n",
-" DuplicateEventCheck
-"
+" DuplicateEventCheck,\n",
+" EventOrderCheck,\n",
+" FutureDateCheck,\n",
+" SegmentAwareOutlierAnalyzer,\n",
+" TemporalGapCheck,\n",
+" TemporalQualityReporter,\n",
 ")\n",
-"from customer_retention.stages.temporal import
-"from customer_retention.core.config.experiments import FINDINGS_DIR, EXPERIMENTS_DIR, OUTPUT_DIR, setup_experiments_structure\n"
+"from customer_retention.stages.temporal import TEMPORAL_METADATA_COLS, load_data_with_snapshot_preference\n"
 ]
 },
 {
@@ -397,7 +400,7 @@
 "if numeric_cols:\n",
 " analyzer = SegmentAwareOutlierAnalyzer(max_segments=5)\n",
 " result = analyzer.analyze(df, feature_cols=numeric_cols, segment_col=None, target_col=findings.target_column)\n",
-"
+"\n",
 " print(f\"Segments detected: {result.n_segments}\")\n",
 " if result.n_segments > 1:\n",
 " data = [{\"Feature\": c, \"Global\": result.global_analysis[c].outliers_detected,\n",
@@ -456,25 +459,7 @@
 "tags": []
 },
 "outputs": [],
-"source": [
-"# Binary field validation\n",
-"binary_cols = [n for n, c in findings.columns.items() if c.inferred_type == ColumnType.BINARY and n not in TEMPORAL_METADATA_COLS]\n",
-"for col in binary_cols:\n",
-" c0, c1 = (df[col] == 0).sum(), (df[col] == 1).sum()\n",
-" print(f\"✓ {col}: 0={c0:,} ({c0/(c0+c1)*100:.1f}%), 1={c1:,} ({c1/(c0+c1)*100:.1f}%)\")\n",
-"\n",
-"# Consistency check\n",
-"issues = []\n",
-"for col in df.select_dtypes(include=['object']).columns:\n",
-" if col in [ENTITY_COLUMN, TIME_COLUMN]: continue\n",
-" variants = {}\n",
-" for v in df[col].dropna().unique():\n",
-" key = str(v).lower().strip()\n",
-" variants.setdefault(key, []).append(v)\n",
-" issues.extend([{\"Column\": col, \"Variants\": vs} for vs in variants.values() if len(vs) > 1])\n",
-"\n",
-"print(f\"\\n{'⚠️ Consistency issues: ' + str(len(issues)) if issues else '✅ No consistency issues'}\")"
-]
+"source": "# Binary field validation\nbinary_cols = [n for n, c in findings.columns.items() if c.inferred_type == ColumnType.BINARY and n not in TEMPORAL_METADATA_COLS]\nfor col in binary_cols:\n c0, c1 = (df[col] == 0).sum(), (df[col] == 1).sum()\n print(f\"✓ {col}: 0={c0:,} ({c0/(c0+c1)*100:.1f}%), 1={c1:,} ({c1/(c0+c1)*100:.1f}%)\")\n\n# Consistency check\nissues = []\nfor col in df.select_dtypes(include=['object']).columns:\n if col in [ENTITY_COLUMN, TIME_COLUMN]:\n continue\n variants = {}\n for v in df[col].dropna().unique():\n key = str(v).lower().strip()\n variants.setdefault(key, []).append(v)\n issues.extend([{\"Column\": col, \"Variants\": vs} for vs in variants.values() if len(vs) > 1])\n\nprint(f\"\\n{'⚠️ Consistency issues: ' + str(len(issues)) if issues else '✅ No consistency issues'}\")"
 },
 {
 "cell_type": "markdown",
@@ -676,4 +661,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 5
-}
+}