churnkit-0.76.0a3-py3-none-any.whl → churnkit-0.76.1a2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +10 -5
  2. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +11 -9
  3. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +52 -46
  4. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +68 -65
  5. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +12 -27
  6. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +216 -221
  7. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +88 -81
  8. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +111 -108
  9. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +44 -38
  10. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +89 -85
  11. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +81 -80
  12. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +83 -89
  13. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +102 -98
  14. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +32 -31
  15. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +33 -29
  16. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +6 -5
  17. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +67 -63
  18. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +38 -23
  19. {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +3 -1
  20. {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/METADATA +1 -1
  21. {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/RECORD +31 -31
  22. customer_retention/__init__.py +1 -1
  23. customer_retention/analysis/auto_explorer/explorer.py +2 -2
  24. customer_retention/analysis/notebook_progress.py +14 -2
  25. customer_retention/core/compat/__init__.py +10 -0
  26. customer_retention/core/config/experiments.py +45 -0
  27. customer_retention/integrations/databricks_init.py +41 -1
  28. customer_retention/stages/profiling/column_profiler.py +9 -2
  29. {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/WHEEL +0 -0
  30. {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/entry_points.txt +0 -0
  31. {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/licenses/LICENSE +0 -0
@@ -95,27 +95,31 @@
  "outputs": [],
  "source": [
  "from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
+ "\n",
  "track_and_export_previous(\"01d_event_aggregation.ipynb\")\n",
  "\n",
- "from customer_retention.analysis.auto_explorer import ExplorationFindings, DataExplorer\n",
- "from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table\n",
- "from customer_retention.core.config.column_config import ColumnType, DatasetGranularity\n",
+ "from datetime import datetime\n",
+ "from pathlib import Path\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "\n",
+ "from customer_retention.analysis.auto_explorer import DataExplorer, ExplorationFindings\n",
+ "from customer_retention.analysis.visualization import ChartBuilder\n",
+ "from customer_retention.core.config.experiments import (\n",
+ " EXPERIMENTS_DIR,\n",
+ " FINDINGS_DIR,\n",
+ ")\n",
  "from customer_retention.stages.profiling import (\n",
  " AggregationFeatureConfig,\n",
- " TimeWindowAggregator,\n",
  " TimeSeriesProfiler,\n",
+ " TimeWindowAggregator,\n",
  " classify_lifecycle_quadrants,\n",
- " classify_activity_segments,\n",
  " create_momentum_ratio_features,\n",
  " create_recency_bucket_feature,\n",
  " deduplicate_events,\n",
  " get_duplicate_event_count,\n",
- ")\n",
- "from datetime import datetime\n",
- "from pathlib import Path\n",
- "import pandas as pd\n",
- "import numpy as np\n",
- "from customer_retention.core.config.experiments import FINDINGS_DIR, EXPERIMENTS_DIR, OUTPUT_DIR, setup_experiments_structure"
+ ")"
  ]
  },
  {
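The reorganized imports above now pull only EXPERIMENTS_DIR and FINDINGS_DIR from customer_retention.core.config.experiments, a module that grows by 45 lines in this release (file 26 in the list above) but is not itself shown in this diff. As a minimal sketch only, assuming the module centralizes experiment output paths, it might resemble the following; every value is a guess, and OUTPUT_DIR and setup_experiments_structure are simply the names that appear in the removed import line.

from pathlib import Path

# Hypothetical sketch - the real customer_retention/core/config/experiments.py
# is not part of this diff excerpt.
EXPERIMENTS_DIR = Path("experiments")           # root folder for exploration-notebook runs
FINDINGS_DIR = EXPERIMENTS_DIR / "findings"     # *_findings.yaml files consumed by later notebooks
OUTPUT_DIR = EXPERIMENTS_DIR / "output"         # aggregated datasets and exports

def setup_experiments_structure() -> None:
    # Assumed helper: create the directory tree above if it does not exist.
    for directory in (EXPERIMENTS_DIR, FINDINGS_DIR, OUTPUT_DIR):
        directory.mkdir(parents=True, exist_ok=True)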
@@ -145,7 +149,7 @@
  "\n",
  "# Find findings files (exclude multi_dataset and already-aggregated)\n",
  "findings_files = [\n",
- " f for f in FINDINGS_DIR.glob(\"*_findings.yaml\") \n",
+ " f for f in FINDINGS_DIR.glob(\"*_findings.yaml\")\n",
  " if \"multi_dataset\" not in f.name and \"_aggregated\" not in f.name\n",
  "]\n",
  "if not findings_files:\n",
@@ -212,7 +216,7 @@
  " print(\"\\n ⚠️ No window recommendations - will use defaults\")\n",
  "\n",
  "if ts_meta.temporal_segmentation_recommendation:\n",
- " print(f\"\\n 📋 Segmentation recommendation:\")\n",
+ " print(\"\\n 📋 Segmentation recommendation:\")\n",
  " print(f\" {ts_meta.temporal_segmentation_recommendation}\")\n",
  " if ts_meta.heterogeneity_level:\n",
  " print(f\" Heterogeneity: {ts_meta.heterogeneity_level}\")\n",
@@ -227,7 +231,7 @@
  "# === 01b: Temporal Quality ===\n",
  "quality_meta = findings.metadata.get(\"temporal_quality\", {})\n",
  "if quality_meta:\n",
- " print(f\"\\n📋 FROM 01b (Temporal Quality):\")\n",
+ " print(\"\\n📋 FROM 01b (Temporal Quality):\")\n",
  " if quality_meta.get(\"temporal_quality_score\"):\n",
  " print(f\" Quality score: {quality_meta.get('temporal_quality_score'):.1f}\")\n",
  " if quality_meta.get(\"temporal_quality_grade\"):\n",
@@ -246,7 +250,7 @@
  "COHORT_RECOMMENDATIONS = [] # Store for later application\n",
  "\n",
  "if pattern_meta:\n",
- " print(f\"\\n📈 FROM 01c (Temporal Patterns):\")\n",
+ " print(\"\\n📈 FROM 01c (Temporal Patterns):\")\n",
  " windows_used = pattern_meta.get(\"windows_used\", {})\n",
  " if windows_used:\n",
  " if windows_used.get(\"aggregation_windows\"):\n",
@@ -255,17 +259,17 @@
  " print(f\" Velocity window: {windows_used.get('velocity_window')} days\")\n",
  " if windows_used.get(\"momentum_pairs\"):\n",
  " print(f\" Momentum pairs: {windows_used.get('momentum_pairs')}\")\n",
- " \n",
+ "\n",
  " trend = pattern_meta.get(\"trend\", {})\n",
  " if trend and trend.get(\"direction\"):\n",
  " print(f\"\\n Trend: {trend.get('direction')} (strength: {trend.get('strength', 0):.2f})\")\n",
  " TREND_RECOMMENDATIONS = trend.get(\"recommendations\", [])\n",
  " trend_features = [r for r in TREND_RECOMMENDATIONS if r.get(\"features\")]\n",
  " if trend_features:\n",
- " print(f\"\\n 📈 Trend Features to Add:\")\n",
+ " print(\"\\n 📈 Trend Features to Add:\")\n",
  " for rec in trend_features:\n",
  " print(f\" → {', '.join(rec['features'])} ({rec['priority']} priority)\")\n",
- " \n",
+ "\n",
  " # Handle both old format (list) and new format (dict with patterns and recommendations)\n",
  " seasonality = pattern_meta.get(\"seasonality\", {})\n",
  " if isinstance(seasonality, list):\n",
@@ -274,14 +278,14 @@
  " else:\n",
  " patterns = seasonality.get(\"patterns\", [])\n",
  " SEASONALITY_RECOMMENDATIONS = seasonality.get(\"recommendations\", [])\n",
- " \n",
+ "\n",
  " if patterns:\n",
  " periods = [f\"{s.get('name', 'period')} ({s.get('period')}d)\" for s in patterns[:3]]\n",
  " print(f\" Seasonality: {', '.join(periods)}\")\n",
- " \n",
+ "\n",
  " # Display seasonality recommendations\n",
  " if SEASONALITY_RECOMMENDATIONS:\n",
- " print(f\"\\n 📋 Seasonality Recommendations:\")\n",
+ " print(\"\\n 📋 Seasonality Recommendations:\")\n",
  " for rec in SEASONALITY_RECOMMENDATIONS:\n",
  " action = rec.get(\"action\", \"\").replace(\"_\", \" \")\n",
  " if action == \"add cyclical feature\":\n",
@@ -292,18 +296,18 @@
  " print(f\" → Warning: Windows don't align with cycles {rec.get('detected_periods')}\")\n",
  " elif action == \"consider deseasonalization\":\n",
  " print(f\" → Consider deseasonalizing for periods {rec.get('periods')}\")\n",
- " \n",
+ "\n",
  " recency = pattern_meta.get(\"recency\", {})\n",
  " if recency and recency.get(\"median_days\"):\n",
  " print(f\" Recency: median={recency.get('median_days'):.0f} days, \"\n",
  " f\"target_corr={recency.get('target_correlation', 0):.2f}\")\n",
- " \n",
+ "\n",
  " # Divergent columns (important for feature prioritization)\n",
  " velocity = pattern_meta.get(\"velocity\", {})\n",
  " divergent_velocity = [k for k, v in velocity.items() if isinstance(v, dict) and v.get(\"divergent\")]\n",
  " if divergent_velocity:\n",
  " print(f\"\\n 🎯 Divergent velocity columns: {divergent_velocity}\")\n",
- " \n",
+ "\n",
  " momentum = pattern_meta.get(\"momentum\", {})\n",
  " divergent_momentum = momentum.get(\"_divergent_columns\", [])\n",
  " if divergent_momentum:\n",
@@ -320,7 +324,7 @@
  " else:\n",
  " cohort_features = [r for r in COHORT_RECOMMENDATIONS if r.get(\"features\")]\n",
  " if cohort_features:\n",
- " print(f\"\\n 👥 Cohort Features to Add:\")\n",
+ " print(\"\\n 👥 Cohort Features to Add:\")\n",
  " for rec in cohort_features:\n",
  " print(f\" → {', '.join(rec['features'])} ({rec['priority']} priority)\")\n",
  "\n",
@@ -365,7 +369,7 @@
  },
  "outputs": [],
  "source": [
- "from customer_retention.stages.temporal import load_data_with_snapshot_preference, TEMPORAL_METADATA_COLS\n",
+ "from customer_retention.stages.temporal import TEMPORAL_METADATA_COLS, load_data_with_snapshot_preference\n",
  "\n",
  "# Load source data (prefers snapshots over raw files)\n",
  "df, data_source = load_data_with_snapshot_preference(findings, output_dir=str(FINDINGS_DIR))\n",
@@ -479,7 +483,7 @@
  "momentum_meta = pattern_meta.get(\"momentum\", {})\n",
  "\n",
  "# Identify divergent columns (these are most predictive for target)\n",
- "DIVERGENT_VELOCITY_COLS = [k for k, v in velocity_meta.items() \n",
+ "DIVERGENT_VELOCITY_COLS = [k for k, v in velocity_meta.items()\n",
  " if isinstance(v, dict) and v.get(\"divergent\")]\n",
  "DIVERGENT_MOMENTUM_COLS = momentum_meta.get(\"_divergent_columns\", [])\n",
  "\n",
@@ -493,7 +497,7 @@
  "available_numeric = [c for c in numeric_cols if c not in exclude_cols]\n",
  "\n",
  "# Put divergent columns first (they showed predictive signal in 01c)\n",
- "priority_cols = [c for c in DIVERGENT_VELOCITY_COLS + DIVERGENT_MOMENTUM_COLS \n",
+ "priority_cols = [c for c in DIVERGENT_VELOCITY_COLS + DIVERGENT_MOMENTUM_COLS\n",
  " if c in available_numeric]\n",
  "other_cols = [c for c in available_numeric if c not in priority_cols]\n",
  "\n",
@@ -535,7 +539,7 @@
  "if TARGET_COLUMN:\n",
  " print(f\"\\n Excluded from aggregation: {TARGET_COLUMN} (target - prevents leakage)\")\n",
  "print(f\"\\nAggregation functions: {AGG_FUNCTIONS}\")\n",
- "print(f\"\\nAdditional features:\")\n",
+ "print(\"\\nAdditional features:\")\n",
  "print(f\" Include lifecycle_quadrant: {INCLUDE_LIFECYCLE_QUADRANT}\")\n",
  "print(f\" Include recency: {INCLUDE_RECENCY}\")\n",
  "print(f\" Include tenure: {INCLUDE_TENURE}\")\n",
@@ -630,10 +634,10 @@
  " print(f\" ... and {len(plan.feature_columns) - 15} more\")\n",
  "\n",
  "if additional_features:\n",
- " print(f\"\\nAdditional features:\")\n",
+ " print(\"\\nAdditional features:\")\n",
  " for feat in additional_features:\n",
  " print(f\" - {feat}\")\n",
- " \n",
+ "\n",
  "print(f\"\\nTotal expected features: {len(plan.feature_columns) + len(additional_features) + 1}\")"
  ]
  },
@@ -697,18 +701,18 @@
  " print(\"\\n Adding lifecycle_quadrant feature...\")\n",
  " profiler = TimeSeriesProfiler(entity_column=ENTITY_COLUMN, time_column=TIME_COLUMN)\n",
  " ts_profile = profiler.profile(df)\n",
- " \n",
+ "\n",
  " # Rename 'entity' column to match our entity column name\n",
  " lifecycles = ts_profile.entity_lifecycles.copy()\n",
  " lifecycles = lifecycles.rename(columns={\"entity\": ENTITY_COLUMN})\n",
- " \n",
+ "\n",
  " quadrant_result = classify_lifecycle_quadrants(lifecycles)\n",
- " \n",
+ "\n",
  " # Merge lifecycle_quadrant into aggregated data\n",
  " quadrant_map = quadrant_result.lifecycles.set_index(ENTITY_COLUMN)[\"lifecycle_quadrant\"]\n",
  " df_aggregated[\"lifecycle_quadrant\"] = df_aggregated[ENTITY_COLUMN].map(quadrant_map)\n",
- " \n",
- " print(f\" Quadrant distribution:\")\n",
+ "\n",
+ " print(\" Quadrant distribution:\")\n",
  " for quad, count in df_aggregated[\"lifecycle_quadrant\"].value_counts().items():\n",
  " pct = count / len(df_aggregated) * 100\n",
  " print(f\" {quad}: {count:,} ({pct:.1f}%)\")\n",
@@ -720,7 +724,7 @@
  " # For entity-level target, use max (if any event has target=1, entity has target=1)\n",
  " entity_target = df.groupby(ENTITY_COLUMN)[TARGET_COLUMN].max()\n",
  " df_aggregated[TARGET_COLUMN] = df_aggregated[ENTITY_COLUMN].map(entity_target)\n",
- " \n",
+ "\n",
  " target_dist = df_aggregated[TARGET_COLUMN].value_counts()\n",
  " for val, count in target_dist.items():\n",
  " pct = count / len(df_aggregated) * 100\n",
@@ -753,9 +757,9 @@
  " df_aggregated[\"quarter_sin\"] = np.sin(2 * np.pi * df_aggregated[ENTITY_COLUMN].map(entity_quarter) / 4)\n",
  " df_aggregated[\"quarter_cos\"] = np.cos(2 * np.pi * df_aggregated[ENTITY_COLUMN].map(entity_quarter) / 4)\n",
  " cyclical_added.append(\"quarter (quarter_sin, quarter_cos)\")\n",
- " \n",
+ "\n",
  " if cyclical_added:\n",
- " print(f\"\\n Adding cyclical features from seasonality analysis:\")\n",
+ " print(\"\\n Adding cyclical features from seasonality analysis:\")\n",
  " for feat in cyclical_added:\n",
  " print(f\" -> {feat}\")\n",
  "\n",
@@ -765,21 +769,21 @@
  " for rec in TEMPORAL_PATTERN_RECOMMENDATIONS:\n",
  " features = rec.get(\"features\", [])\n",
  " pattern = rec.get(\"pattern\", \"\")\n",
- " \n",
+ "\n",
  " if pattern == \"day_of_week\" and \"dow_sin\" in df_aggregated.columns:\n",
  " continue\n",
  " if pattern == \"month\" and \"month_sin\" in df_aggregated.columns:\n",
  " continue\n",
  " if pattern == \"quarter\" and \"quarter_sin\" in df_aggregated.columns:\n",
  " continue\n",
- " \n",
+ "\n",
  " if \"dow_sin\" in features or \"dow_cos\" in features:\n",
  " if \"dow_sin\" not in df_aggregated.columns:\n",
  " entity_dow = df.groupby(ENTITY_COLUMN)[TIME_COLUMN].apply(lambda x: x.dt.dayofweek.mean())\n",
  " df_aggregated[\"dow_sin\"] = np.sin(2 * np.pi * df_aggregated[ENTITY_COLUMN].map(entity_dow) / 7)\n",
  " df_aggregated[\"dow_cos\"] = np.cos(2 * np.pi * df_aggregated[ENTITY_COLUMN].map(entity_dow) / 7)\n",
  " tp_added.append(\"day_of_week (dow_sin, dow_cos)\")\n",
- " \n",
+ "\n",
  " if \"is_weekend\" in features:\n",
  " if \"is_weekend\" not in df_aggregated.columns:\n",
  " entity_weekend_pct = df.groupby(ENTITY_COLUMN)[TIME_COLUMN].apply(\n",
@@ -787,28 +791,28 @@
  " )\n",
  " df_aggregated[\"is_weekend_pct\"] = df_aggregated[ENTITY_COLUMN].map(entity_weekend_pct)\n",
  " tp_added.append(\"is_weekend_pct\")\n",
- " \n",
+ "\n",
  " if \"month_sin\" in features or \"month_cos\" in features:\n",
  " if \"month_sin\" not in df_aggregated.columns:\n",
  " entity_month = df.groupby(ENTITY_COLUMN)[TIME_COLUMN].apply(lambda x: x.dt.month.mean())\n",
  " df_aggregated[\"month_sin\"] = np.sin(2 * np.pi * df_aggregated[ENTITY_COLUMN].map(entity_month) / 12)\n",
  " df_aggregated[\"month_cos\"] = np.cos(2 * np.pi * df_aggregated[ENTITY_COLUMN].map(entity_month) / 12)\n",
  " tp_added.append(\"month (month_sin, month_cos)\")\n",
- " \n",
+ "\n",
  " if \"quarter_sin\" in features or \"quarter_cos\" in features:\n",
  " if \"quarter_sin\" not in df_aggregated.columns:\n",
  " entity_quarter = df.groupby(ENTITY_COLUMN)[TIME_COLUMN].apply(lambda x: x.dt.quarter.mean())\n",
  " df_aggregated[\"quarter_sin\"] = np.sin(2 * np.pi * df_aggregated[ENTITY_COLUMN].map(entity_quarter) / 4)\n",
  " df_aggregated[\"quarter_cos\"] = np.cos(2 * np.pi * df_aggregated[ENTITY_COLUMN].map(entity_quarter) / 4)\n",
  " tp_added.append(\"quarter (quarter_sin, quarter_cos)\")\n",
- " \n",
+ "\n",
  " if \"year_trend\" in features:\n",
  " if \"year_trend\" not in df_aggregated.columns:\n",
  " entity_year = df.groupby(ENTITY_COLUMN)[TIME_COLUMN].apply(lambda x: x.dt.year.mean())\n",
  " min_year = entity_year.min()\n",
  " df_aggregated[\"year_trend\"] = df_aggregated[ENTITY_COLUMN].map(entity_year) - min_year\n",
  " tp_added.append(f\"year_trend (normalized from {min_year:.0f})\")\n",
- " \n",
+ "\n",
  " if \"year_categorical\" in features:\n",
  " if \"year_mode\" not in df_aggregated.columns:\n",
  " entity_year_mode = df.groupby(ENTITY_COLUMN)[TIME_COLUMN].apply(\n",
@@ -816,9 +820,9 @@
  " )\n",
  " df_aggregated[\"year_mode\"] = df_aggregated[ENTITY_COLUMN].map(entity_year_mode).astype(int)\n",
  " tp_added.append(\"year_mode (categorical - encode before modeling)\")\n",
- " \n",
+ "\n",
  " if tp_added:\n",
- " print(f\"\\n Adding features from temporal pattern analysis:\")\n",
+ " print(\"\\n Adding features from temporal pattern analysis:\")\n",
  " for feat in tp_added:\n",
  " print(f\" -> {feat}\")\n",
  "\n",
@@ -827,20 +831,20 @@
  " trend_added = []\n",
  " for rec in TREND_RECOMMENDATIONS:\n",
  " features = rec.get(\"features\", [])\n",
- " \n",
+ "\n",
  " if \"recent_vs_overall_ratio\" in features:\n",
  " if \"recent_vs_overall_ratio\" not in df_aggregated.columns:\n",
  " time_span = (df[TIME_COLUMN].max() - df[TIME_COLUMN].min()).days\n",
  " recent_cutoff = df[TIME_COLUMN].max() - pd.Timedelta(days=int(time_span * 0.3))\n",
- " \n",
+ "\n",
  " overall_counts = df.groupby(ENTITY_COLUMN).size()\n",
  " recent_counts = df[df[TIME_COLUMN] >= recent_cutoff].groupby(ENTITY_COLUMN).size()\n",
- " \n",
+ "\n",
  " ratio = recent_counts / overall_counts\n",
  " ratio = ratio.fillna(0)\n",
  " df_aggregated[\"recent_vs_overall_ratio\"] = df_aggregated[ENTITY_COLUMN].map(ratio).fillna(0)\n",
  " trend_added.append(\"recent_vs_overall_ratio\")\n",
- " \n",
+ "\n",
  " if \"entity_trend_slope\" in features:\n",
  " if \"entity_trend_slope\" not in df_aggregated.columns:\n",
  " def compute_entity_slope(group):\n",
@@ -852,13 +856,13 @@
  " return 0.0\n",
  " slope = np.polyfit(x, y, 1)[0]\n",
  " return slope\n",
- " \n",
+ "\n",
  " entity_slopes = df.groupby(ENTITY_COLUMN).apply(compute_entity_slope)\n",
  " df_aggregated[\"entity_trend_slope\"] = df_aggregated[ENTITY_COLUMN].map(entity_slopes).fillna(0)\n",
  " trend_added.append(\"entity_trend_slope\")\n",
- " \n",
+ "\n",
  " if trend_added:\n",
- " print(f\"\\n Adding features from trend analysis:\")\n",
+ " print(\"\\n Adding features from trend analysis:\")\n",
  " for feat in trend_added:\n",
  " print(f\" -> {feat}\")\n",
  "\n",
@@ -868,25 +872,25 @@
  " if not skip_cohort:\n",
  " cohort_added = []\n",
  " cohort_features = [f for r in COHORT_RECOMMENDATIONS for f in r.get(\"features\", [])]\n",
- " \n",
+ "\n",
  " if \"cohort_year\" in cohort_features or \"cohort_quarter\" in cohort_features:\n",
  " entity_first = df.groupby(ENTITY_COLUMN)[TIME_COLUMN].min()\n",
- " \n",
+ "\n",
  " if \"cohort_year\" in cohort_features and \"cohort_year\" not in df_aggregated.columns:\n",
  " df_aggregated[\"cohort_year\"] = df_aggregated[ENTITY_COLUMN].map(entity_first).dt.year\n",
  " cohort_added.append(\"cohort_year\")\n",
- " \n",
+ "\n",
  " if \"cohort_quarter\" in cohort_features and \"cohort_quarter\" not in df_aggregated.columns:\n",
  " first_dates = df_aggregated[ENTITY_COLUMN].map(entity_first)\n",
  " df_aggregated[\"cohort_quarter\"] = first_dates.dt.year.astype(str) + \"Q\" + first_dates.dt.quarter.astype(str)\n",
  " cohort_added.append(\"cohort_quarter\")\n",
- " \n",
+ "\n",
  " if cohort_added:\n",
- " print(f\"\\n Adding cohort features:\")\n",
+ " print(\"\\n Adding cohort features:\")\n",
  " for feat in cohort_added:\n",
  " print(f\" -> {feat}\")\n",
  " else:\n",
- " print(f\"\\n Skipping cohort features (insufficient variation)\")\n",
+ " print(\"\\n Skipping cohort features (insufficient variation)\")\n",
  "\n",
  "# Step 8: Add momentum ratio features from 01c momentum recommendations\n",
  "if MOMENTUM_RECOMMENDATIONS:\n",
@@ -894,24 +898,26 @@
  " df_aggregated = create_momentum_ratio_features(df_aggregated, MOMENTUM_RECOMMENDATIONS)\n",
  " new_momentum_cols = set(df_aggregated.columns) - before_cols\n",
  " if new_momentum_cols:\n",
- " print(f\"\\n Adding momentum ratio features:\")\n",
+ " print(\"\\n Adding momentum ratio features:\")\n",
  " for feat in sorted(new_momentum_cols):\n",
  " print(f\" -> {feat}\")\n",
  " else:\n",
- " print(f\"\\n Momentum ratio features: columns not available in aggregated data (skipped)\")\n",
+ " print(\"\\n Momentum ratio features: columns not available in aggregated data (skipped)\")\n",
  "\n",
  "# Step 9: Add recency bucket feature\n",
  "if INCLUDE_RECENCY and \"days_since_last_event\" in df_aggregated.columns:\n",
  " df_aggregated = create_recency_bucket_feature(df_aggregated)\n",
  " if \"recency_bucket\" in df_aggregated.columns:\n",
- " print(f\"\\n Adding recency_bucket feature:\")\n",
+ " print(\"\\n Adding recency_bucket feature:\")\n",
  " for bucket, count in df_aggregated[\"recency_bucket\"].value_counts().sort_index().items():\n",
  " pct = count / len(df_aggregated) * 100\n",
  " print(f\" {bucket}: {count:,} ({pct:.1f}%)\")\n",
  "\n",
- "print(f\"\\n Aggregation complete!\")\n",
+ "print(\"\\n Aggregation complete!\")\n",
  "print(f\" Output: {len(df_aggregated):,} entities x {len(df_aggregated.columns)} features\")\n",
- "print(f\" Memory: {df_aggregated.memory_usage(deep=True).sum() / 1024**2:.1f} MB\")"
+ "from customer_retention.core.compat import safe_memory_usage_bytes\n",
+ "\n",
+ "print(f\" Memory: {safe_memory_usage_bytes(df_aggregated) / 1024**2:.1f} MB\")"
  ]
  },
  {
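The memory line above now routes through safe_memory_usage_bytes from the new customer_retention/core/compat/__init__.py (+10 lines, file 25 in the list above). Its implementation is not included in this diff; a minimal sketch, assuming it merely guards pandas' deep memory_usage call against backends that do not support it (for example pandas-on-Spark frames in a Databricks environment), might look like:

import pandas as pd

def safe_memory_usage_bytes(df: pd.DataFrame) -> float:
    # Hypothetical sketch - the real helper lives in customer_retention.core.compat
    # and is not shown in this diff. Return the frame's in-memory size in bytes,
    # falling back to 0.0 when deep introspection raises on non-pandas frames.
    try:
        return float(df.memory_usage(deep=True).sum())
    except Exception:
        return 0.0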
@@ -1036,12 +1042,12 @@
  "if original_entities == aggregated_entities:\n",
  " print(f\"\\n✅ Entity count matches: {aggregated_entities:,}\")\n",
  "else:\n",
- " print(f\"\\n⚠️ Entity count mismatch!\")\n",
+ " print(\"\\n⚠️ Entity count mismatch!\")\n",
  " print(f\" Original: {original_entities:,}\")\n",
  " print(f\" Aggregated: {aggregated_entities:,}\")\n",
  "\n",
  "# Check feature statistics\n",
- "print(f\"\\n📊 Feature Statistics:\")\n",
+ "print(\"\\n📊 Feature Statistics:\")\n",
  "numeric_agg_cols = df_aggregated.select_dtypes(include=[np.number]).columns.tolist()\n",
  "if TARGET_COLUMN:\n",
  " numeric_agg_cols = [c for c in numeric_agg_cols if c != TARGET_COLUMN]\n",
@@ -1057,7 +1063,7 @@
  "\n",
  "# If lifecycle_quadrant was added, show its correlation with target\n",
  "if INCLUDE_LIFECYCLE_QUADRANT and TARGET_COLUMN and TARGET_COLUMN in df_aggregated.columns:\n",
- " print(f\"\\n📊 Lifecycle Quadrant vs Target:\")\n",
+ " print(\"\\n📊 Lifecycle Quadrant vs Target:\")\n",
  " cross = pd.crosstab(df_aggregated[\"lifecycle_quadrant\"], df_aggregated[TARGET_COLUMN], normalize='index')\n",
  " if 1 in cross.columns:\n",
  " for quad in cross.index:\n",
@@ -1200,6 +1206,7 @@
  "print(f\"✅ Original findings updated with aggregation metadata: {FINDINGS_PATH}\")\n",
  "\n",
  "from customer_retention.analysis.notebook_html_exporter import export_notebook_html\n",
+ "\n",
  "export_notebook_html(Path(\"01d_event_aggregation.ipynb\"), EXPERIMENTS_DIR / \"docs\")\n"
  ]
  },
@@ -1230,25 +1237,25 @@
  "print(\"AGGREGATION COMPLETE - OUTPUT SUMMARY\")\n",
  "print(\"=\"*70)\n",
  "\n",
- "print(f\"\\n📁 Files created:\")\n",
+ "print(\"\\n📁 Files created:\")\n",
  "print(f\" 1. Aggregated data: {AGGREGATED_DATA_PATH}\")\n",
  "print(f\" 2. Aggregated findings: {AGGREGATED_FINDINGS_PATH}\")\n",
  "print(f\" 3. Updated original findings: {FINDINGS_PATH}\")\n",
  "\n",
- "print(f\"\\n📊 Transformation stats:\")\n",
+ "print(\"\\n📊 Transformation stats:\")\n",
  "print(f\" Input events: {len(df):,}\")\n",
  "print(f\" Output entities: {len(df_aggregated):,}\")\n",
  "print(f\" Features created: {len(df_aggregated.columns)}\")\n",
  "\n",
- "print(f\"\\n⚙️ Configuration applied:\")\n",
+ "print(\"\\n⚙️ Configuration applied:\")\n",
  "print(f\" Windows: {WINDOWS} (from {window_source})\")\n",
  "print(f\" Aggregation functions: {AGG_FUNCTIONS}\")\n",
  "if priority_cols:\n",
  " print(f\" Priority columns (from 01c divergence): {priority_cols}\")\n",
  "if INCLUDE_LIFECYCLE_QUADRANT:\n",
- " print(f\" Lifecycle quadrant: included (from 01a recommendation)\")\n",
+ " print(\" Lifecycle quadrant: included (from 01a recommendation)\")\n",
  "\n",
- "print(f\"\\n🎯 Ready for modeling:\")\n",
+ "print(\"\\n🎯 Ready for modeling:\")\n",
  "print(f\" Entity column: {ENTITY_COLUMN}\")\n",
  "if TARGET_COLUMN:\n",
  " print(f\" Target column: {TARGET_COLUMN}\")\n",
@@ -1258,9 +1265,9 @@
  "\n",
  "# Drift warning if applicable\n",
  "if ts_meta.drift_risk_level == \"high\":\n",
- " print(f\"\\n⚠️ DRIFT WARNING: High drift risk detected in 01a\")\n",
+ " print(\"\\n⚠️ DRIFT WARNING: High drift risk detected in 01a\")\n",
  " print(f\" Volume drift: {ts_meta.volume_drift_risk or 'unknown'}\")\n",
- " print(f\" Consider: temporal validation splits, monitoring for distribution shift\")"
+ " print(\" Consider: temporal validation splits, monitoring for distribution shift\")"
  ]
  },
  {
@@ -1317,19 +1324,19 @@
  "\n",
  "if TARGET_COLUMN and TARGET_COLUMN in df_aggregated.columns:\n",
  " detector = LeakageDetector()\n",
- " \n",
+ "\n",
  " # Separate features and target\n",
  " feature_cols = [c for c in df_aggregated.columns if c not in [ENTITY_COLUMN, TARGET_COLUMN]]\n",
  " X = df_aggregated[feature_cols]\n",
  " y = df_aggregated[TARGET_COLUMN]\n",
- " \n",
+ "\n",
  " # Run leakage checks\n",
  " result = detector.run_all_checks(X, y, include_pit=False)\n",
- " \n",
+ "\n",
  " print(\"=\" * 70)\n",
  " print(\"LEAKAGE VALIDATION RESULTS\")\n",
  " print(\"=\" * 70)\n",
- " \n",
+ "\n",
  " if result.passed:\n",
  " print(\"\\n✅ PASSED: No critical leakage issues detected\")\n",
  " print(f\" Total checks run: {len(result.checks)}\")\n",
@@ -1460,4 +1467,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 5
- }
+ }