churnkit-0.76.1a1-py3-none-any.whl → churnkit-0.76.1a2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +10 -5
  2. {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +6 -6
  3. {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +52 -46
  4. {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +68 -65
  5. {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +12 -27
  6. {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +216 -221
  7. {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +88 -81
  8. {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +111 -108
  9. {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +44 -38
  10. {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +89 -85
  11. {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +81 -80
  12. {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +83 -89
  13. {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +102 -98
  14. {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +32 -31
  15. {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +33 -29
  16. {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +6 -5
  17. {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +67 -63
  18. {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +38 -23
  19. {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +3 -1
  20. {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/METADATA +1 -1
  21. {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/RECORD +30 -30
  22. customer_retention/__init__.py +1 -1
  23. customer_retention/analysis/auto_explorer/explorer.py +2 -2
  24. customer_retention/analysis/notebook_progress.py +4 -1
  25. customer_retention/core/compat/__init__.py +10 -0
  26. customer_retention/integrations/databricks_init.py +13 -0
  27. customer_retention/stages/profiling/column_profiler.py +9 -2
  28. {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/WHEEL +0 -0
  29. {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/entry_points.txt +0 -0
  30. {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/licenses/LICENSE +0 -0
@@ -73,24 +73,28 @@
  "outputs": [],
  "source": [
  "from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
+ "\n",
  "track_and_export_previous(\"02_column_deep_dive.ipynb\")\n",
  "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import plotly.graph_objects as go\n",
+ "from scipy import stats\n",
+ "\n",
  "from customer_retention.analysis.auto_explorer import ExplorationFindings, RecommendationRegistry\n",
- "from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table, console\n",
+ "from customer_retention.analysis.visualization import ChartBuilder, console, display_figure, display_table\n",
  "from customer_retention.core.config.column_config import ColumnType\n",
+ "from customer_retention.core.config.experiments import (\n",
+ " FINDINGS_DIR,\n",
+ ")\n",
  "from customer_retention.stages.profiling import (\n",
- " DistributionAnalyzer, TransformationType,\n",
- " TemporalAnalyzer, TemporalGranularity,\n",
- " CategoricalDistributionAnalyzer, EncodingType\n",
+ " CategoricalDistributionAnalyzer,\n",
+ " DistributionAnalyzer,\n",
+ " TemporalAnalyzer,\n",
+ " TemporalGranularity,\n",
+ " TransformationType,\n",
  ")\n",
- "from customer_retention.stages.validation import DataValidator, RuleGenerator\n",
- "import pandas as pd\n",
- "import numpy as np\n",
- "from scipy import stats\n",
- "import plotly.graph_objects as go\n",
- "import plotly.express as px\n",
- "from plotly.subplots import make_subplots\n",
- "from customer_retention.core.config.experiments import FINDINGS_DIR, EXPERIMENTS_DIR, OUTPUT_DIR, setup_experiments_structure\n"
+ "from customer_retention.stages.validation import DataValidator, RuleGenerator\n"
  ]
  },
  {
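Aside from blank-line tweaks, this hunk reorders and prunes the notebook's import cell: third-party imports (numpy, pandas, plotly, scipy) are grouped together, multi-name imports are split one name per line and alphabetized, and several imports are dropped (EncodingType, plotly.express, plotly.subplots.make_subplots, EXPERIMENTS_DIR, OUTPUT_DIR, setup_experiments_structure), presumably because this cell no longer uses them. For readability, the resulting cell rendered as plain Python rather than escaped notebook JSON (reconstructed from the added lines above; indentation normalized to four spaces):

```python
from customer_retention.analysis.notebook_progress import track_and_export_previous

track_and_export_previous("02_column_deep_dive.ipynb")

# Third-party imports, now grouped ahead of the project's own modules
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from scipy import stats

# First-party imports, one name per line and alphabetized
from customer_retention.analysis.auto_explorer import ExplorationFindings, RecommendationRegistry
from customer_retention.analysis.visualization import ChartBuilder, console, display_figure, display_table
from customer_retention.core.config.column_config import ColumnType
from customer_retention.core.config.experiments import (
    FINDINGS_DIR,
)
from customer_retention.stages.profiling import (
    CategoricalDistributionAnalyzer,
    DistributionAnalyzer,
    TemporalAnalyzer,
    TemporalGranularity,
    TransformationType,
)
from customer_retention.stages.validation import DataValidator, RuleGenerator
```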
@@ -121,7 +125,6 @@
  "\n",
  "# Option 2: Auto-discover findings file (prefers aggregated over event-level)\n",
  "from pathlib import Path\n",
- "import os\n",
  "\n",
  "# FINDINGS_DIR imported from customer_retention.core.config.experiments\n",
  "\n",
@@ -157,9 +160,9 @@
  "# Warn if this is event-level data (should run 01d first)\n",
  "if findings.is_time_series and \"_aggregated\" not in FINDINGS_PATH:\n",
  " ts_meta = findings.time_series_metadata\n",
- " print(f\"\\n⚠️ WARNING: This appears to be EVENT-LEVEL data\")\n",
+ " print(\"\\n⚠️ WARNING: This appears to be EVENT-LEVEL data\")\n",
  " print(f\" Entity: {ts_meta.entity_column}, Time: {ts_meta.time_column}\")\n",
- " print(f\" Recommendation: Run 01d_event_aggregation.ipynb first to create entity-level data\")"
+ " print(\" Recommendation: Run 01d_event_aggregation.ipynb first to create entity-level data\")"
  ]
  },
  {
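The substantive edits in this hunk drop the f prefix from string literals that contain no {} placeholders; the prefix did nothing, and removing it is the fix associated with the pyflakes/Ruff check F541 (the diff does not name a linter, so this is an inference from the pattern). A minimal before/after sketch of the same rewrite, with hypothetical values standing in for the notebook's variables:

```python
ts_meta_entity, ts_meta_time = "customer_id", "event_ts"  # hypothetical values for illustration

# Before: f-strings with no placeholders (what F541 flags)
print(f"\n⚠️ WARNING: This appears to be EVENT-LEVEL data")
print(f" Recommendation: Run 01d_event_aggregation.ipynb first to create entity-level data")

# After: plain string literals; the printed output is identical
print("\n⚠️ WARNING: This appears to be EVENT-LEVEL data")
print(" Recommendation: Run 01d_event_aggregation.ipynb first to create entity-level data")

# The prefix stays only where interpolation actually happens
print(f" Entity: {ts_meta_entity}, Time: {ts_meta_time}")
```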
@@ -202,7 +205,7 @@
  "outputs": [],
  "source": [
  "# Load data - handle aggregated parquet files directly\n",
- "from customer_retention.stages.temporal import load_data_with_snapshot_preference, TEMPORAL_METADATA_COLS\n",
+ "from customer_retention.stages.temporal import TEMPORAL_METADATA_COLS, load_data_with_snapshot_preference\n",
  "\n",
  "# For aggregated data, load directly from the parquet source\n",
  "if \"_aggregated\" in FINDINGS_PATH and findings.source_path.endswith('.parquet'):\n",
@@ -301,20 +304,20 @@
  "\n",
  "if range_rules:\n",
  " range_results = validator.validate_value_ranges(df, range_rules)\n",
- " \n",
+ "\n",
  " issues_found = []\n",
  " for r in range_results:\n",
  " detail = f\"{r.invalid_values} invalid\" if r.invalid_values > 0 else None\n",
  " console.check(f\"{r.column_name} ({r.rule_type})\", r.invalid_values == 0, detail)\n",
  " if r.invalid_values > 0:\n",
  " issues_found.append(r)\n",
- " \n",
+ "\n",
  " all_invalid = sum(r.invalid_values for r in range_results)\n",
  " if all_invalid == 0:\n",
  " console.success(\"All value ranges valid\")\n",
  " else:\n",
  " console.error(f\"Found {all_invalid:,} values outside expected ranges\")\n",
- " \n",
+ "\n",
  " console.info(\"Examples of invalid values:\")\n",
  " for r in issues_found[:3]:\n",
  " col = r.column_name\n",
@@ -333,19 +336,19 @@
  " condition = \"value < 0 or value > 1\"\n",
  " else:\n",
  " continue\n",
- " \n",
+ "\n",
  " invalid_values = df.loc[invalid_mask, col].dropna()\n",
  " if len(invalid_values) > 0:\n",
  " examples = invalid_values.head(5).tolist()\n",
  " console.metric(f\" {col}\", f\"{examples}\")\n",
- " \n",
+ "\n",
  " # Add filtering recommendation\n",
  " registry.add_bronze_filtering(\n",
  " column=col, condition=condition, action=\"cap\",\n",
  " rationale=f\"{r.invalid_values} values violate {r.rule_type} constraint\",\n",
  " source_notebook=\"02_column_deep_dive\"\n",
  " )\n",
- " \n",
+ "\n",
  " console.info(\"Rules auto-generated from detected column types\")\n",
  "else:\n",
  " range_results = []\n",
@@ -425,32 +428,32 @@
  "\n",
  "# Analyze all numeric columns using the framework\n",
  "analyses = analyzer.analyze_dataframe(df, numeric_cols)\n",
- "recommendations = {col: analyzer.recommend_transformation(analysis) \n",
+ "recommendations = {col: analyzer.recommend_transformation(analysis)\n",
  " for col, analysis in analyses.items()}\n",
  "\n",
  "for col_name in numeric_cols:\n",
  " col_info = findings.columns[col_name]\n",
  " analysis = analyses.get(col_name)\n",
  " rec = recommendations.get(col_name)\n",
- " \n",
+ "\n",
  " print(f\"\\n{'='*70}\")\n",
  " print(f\"Column: {col_name}\")\n",
  " print(f\"Type: {col_info.inferred_type.value} (Confidence: {col_info.confidence:.0%})\")\n",
- " print(f\"-\" * 70)\n",
- " \n",
+ " print(\"-\" * 70)\n",
+ "\n",
  " if analysis:\n",
- " print(f\"📊 Distribution Statistics:\")\n",
+ " print(\"📊 Distribution Statistics:\")\n",
  " print(f\" Mean: {analysis.mean:.3f} | Median: {analysis.median:.3f} | Std: {analysis.std:.3f}\")\n",
  " print(f\" Range: [{analysis.min_value:.3f}, {analysis.max_value:.3f}]\")\n",
  " print(f\" Percentiles: 1%={analysis.percentiles['p1']:.3f}, 25%={analysis.q1:.3f}, 75%={analysis.q3:.3f}, 99%={analysis.percentiles['p99']:.3f}\")\n",
- " print(f\"\\n📈 Shape Analysis:\")\n",
+ " print(\"\\n📈 Shape Analysis:\")\n",
  " skew_label = '(Right-skewed)' if analysis.skewness > 0.5 else '(Left-skewed)' if analysis.skewness < -0.5 else '(Symmetric)'\n",
  " print(f\" Skewness: {analysis.skewness:.2f} {skew_label}\")\n",
  " kurt_label = '(Heavy tails/outliers)' if analysis.kurtosis > 3 else '(Light tails)'\n",
  " print(f\" Kurtosis: {analysis.kurtosis:.2f} {kurt_label}\")\n",
  " print(f\" Zeros: {analysis.zero_count:,} ({analysis.zero_percentage:.1f}%)\")\n",
  " print(f\" Outliers (IQR): {analysis.outlier_count_iqr:,} ({analysis.outlier_percentage:.1f}%)\")\n",
- " \n",
+ "\n",
  " if rec:\n",
  " print(f\"\\n🔧 Recommended Transformation: {rec.recommended_transform.value}\")\n",
  " print(f\" Reason: {rec.reason}\")\n",
@@ -458,45 +461,45 @@
  " if rec.warnings:\n",
  " for warn in rec.warnings:\n",
  " print(f\" ⚠️ {warn}\")\n",
- " \n",
+ "\n",
  " # Create enhanced histogram with Plotly\n",
  " data = df[col_name].dropna()\n",
  " fig = go.Figure()\n",
- " \n",
+ "\n",
  " fig.add_trace(go.Histogram(x=data, nbinsx=50, name='Distribution',\n",
  " marker_color='steelblue', opacity=0.7))\n",
- " \n",
+ "\n",
  " # Calculate mean and median\n",
  " mean_val = data.mean()\n",
  " median_val = data.median()\n",
- " \n",
+ "\n",
  " # Position labels on opposite sides (left/right) to avoid overlap\n",
  " # The larger value gets right-justified, smaller gets left-justified\n",
  " mean_position = \"top right\" if mean_val >= median_val else \"top left\"\n",
  " median_position = \"top left\" if mean_val >= median_val else \"top right\"\n",
- " \n",
+ "\n",
  " # Add mean line\n",
  " fig.add_vline(\n",
- " x=mean_val, \n",
- " line_dash=\"dash\", \n",
+ " x=mean_val,\n",
+ " line_dash=\"dash\",\n",
  " line_color=\"red\",\n",
  " annotation_text=f\"Mean: {mean_val:.2f}\",\n",
  " annotation_position=mean_position,\n",
  " annotation_font_color=\"red\",\n",
  " annotation_bgcolor=\"rgba(255,255,255,0.8)\"\n",
  " )\n",
- " \n",
+ "\n",
  " # Add median line\n",
  " fig.add_vline(\n",
- " x=median_val, \n",
- " line_dash=\"solid\", \n",
+ " x=median_val,\n",
+ " line_dash=\"solid\",\n",
  " line_color=\"green\",\n",
  " annotation_text=f\"Median: {median_val:.2f}\",\n",
  " annotation_position=median_position,\n",
  " annotation_font_color=\"green\",\n",
  " annotation_bgcolor=\"rgba(255,255,255,0.8)\"\n",
  " )\n",
- " \n",
+ "\n",
  " # Add 99th percentile marker if there are outliers\n",
  " if analysis and analysis.outlier_percentage > 5:\n",
  " fig.add_vline(x=analysis.percentiles['p99'], line_dash=\"dot\", line_color=\"orange\",\n",
@@ -504,7 +507,7 @@
  " annotation_position=\"top right\",\n",
  " annotation_font_color=\"orange\",\n",
  " annotation_bgcolor=\"rgba(255,255,255,0.8)\")\n",
- " \n",
+ "\n",
  " transform_label = rec.recommended_transform.value if rec else \"none\"\n",
  " fig.update_layout(\n",
  " title=f\"Distribution: {col_name}<br><sub>Skew: {analysis.skewness:.2f} | Kurt: {analysis.kurtosis:.2f} | Strategy: {transform_label}</sub>\",\n",
@@ -559,16 +562,16 @@
  " \"skewness\": stats.skew(series),\n",
  " \"kurtosis\": stats.kurtosis(series)\n",
  " })\n",
- " \n",
+ "\n",
  " stats_df = pd.DataFrame(stats_data)\n",
- " \n",
+ "\n",
  " # Format for display\n",
  " display_stats = stats_df.copy()\n",
  " for col in [\"mean\", \"std\", \"min\", \"25%\", \"50%\", \"75%\", \"95%\", \"99%\", \"max\"]:\n",
  " display_stats[col] = display_stats[col].apply(lambda x: f\"{x:.3f}\")\n",
  " display_stats[\"skewness\"] = display_stats[\"skewness\"].apply(lambda x: f\"{x:.3f}\")\n",
  " display_stats[\"kurtosis\"] = display_stats[\"kurtosis\"].apply(lambda x: f\"{x:.3f}\")\n",
- " \n",
+ "\n",
  " print(\"=\" * 80)\n",
  " print(\"NUMERICAL FEATURE STATISTICS\")\n",
  " print(\"=\" * 80)\n",
@@ -621,7 +624,7 @@
  "for col_name in numeric_cols:\n",
  " analysis = analyses.get(col_name)\n",
  " rec = recommendations.get(col_name)\n",
- " \n",
+ "\n",
  " if analysis and rec:\n",
  " summary_data.append({\n",
  " \"Column\": col_name,\n",
@@ -632,7 +635,7 @@
  " \"Transform\": rec.recommended_transform.value,\n",
  " \"Priority\": rec.priority\n",
  " })\n",
- " \n",
+ "\n",
  " # Add Gold transformation recommendation if not \"none\"\n",
  " if rec.recommended_transform != TransformationType.NONE and registry.gold:\n",
  " registry.add_gold_transformation(\n",
@@ -646,7 +649,7 @@
  "if summary_data:\n",
  " summary_df = pd.DataFrame(summary_data)\n",
  " display_table(summary_df)\n",
- " \n",
+ "\n",
  " # Show how many transformation recommendations were added\n",
  " transform_count = sum(1 for r in recommendations.values() if r and r.recommended_transform != TransformationType.NONE)\n",
  " if transform_count > 0 and registry.gold:\n",
@@ -728,7 +731,7 @@
  "cat_analyses = cat_analyzer.analyze_dataframe(df, categorical_cols)\n",
  "\n",
  "# Get encoding recommendations\n",
- "cyclical_cols = [name for name, col in findings.columns.items() \n",
+ "cyclical_cols = [name for name, col in findings.columns.items()\n",
  " if col.inferred_type == ColumnType.CATEGORICAL_CYCLICAL]\n",
  "cat_recommendations = cat_analyzer.get_all_recommendations(df, categorical_cols, cyclical_columns=cyclical_cols)\n",
  "\n",
@@ -736,56 +739,56 @@
  " col_info = findings.columns[col_name]\n",
  " analysis = cat_analyses.get(col_name)\n",
  " rec = next((r for r in cat_recommendations if r.column_name == col_name), None)\n",
- " \n",
+ "\n",
  " print(f\"\\n{'='*70}\")\n",
  " print(f\"Column: {col_name}\")\n",
  " print(f\"Type: {col_info.inferred_type.value} (Confidence: {col_info.confidence:.0%})\")\n",
- " print(f\"-\" * 70)\n",
- " \n",
+ " print(\"-\" * 70)\n",
+ "\n",
  " if analysis:\n",
- " print(f\"\\n📊 Distribution Metrics:\")\n",
+ " print(\"\\n📊 Distribution Metrics:\")\n",
  " print(f\" Categories: {analysis.category_count}\")\n",
  " print(f\" Imbalance Ratio: {analysis.imbalance_ratio:.1f}x (largest/smallest)\")\n",
  " print(f\" Entropy: {analysis.entropy:.2f} ({analysis.normalized_entropy*100:.0f}% of max)\")\n",
  " print(f\" Top-1 Concentration: {analysis.top1_concentration:.1f}%\")\n",
  " print(f\" Top-3 Concentration: {analysis.top3_concentration:.1f}%\")\n",
  " print(f\" Rare Categories (<1%): {analysis.rare_category_count}\")\n",
- " \n",
+ "\n",
  " # Interpretation\n",
- " print(f\"\\n📈 Interpretation:\")\n",
+ " print(\"\\n📈 Interpretation:\")\n",
  " if analysis.has_low_diversity:\n",
- " print(f\" ⚠️ LOW DIVERSITY: Distribution dominated by few categories\")\n",
+ " print(\" ⚠️ LOW DIVERSITY: Distribution dominated by few categories\")\n",
  " elif analysis.normalized_entropy > 0.9:\n",
- " print(f\" ✓ HIGH DIVERSITY: Categories are relatively balanced\")\n",
+ " print(\" ✓ HIGH DIVERSITY: Categories are relatively balanced\")\n",
  " else:\n",
- " print(f\" ✓ MODERATE DIVERSITY: Some category dominance but acceptable\")\n",
- " \n",
+ " print(\" ✓ MODERATE DIVERSITY: Some category dominance but acceptable\")\n",
+ "\n",
  " if analysis.imbalance_ratio > 100:\n",
- " print(f\" 🔴 SEVERE IMBALANCE: Rarest category has very few samples\")\n",
+ " print(\" 🔴 SEVERE IMBALANCE: Rarest category has very few samples\")\n",
  " elif analysis.is_imbalanced:\n",
- " print(f\" 🟡 MODERATE IMBALANCE: Consider grouping rare categories\")\n",
- " \n",
+ " print(\" 🟡 MODERATE IMBALANCE: Consider grouping rare categories\")\n",
+ "\n",
  " # Recommendations\n",
  " if rec:\n",
- " print(f\"\\n🔧 Recommendations:\")\n",
+ " print(\"\\n🔧 Recommendations:\")\n",
  " print(f\" Encoding: {rec.encoding_type.value}\")\n",
  " print(f\" Reason: {rec.reason}\")\n",
  " print(f\" Priority: {rec.priority}\")\n",
- " \n",
+ "\n",
  " if rec.preprocessing_steps:\n",
- " print(f\" Preprocessing:\")\n",
+ " print(\" Preprocessing:\")\n",
  " for step in rec.preprocessing_steps:\n",
  " print(f\" • {step}\")\n",
- " \n",
+ "\n",
  " if rec.warnings:\n",
  " for warn in rec.warnings:\n",
  " print(f\" ⚠️ {warn}\")\n",
- " \n",
+ "\n",
  " # Visualization\n",
  " value_counts = df[col_name].value_counts()\n",
  " subtitle = f\"Entropy: {analysis.normalized_entropy*100:.0f}% | Imbalance: {analysis.imbalance_ratio:.1f}x | Rare: {analysis.rare_category_count}\" if analysis else \"\"\n",
  " fig = charts.bar_chart(\n",
- " value_counts.head(10).index.tolist(), \n",
+ " value_counts.head(10).index.tolist(),\n",
  " value_counts.head(10).values.tolist(),\n",
  " title=f\"Top Categories: {col_name}<br><sub>{subtitle}</sub>\"\n",
  " )\n",
@@ -808,7 +811,7 @@
  " \"Rare (<1%)\": analysis.rare_category_count,\n",
  " \"Encoding\": rec.encoding_type.value if rec else \"N/A\"\n",
  " })\n",
- " \n",
+ "\n",
  " # Add encoding recommendation to Gold layer\n",
  " if rec and registry.gold:\n",
  " registry.add_gold_encoding(\n",
@@ -817,9 +820,9 @@
  " rationale=rec.reason,\n",
  " source_notebook=\"02_column_deep_dive\"\n",
  " )\n",
- " \n",
+ "\n",
  " display_table(pd.DataFrame(summary_data))\n",
- " \n",
+ "\n",
  " if registry.gold:\n",
  " print(f\"\\n✅ Added {len(cat_recommendations)} encoding recommendations to Gold layer\")"
  ]
@@ -900,76 +903,76 @@
  " print(f\"Column: {col_name}\")\n",
  " print(f\"Type: {col_info.inferred_type.value} (Confidence: {col_info.confidence:.0%})\")\n",
  " print(f\"{'='*70}\")\n",
- " \n",
+ "\n",
  " date_series = pd.to_datetime(df[col_name], errors='coerce', format='mixed')\n",
  " valid_dates = date_series.dropna()\n",
- " \n",
+ "\n",
  " print(f\"\\n📅 Date Range: {valid_dates.min()} to {valid_dates.max()}\")\n",
  " print(f\" Nulls: {date_series.isna().sum():,} ({date_series.isna().mean()*100:.1f}%)\")\n",
- " \n",
+ "\n",
  " # Basic temporal analysis\n",
  " analysis = temporal_analyzer.analyze(date_series)\n",
  " print(f\" Auto-detected granularity: {analysis.granularity.value}\")\n",
  " print(f\" Span: {analysis.span_days:,} days ({analysis.span_days/365:.1f} years)\")\n",
- " \n",
+ "\n",
  " # Growth analysis\n",
  " growth = temporal_analyzer.calculate_growth_rate(date_series)\n",
  " if growth.get(\"has_data\"):\n",
- " print(f\"\\n📈 Growth Analysis:\")\n",
+ " print(\"\\n📈 Growth Analysis:\")\n",
  " print(f\" Trend: {growth['trend_direction'].upper()}\")\n",
  " print(f\" Overall growth: {growth['overall_growth_pct']:+.1f}%\")\n",
  " print(f\" Avg monthly growth: {growth['avg_monthly_growth']:+.1f}%\")\n",
- " \n",
+ "\n",
  " # Seasonality analysis\n",
  " seasonality = temporal_analyzer.analyze_seasonality(date_series)\n",
  " if seasonality.has_seasonality:\n",
- " print(f\"\\n🔄 Seasonality Detected:\")\n",
+ " print(\"\\n🔄 Seasonality Detected:\")\n",
  " print(f\" Peak months: {', '.join(seasonality.peak_periods[:3])}\")\n",
  " print(f\" Trough months: {', '.join(seasonality.trough_periods[:3])}\")\n",
  " print(f\" Seasonal strength: {seasonality.seasonal_strength:.2f}\")\n",
- " \n",
+ "\n",
  " # Get recommendations using framework\n",
  " other_dates = [c for c in datetime_cols if c != col_name]\n",
  " recommendations = temporal_analyzer.recommend_features(date_series, col_name, other_date_columns=other_dates)\n",
- " \n",
+ "\n",
  " # Group by recommendation type\n",
  " col_feature_recs = [r for r in recommendations if r.recommendation_type == TemporalRecommendationType.FEATURE_ENGINEERING]\n",
  " col_modeling_recs = [r for r in recommendations if r.recommendation_type == TemporalRecommendationType.MODELING_STRATEGY]\n",
  " col_quality_recs = [r for r in recommendations if r.recommendation_type == TemporalRecommendationType.DATA_QUALITY]\n",
- " \n",
+ "\n",
  " feature_engineering_recs.extend(col_feature_recs)\n",
  " modeling_strategy_recs.extend(col_modeling_recs)\n",
  " data_quality_recs.extend(col_quality_recs)\n",
- " \n",
+ "\n",
  " # Display recommendations grouped by type\n",
  " if col_feature_recs:\n",
- " print(f\"\\n🛠️ FEATURES TO CREATE:\")\n",
+ " print(\"\\n🛠️ FEATURES TO CREATE:\")\n",
  " for rec in col_feature_recs:\n",
  " priority_icon = \"🔴\" if rec.priority == \"high\" else \"🟡\" if rec.priority == \"medium\" else \"✓\"\n",
  " print(f\" {priority_icon} {rec.feature_name} ({rec.category})\")\n",
  " print(f\" Why: {rec.reason}\")\n",
  " if rec.code_hint:\n",
  " print(f\" Code: {rec.code_hint}\")\n",
- " \n",
+ "\n",
  " if col_modeling_recs:\n",
- " print(f\"\\n⚙️ MODELING CONSIDERATIONS:\")\n",
+ " print(\"\\n⚙️ MODELING CONSIDERATIONS:\")\n",
  " for rec in col_modeling_recs:\n",
  " priority_icon = \"🔴\" if rec.priority == \"high\" else \"🟡\" if rec.priority == \"medium\" else \"✓\"\n",
  " print(f\" {priority_icon} {rec.feature_name}\")\n",
  " print(f\" Why: {rec.reason}\")\n",
- " \n",
+ "\n",
  " if col_quality_recs:\n",
- " print(f\"\\n⚠️ DATA QUALITY ISSUES:\")\n",
+ " print(\"\\n⚠️ DATA QUALITY ISSUES:\")\n",
  " for rec in col_quality_recs:\n",
  " priority_icon = \"🔴\" if rec.priority == \"high\" else \"🟡\" if rec.priority == \"medium\" else \"✓\"\n",
  " print(f\" {priority_icon} {rec.feature_name}\")\n",
  " print(f\" Why: {rec.reason}\")\n",
  " if rec.code_hint:\n",
  " print(f\" Code: {rec.code_hint}\")\n",
- " \n",
+ "\n",
  " # Standard extractions always available\n",
- " print(f\"\\n Standard extractions available: year, month, day, day_of_week, quarter\")\n",
- " \n",
+ " print(\"\\n Standard extractions available: year, month, day, day_of_week, quarter\")\n",
+ "\n",
  " # Store summary\n",
  " datetime_summaries.append({\n",
  " \"Column\": col_name,\n",
@@ -980,31 +983,31 @@
  " \"Modeling Notes\": len(col_modeling_recs),\n",
  " \"Quality Issues\": len(col_quality_recs)\n",
  " })\n",
- " \n",
+ "\n",
  " # === VISUALIZATIONS ===\n",
- " \n",
+ "\n",
  " if growth.get(\"has_data\"):\n",
  " fig = charts.growth_summary_indicators(growth, title=f\"Growth Summary: {col_name}\")\n",
  " display_figure(fig)\n",
- " \n",
+ "\n",
  " chart_type = \"line\" if analysis.granularity in [TemporalGranularity.DAY, TemporalGranularity.WEEK] else \"bar\"\n",
  " fig = charts.temporal_distribution(analysis, title=f\"Records Over Time: {col_name}\", chart_type=chart_type)\n",
  " display_figure(fig)\n",
- " \n",
+ "\n",
  " fig = charts.temporal_trend(analysis, title=f\"Trend Analysis: {col_name}\")\n",
  " display_figure(fig)\n",
- " \n",
+ "\n",
  " yoy_data = temporal_analyzer.year_over_year_comparison(date_series)\n",
  " if len(yoy_data) > 1:\n",
  " fig = charts.year_over_year_lines(yoy_data, title=f\"Year-over-Year: {col_name}\")\n",
  " display_figure(fig)\n",
  " fig = charts.year_month_heatmap(yoy_data, title=f\"Records Heatmap: {col_name}\")\n",
  " display_figure(fig)\n",
- " \n",
+ "\n",
  " if growth.get(\"has_data\"):\n",
  " fig = charts.cumulative_growth_chart(growth[\"cumulative\"], title=f\"Cumulative Records: {col_name}\")\n",
  " display_figure(fig)\n",
- " \n",
+ "\n",
  " fig = charts.temporal_heatmap(date_series, title=f\"Day of Week Distribution: {col_name}\")\n",
  " display_figure(fig)\n",
  "\n",
@@ -1014,32 +1017,32 @@
  " print(\"DATETIME COLUMNS SUMMARY\")\n",
  " print(\"=\" * 70)\n",
  " display_table(pd.DataFrame(datetime_summaries))\n",
- " \n",
+ "\n",
  " # Summary by recommendation type\n",
  " print(\"\\n📋 ALL RECOMMENDATIONS BY TYPE:\")\n",
- " \n",
+ "\n",
  " if feature_engineering_recs:\n",
  " print(f\"\\n🛠️ FEATURES TO CREATE ({len(feature_engineering_recs)}):\")\n",
  " for i, rec in enumerate(feature_engineering_recs, 1):\n",
  " priority_icon = \"🔴\" if rec.priority == \"high\" else \"🟡\" if rec.priority == \"medium\" else \"✓\"\n",
  " print(f\" {i}. {priority_icon} {rec.feature_name}\")\n",
- " \n",
+ "\n",
  " if modeling_strategy_recs:\n",
  " print(f\"\\n⚙️ MODELING CONSIDERATIONS ({len(modeling_strategy_recs)}):\")\n",
  " for i, rec in enumerate(modeling_strategy_recs, 1):\n",
  " priority_icon = \"🔴\" if rec.priority == \"high\" else \"🟡\" if rec.priority == \"medium\" else \"✓\"\n",
  " print(f\" {i}. {priority_icon} {rec.feature_name}: {rec.reason}\")\n",
- " \n",
+ "\n",
  " if data_quality_recs:\n",
  " print(f\"\\n⚠️ DATA QUALITY TO ADDRESS ({len(data_quality_recs)}):\")\n",
  " for i, rec in enumerate(data_quality_recs, 1):\n",
  " priority_icon = \"🔴\" if rec.priority == \"high\" else \"🟡\" if rec.priority == \"medium\" else \"✓\"\n",
  " print(f\" {i}. {priority_icon} {rec.feature_name}: {rec.reason}\")\n",
- " \n",
+ "\n",
  " # Add recommendations to registry\n",
  " added_derived = 0\n",
  " added_modeling = 0\n",
- " \n",
+ "\n",
  " # Add feature engineering recommendations to Silver layer (derived columns)\n",
  " if registry.silver:\n",
  " for rec in feature_engineering_recs:\n",
@@ -1051,7 +1054,7 @@
  " source_notebook=\"02_column_deep_dive\"\n",
  " )\n",
  " added_derived += 1\n",
- " \n",
+ "\n",
  " # Add modeling strategy recommendations to Bronze layer\n",
  " seen_strategies = set()\n",
  " for rec in modeling_strategy_recs:\n",
@@ -1065,7 +1068,7 @@
  " )\n",
  " seen_strategies.add(rec.feature_name)\n",
  " added_modeling += 1\n",
- " \n",
+ "\n",
  " print(f\"\\n✅ Added {added_derived} derived column recommendations to Silver layer\")\n",
  " print(f\"✅ Added {added_modeling} modeling strategy recommendations to Bronze layer\")"
  ]
@@ -1217,14 +1220,14 @@
  " max_segments=5\n",
  ")\n",
  "\n",
- "print(f\"\\n🎯 Analysis Results:\")\n",
+ "print(\"\\n🎯 Analysis Results:\")\n",
  "print(f\" Method: {segmentation.method.value}\")\n",
  "print(f\" Detected Segments: {segmentation.n_segments}\")\n",
  "print(f\" Cluster Quality Score: {segmentation.quality_score:.2f}\")\n",
  "if segmentation.target_variance_ratio is not None:\n",
  " print(f\" Target Variance Ratio: {segmentation.target_variance_ratio:.2f}\")\n",
  "\n",
- "print(f\"\\n📊 Segment Profiles:\")\n",
+ "print(\"\\n📊 Segment Profiles:\")\n",
  "for profile in segmentation.profiles:\n",
  " target_info = f\" | Target Rate: {profile.target_rate*100:.1f}%\" if profile.target_rate is not None else \"\"\n",
  " print(f\" Segment {profile.segment_id}: {profile.size:,} records ({profile.size_pct:.1f}%){target_info}\")\n",
@@ -1242,7 +1245,7 @@
  " fig = charts.segment_feature_comparison(segmentation, title=\"Feature Comparison Across Segments\")\n",
  " display_figure(fig)\n",
  "\n",
- "print(f\"\\n📝 Rationale:\")\n",
+ "print(\"\\n📝 Rationale:\")\n",
  "for reason in segmentation.rationale:\n",
  " print(f\" • {reason}\")"
  ]
@@ -1297,7 +1300,7 @@
  "\n",
  "# Summary of recommendations\n",
  "all_recs = registry.all_recommendations\n",
- "print(f\"\\n📋 Recommendations Summary:\")\n",
+ "print(\"\\n📋 Recommendations Summary:\")\n",
  "print(f\" Bronze layer: {len(registry.get_by_layer('bronze'))} recommendations\")\n",
  "print(f\" Silver layer: {len(registry.get_by_layer('silver'))} recommendations\")\n",
  "print(f\" Gold layer: {len(registry.get_by_layer('gold'))} recommendations\")\n",