churnkit 0.76.0a3__py3-none-any.whl → 0.76.1a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +10 -5
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +11 -9
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +52 -46
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +68 -65
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +12 -27
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +216 -221
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +88 -81
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +111 -108
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +44 -38
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +89 -85
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +81 -80
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +83 -89
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +102 -98
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +32 -31
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +33 -29
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +6 -5
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +67 -63
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +38 -23
- {churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +3 -1
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/METADATA +1 -1
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/RECORD +31 -31
- customer_retention/__init__.py +1 -1
- customer_retention/analysis/auto_explorer/explorer.py +2 -2
- customer_retention/analysis/notebook_progress.py +14 -2
- customer_retention/core/compat/__init__.py +10 -0
- customer_retention/core/config/experiments.py +45 -0
- customer_retention/integrations/databricks_init.py +41 -1
- customer_retention/stages/profiling/column_profiler.py +9 -2
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/WHEEL +0 -0
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/entry_points.txt +0 -0
- {churnkit-0.76.0a3.dist-info → churnkit-0.76.1a2.dist-info}/licenses/LICENSE +0 -0
|
@@ -112,22 +112,27 @@
|
|
|
112
112
|
"outputs": [],
|
|
113
113
|
"source": [
|
|
114
114
|
"from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
|
|
115
|
+
"\n",
|
|
115
116
|
"track_and_export_previous(\"02a_text_columns_deep_dive.ipynb\")\n",
|
|
116
117
|
"\n",
|
|
117
|
-
"from customer_retention.analysis.auto_explorer import ExplorationFindings, TextProcessingMetadata\n",
|
|
118
|
-
"from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table, console\n",
|
|
119
|
-
"from customer_retention.core.config.column_config import ColumnType\n",
|
|
120
|
-
"from customer_retention.stages.profiling import (\n",
|
|
121
|
-
" TextColumnProcessor, TextProcessingConfig, TextColumnResult,\n",
|
|
122
|
-
" TextEmbedder, TextDimensionalityReducer,\n",
|
|
123
|
-
" EMBEDDING_MODELS, get_model_info, list_available_models\n",
|
|
124
|
-
")\n",
|
|
125
|
-
"import pandas as pd\n",
|
|
126
118
|
"import numpy as np\n",
|
|
127
|
-
"import plotly.graph_objects as go\n",
|
|
128
119
|
"import plotly.express as px\n",
|
|
120
|
+
"import plotly.graph_objects as go\n",
|
|
129
121
|
"from plotly.subplots import make_subplots\n",
|
|
130
|
-
"
|
|
122
|
+
"\n",
|
|
123
|
+
"from customer_retention.analysis.auto_explorer import ExplorationFindings, TextProcessingMetadata\n",
|
|
124
|
+
"from customer_retention.analysis.visualization import ChartBuilder, display_figure\n",
|
|
125
|
+
"from customer_retention.core.config.column_config import ColumnType\n",
|
|
126
|
+
"from customer_retention.core.config.experiments import (\n",
|
|
127
|
+
" EXPERIMENTS_DIR,\n",
|
|
128
|
+
" FINDINGS_DIR,\n",
|
|
129
|
+
")\n",
|
|
130
|
+
"from customer_retention.stages.profiling import (\n",
|
|
131
|
+
" TextColumnProcessor,\n",
|
|
132
|
+
" TextProcessingConfig,\n",
|
|
133
|
+
" get_model_info,\n",
|
|
134
|
+
" list_available_models,\n",
|
|
135
|
+
")"
|
|
131
136
|
]
|
|
132
137
|
},
|
|
133
138
|
{
|
|
@@ -249,7 +254,7 @@
|
|
|
249
254
|
},
|
|
250
255
|
"outputs": [],
|
|
251
256
|
"source": [
|
|
252
|
-
"from customer_retention.stages.temporal import load_data_with_snapshot_preference
|
|
257
|
+
"from customer_retention.stages.temporal import load_data_with_snapshot_preference\n",
|
|
253
258
|
"\n",
|
|
254
259
|
"df, data_source = load_data_with_snapshot_preference(findings, output_dir=str(FINDINGS_DIR))\n",
|
|
255
260
|
"charts = ChartBuilder()\n",
|
|
@@ -427,27 +432,27 @@
|
|
|
427
432
|
" print(f\"\\n{'='*70}\")\n",
|
|
428
433
|
" print(f\"Column: {col_name}\")\n",
|
|
429
434
|
" print(f\"{'='*70}\")\n",
|
|
430
|
-
"
|
|
435
|
+
"\n",
|
|
431
436
|
" text_series = df[col_name].fillna(\"\")\n",
|
|
432
|
-
"
|
|
437
|
+
"\n",
|
|
433
438
|
" # Basic statistics\n",
|
|
434
439
|
" non_empty = (text_series.str.len() > 0).sum()\n",
|
|
435
440
|
" avg_length = text_series.str.len().mean()\n",
|
|
436
441
|
" max_length = text_series.str.len().max()\n",
|
|
437
|
-
"
|
|
438
|
-
" print(
|
|
442
|
+
"\n",
|
|
443
|
+
" print(\"\\n\\U0001f4ca Statistics:\")\n",
|
|
439
444
|
" print(f\" Total rows: {len(text_series):,}\")\n",
|
|
440
445
|
" print(f\" Non-empty: {non_empty:,} ({non_empty/len(text_series)*100:.1f}%)\")\n",
|
|
441
446
|
" print(f\" Avg length: {avg_length:.0f} characters\")\n",
|
|
442
447
|
" print(f\" Max length: {max_length:,} characters\")\n",
|
|
443
|
-
"
|
|
448
|
+
"\n",
|
|
444
449
|
" # Sample texts\n",
|
|
445
|
-
" print(
|
|
450
|
+
" print(\"\\n\\U0001f4dd Sample texts:\")\n",
|
|
446
451
|
" samples = text_series[text_series.str.len() > 10].head(3)\n",
|
|
447
452
|
" for i, sample in enumerate(samples, 1):\n",
|
|
448
453
|
" truncated = sample[:100] + \"...\" if len(sample) > 100 else sample\n",
|
|
449
454
|
" print(f\" {i}. {truncated}\")\n",
|
|
450
|
-
"
|
|
455
|
+
"\n",
|
|
451
456
|
" # Text length distribution\n",
|
|
452
457
|
" lengths = text_series.str.len()\n",
|
|
453
458
|
" fig = go.Figure()\n",
|
|
@@ -511,27 +516,27 @@
|
|
|
511
516
|
"source": [
|
|
512
517
|
"if text_columns:\n",
|
|
513
518
|
" processor = TextColumnProcessor(config)\n",
|
|
514
|
-
"
|
|
519
|
+
"\n",
|
|
515
520
|
" print(\"Processing TEXT columns...\")\n",
|
|
516
521
|
" print(\"(This may take a moment for large datasets)\\n\")\n",
|
|
517
|
-
"
|
|
522
|
+
"\n",
|
|
518
523
|
" results = []\n",
|
|
519
524
|
" df_processed = df.copy()\n",
|
|
520
|
-
"
|
|
525
|
+
"\n",
|
|
521
526
|
" for col_name in text_columns:\n",
|
|
522
527
|
" print(f\"\\n{'='*70}\")\n",
|
|
523
528
|
" print(f\"Processing: {col_name}\")\n",
|
|
524
529
|
" print(f\"{'='*70}\")\n",
|
|
525
|
-
"
|
|
530
|
+
"\n",
|
|
526
531
|
" df_processed, result = processor.process_column(df_processed, col_name)\n",
|
|
527
532
|
" results.append(result)\n",
|
|
528
|
-
"
|
|
529
|
-
" print(
|
|
533
|
+
"\n",
|
|
534
|
+
" print(\"\\n\\u2705 Processing complete:\")\n",
|
|
530
535
|
" print(f\" Embedding shape: {result.embeddings_shape}\")\n",
|
|
531
536
|
" print(f\" Components kept: {result.n_components}\")\n",
|
|
532
537
|
" print(f\" Explained variance: {result.explained_variance:.1%}\")\n",
|
|
533
538
|
" print(f\" Features created: {', '.join(result.component_columns)}\")\n",
|
|
534
|
-
"
|
|
539
|
+
"\n",
|
|
535
540
|
" print(f\"\\n\\n{'='*70}\")\n",
|
|
536
541
|
" print(\"PROCESSING SUMMARY\")\n",
|
|
537
542
|
" print(f\"{'='*70}\")\n",
|
|
@@ -586,32 +591,32 @@
|
|
|
586
591
|
" print(f\"\\n{'='*70}\")\n",
|
|
587
592
|
" print(f\"Results: {result.column_name}\")\n",
|
|
588
593
|
" print(f\"{'='*70}\")\n",
|
|
589
|
-
"
|
|
594
|
+
"\n",
|
|
590
595
|
" # Explained variance per component\n",
|
|
591
596
|
" reducer = processor._reducers[result.column_name]\n",
|
|
592
597
|
" var_ratios = reducer._pca.explained_variance_ratio_\n",
|
|
593
598
|
" cumulative = np.cumsum(var_ratios)\n",
|
|
594
|
-
"
|
|
599
|
+
"\n",
|
|
595
600
|
" fig = make_subplots(rows=1, cols=2,\n",
|
|
596
601
|
" subplot_titles=(\"Variance per Component\", \"Cumulative Variance\"))\n",
|
|
597
|
-
"
|
|
602
|
+
"\n",
|
|
598
603
|
" fig.add_trace(go.Bar(\n",
|
|
599
604
|
" x=[f\"PC{i+1}\" for i in range(len(var_ratios))],\n",
|
|
600
605
|
" y=var_ratios,\n",
|
|
601
606
|
" marker_color='steelblue'\n",
|
|
602
607
|
" ), row=1, col=1)\n",
|
|
603
|
-
"
|
|
608
|
+
"\n",
|
|
604
609
|
" fig.add_trace(go.Scatter(\n",
|
|
605
610
|
" x=[f\"PC{i+1}\" for i in range(len(cumulative))],\n",
|
|
606
611
|
" y=cumulative,\n",
|
|
607
612
|
" mode='lines+markers',\n",
|
|
608
613
|
" line_color='green'\n",
|
|
609
614
|
" ), row=1, col=2)\n",
|
|
610
|
-
"
|
|
615
|
+
"\n",
|
|
611
616
|
" fig.add_hline(y=config.variance_threshold, line_dash=\"dash\", line_color=\"red\",\n",
|
|
612
617
|
" annotation_text=f\"Target: {config.variance_threshold:.0%}\",\n",
|
|
613
618
|
" row=1, col=2)\n",
|
|
614
|
-
"
|
|
619
|
+
"\n",
|
|
615
620
|
" fig.update_layout(\n",
|
|
616
621
|
" title=f\"PCA Results: {result.column_name}\",\n",
|
|
617
622
|
" height=400,\n",
|
|
@@ -621,7 +626,7 @@
|
|
|
621
626
|
" fig.update_yaxes(title_text=\"Variance Ratio\", row=1, col=1)\n",
|
|
622
627
|
" fig.update_yaxes(title_text=\"Cumulative Variance\", row=1, col=2)\n",
|
|
623
628
|
" display_figure(fig)\n",
|
|
624
|
-
"
|
|
629
|
+
"\n",
|
|
625
630
|
" # PC feature distributions\n",
|
|
626
631
|
" if len(result.component_columns) >= 2:\n",
|
|
627
632
|
" fig = px.scatter(\n",
|
|
@@ -687,16 +692,17 @@
|
|
|
687
692
|
" processing_approach=\"pca\"\n",
|
|
688
693
|
" )\n",
|
|
689
694
|
" findings.text_processing[result.column_name] = metadata\n",
|
|
690
|
-
"
|
|
695
|
+
"\n",
|
|
691
696
|
" print(f\"\\u2705 Added metadata for {result.column_name}:\")\n",
|
|
692
697
|
" print(f\" Model: {metadata.embedding_model}\")\n",
|
|
693
698
|
" print(f\" Components: {metadata.n_components}\")\n",
|
|
694
699
|
" print(f\" Explained variance: {metadata.explained_variance:.1%}\")\n",
|
|
695
|
-
"
|
|
700
|
+
"\n",
|
|
696
701
|
" findings.save(FINDINGS_PATH)\n",
|
|
697
702
|
" print(f\"\\nFindings saved to: {FINDINGS_PATH}\")\n",
|
|
698
703
|
"\n",
|
|
699
704
|
"from customer_retention.analysis.notebook_html_exporter import export_notebook_html\n",
|
|
705
|
+
"\n",
|
|
700
706
|
"export_notebook_html(Path(\"02a_text_columns_deep_dive.ipynb\"), EXPERIMENTS_DIR / \"docs\")\n"
|
|
701
707
|
]
|
|
702
708
|
},
|
|
@@ -743,15 +749,15 @@
|
|
|
743
749
|
" print(\"\\n\" + \"=\"*70)\n",
|
|
744
750
|
" print(\"PRODUCTION RECOMMENDATIONS\")\n",
|
|
745
751
|
" print(\"=\"*70)\n",
|
|
746
|
-
"
|
|
752
|
+
"\n",
|
|
747
753
|
" for result in results:\n",
|
|
748
754
|
" print(f\"\\n\\U0001f527 {result.column_name}:\")\n",
|
|
749
|
-
" print(
|
|
755
|
+
" print(\" Action: embed_reduce (embeddings + PCA)\")\n",
|
|
750
756
|
" print(f\" Model: {config.embedding_model}\")\n",
|
|
751
757
|
" print(f\" Variance threshold: {config.variance_threshold:.0%}\")\n",
|
|
752
758
|
" print(f\" Expected features: {result.n_components}\")\n",
|
|
753
759
|
" print(f\" Feature names: {', '.join(result.component_columns[:3])}...\")\n",
|
|
754
|
-
"
|
|
760
|
+
"\n",
|
|
755
761
|
" print(\"\\n\\U0001f4a1 These recommendations will be used by the pipeline generator.\")\n",
|
|
756
762
|
" print(\" The same processing will be applied in production.\")"
|
|
757
763
|
]
|