churnkit-0.76.1a1-py3-none-any.whl → churnkit-0.76.1a2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +10 -5
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +6 -6
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +52 -46
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +68 -65
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +12 -27
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +216 -221
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +88 -81
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +111 -108
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +44 -38
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +89 -85
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +81 -80
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +83 -89
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +102 -98
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +32 -31
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +33 -29
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +6 -5
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +67 -63
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +38 -23
- {churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +3 -1
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/METADATA +1 -1
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/RECORD +30 -30
- customer_retention/__init__.py +1 -1
- customer_retention/analysis/auto_explorer/explorer.py +2 -2
- customer_retention/analysis/notebook_progress.py +4 -1
- customer_retention/core/compat/__init__.py +10 -0
- customer_retention/integrations/databricks_init.py +13 -0
- customer_retention/stages/profiling/column_profiler.py +9 -2
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/WHEEL +0 -0
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/entry_points.txt +0 -0
- {churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/licenses/LICENSE +0 -0
{churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb

@@ -70,10 +70,12 @@
 "outputs": [],
 "source": [
 "from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
+"\n",
 "track_and_export_previous(\"10_spec_generation.ipynb\")\n",
 "\n",
-"from pathlib import Path\n",
 "from enum import Enum\n",
+"from pathlib import Path\n",
+"\n",
 "\n",
 "class GenerationTarget(Enum):\n",
 "    LOCAL_FEAST_MLFLOW = \"local\"\n",
@@ -99,8 +101,7 @@
 "\n",
 "print(f\"Pipeline: {PIPELINE_NAME}\")\n",
 "print(f\"Target: {GENERATION_TARGET.value}\")\n",
-"print(f\"Format: {OUTPUT_FORMAT.value}\")
-"from customer_retention.stages.temporal import TEMPORAL_METADATA_COLS"
+"print(f\"Format: {OUTPUT_FORMAT.value}\")"
 ]
 },
 {
@@ -143,9 +144,11 @@
 "outputs": [],
 "source": [
 "import yaml\n",
+"\n",
 "from customer_retention.analysis.auto_explorer import ExplorationFindings\n",
 "from customer_retention.analysis.auto_explorer.layered_recommendations import RecommendationRegistry\n",
-"from customer_retention.core.config.experiments import
+"from customer_retention.core.config.experiments import EXPERIMENTS_DIR, FINDINGS_DIR\n",
+"\n",
 "\n",
 "def load_findings_and_recommendations(findings_dir: Path):\n",
 "    findings_files = sorted(\n",
@@ -154,37 +157,37 @@
 "    )\n",
 "    if not findings_files:\n",
 "        raise FileNotFoundError(f\"No findings in {findings_dir}. Run exploration notebooks first.\")\n",
-"
+"\n",
 "    findings = ExplorationFindings.load(str(findings_files[0]))\n",
-"
+"\n",
 "    # Look for recommendations file matching the findings file pattern\n",
 "    # Step 06 saves as: {name}_recommendations.yaml (matching {name}_findings.yaml)\n",
 "    findings_name = findings_files[0].stem.replace(\"_findings\", \"\")\n",
 "    recommendations_path = findings_dir / f\"{findings_name}_recommendations.yaml\"\n",
-"
+"\n",
 "    # Fallback to generic recommendations.yaml if not found\n",
 "    if not recommendations_path.exists():\n",
 "        recommendations_path = findings_dir / \"recommendations.yaml\"\n",
-"
+"\n",
 "    # Final fallback: find any *_recommendations.yaml\n",
 "    if not recommendations_path.exists():\n",
-"        rec_files = sorted(findings_dir.glob(\"*_recommendations.yaml\")
+"        rec_files = sorted(findings_dir.glob(\"*_recommendations.yaml\"),\n",
 "                           key=lambda f: f.stat().st_mtime, reverse=True)\n",
 "        if rec_files:\n",
 "            recommendations_path = rec_files[0]\n",
-"
+"\n",
 "    registry = None\n",
 "    if recommendations_path.exists():\n",
 "        with open(recommendations_path) as f:\n",
 "            registry = RecommendationRegistry.from_dict(yaml.safe_load(f))\n",
 "        print(f\"Loaded recommendations from: {recommendations_path.name}\")\n",
-"
+"\n",
 "    multi_dataset_path = findings_dir / \"multi_dataset_findings.yaml\"\n",
 "    multi_dataset = None\n",
 "    if multi_dataset_path.exists():\n",
 "        with open(multi_dataset_path) as f:\n",
 "            multi_dataset = yaml.safe_load(f)\n",
-"
+"\n",
 "    return findings, registry, multi_dataset\n",
 "\n",
 "findings, registry, multi_dataset = load_findings_and_recommendations(FINDINGS_DIR)\n",
@@ -244,7 +247,7 @@
 "    if not registry:\n",
 "        print(\"No recommendations loaded. Run notebooks 02-07 first.\")\n",
 "        return\n",
-"
+"\n",
 "    for layer in [\"bronze\", \"silver\", \"gold\"]:\n",
 "        recs = registry.get_by_layer(layer)\n",
 "        print(f\"\\n{layer.upper()} ({len(recs)} recommendations):\")\n",
@@ -300,7 +303,6 @@
 },
 "outputs": [],
 "source": [
-"import os\n",
 "\n",
 "output_dir = OUTPUT_BASE_DIR / GENERATION_TARGET.value / PIPELINE_NAME\n",
 "output_dir.mkdir(parents=True, exist_ok=True)\n",
@@ -348,24 +350,24 @@
 "outputs": [],
 "source": [
 "if GENERATION_TARGET == GenerationTarget.LOCAL_FEAST_MLFLOW:\n",
-"    from customer_retention.generators.spec_generator import MLflowPipelineGenerator, MLflowConfig\n",
 "    from customer_retention.generators.pipeline_generator import PipelineGenerator\n",
-"    \n",
+"    from customer_retention.generators.spec_generator import MLflowConfig, MLflowPipelineGenerator\n",
+"\n",
 "    mlflow_config = MLflowConfig(\n",
 "        tracking_uri=\"./mlruns\",\n",
 "        experiment_name=PIPELINE_NAME,\n",
 "        log_data_quality=True,\n",
 "        nested_runs=True\n",
 "    )\n",
-"
+"\n",
 "    mlflow_gen = MLflowPipelineGenerator(mlflow_config=mlflow_config, output_dir=str(output_dir))\n",
-"
+"\n",
 "    if OUTPUT_FORMAT == OutputFormat.PYTHON:\n",
 "        saved = mlflow_gen.save_all(findings)\n",
 "        print(\"Generated MLflow pipeline files:\")\n",
 "        for f in saved:\n",
 "            print(f\" {f}\")\n",
-"
+"\n",
 "    pipeline_gen = PipelineGenerator(\n",
 "        findings_dir=str(FINDINGS_DIR),\n",
 "        output_dir=str(output_dir),\n",
@@ -421,7 +423,7 @@
 "source": [
 "if GENERATION_TARGET == GenerationTarget.DATABRICKS:\n",
 "    from customer_retention.generators.spec_generator import DatabricksSpecGenerator, PipelineSpec, SourceSpec\n",
-"
+"\n",
 "    spec = PipelineSpec(\n",
 "        name=PIPELINE_NAME,\n",
 "        version=\"1.0.0\",\n",
@@ -431,7 +433,7 @@
 "            format=findings.source_format\n",
 "        )]\n",
 "    )\n",
-"
+"\n",
 "    if findings.target_column:\n",
 "        from customer_retention.generators.spec_generator import ModelSpec\n",
 "        spec.model_config = ModelSpec(\n",
@@ -439,13 +441,13 @@
 "            model_type=\"gradient_boosting\",\n",
 "            target_column=findings.target_column\n",
 "        )\n",
-"
+"\n",
 "    db_gen = DatabricksSpecGenerator(\n",
 "        catalog=DATABRICKS_CATALOG,\n",
 "        schema=DATABRICKS_SCHEMA,\n",
 "        output_dir=str(output_dir)\n",
 "    )\n",
-"
+"\n",
 "    saved = db_gen.save_all(spec)\n",
 "    print(\"Generated Databricks artifacts:\")\n",
 "    for f in saved:\n",
@@ -495,15 +497,15 @@
 "source": [
 "if GENERATION_TARGET == GenerationTarget.LLM_DOCS:\n",
 "    from customer_retention.analysis.auto_explorer import RecommendationEngine\n",
-"
+"\n",
 "    recommender = RecommendationEngine()\n",
 "    target_rec = recommender.recommend_target(findings)\n",
 "    feature_recs = recommender.recommend_features(findings)\n",
 "    cleaning_recs = recommender.recommend_cleaning(findings)\n",
-"
+"\n",
 "    docs_dir = output_dir / \"docs\"\n",
 "    docs_dir.mkdir(parents=True, exist_ok=True)\n",
-"
+"\n",
 "    # 1. Overview\n",
 "    overview = f\"\"\"# {PIPELINE_NAME} Pipeline Overview\n",
 "\n",
@@ -526,7 +528,7 @@
 "    for name, col in list(findings.columns.items())[:20]:\n",
 "        overview += f\"| {name} | {col.inferred_type.value} | {col.null_percentage:.1f}% | {col.unique_count} |\\n\"\n",
 "    (docs_dir / \"01_overview.md\").write_text(overview)\n",
-"
+"\n",
 "    # 2. Bronze layer - separate file per source\n",
 "    if registry and registry.sources:\n",
 "        for source_name, bronze_recs in registry.sources.items():\n",
@@ -539,38 +541,38 @@
 "\"\"\"\n",
 "            for rec in bronze_recs.null_handling:\n",
 "                bronze_doc += f\"- `{rec.target_column}`: {rec.action} ({rec.parameters.get('strategy', '')}) - {rec.rationale}\\n\"\n",
-"
+"\n",
 "            bronze_doc += \"\\n## Outlier Handling\\n\"\n",
 "            for rec in bronze_recs.outlier_handling:\n",
 "                bronze_doc += f\"- `{rec.target_column}`: {rec.action} - {rec.rationale}\\n\"\n",
-"
+"\n",
 "            bronze_doc += \"\\n## Type Conversions\\n\"\n",
 "            for rec in bronze_recs.type_conversions:\n",
 "                bronze_doc += f\"- `{rec.target_column}`: {rec.action} - {rec.rationale}\\n\"\n",
-"
+"\n",
 "            bronze_doc += \"\\n## Deduplication\\n\"\n",
 "            for rec in bronze_recs.deduplication:\n",
 "                bronze_doc += f\"- `{rec.target_column}`: {rec.action} - {rec.rationale}\\n\"\n",
-"
+"\n",
 "            bronze_doc += \"\\n## Filtering\\n\"\n",
 "            for rec in bronze_recs.filtering:\n",
 "                bronze_doc += f\"- `{rec.target_column}`: {rec.action} - {rec.rationale}\\n\"\n",
-"
+"\n",
 "            bronze_doc += \"\\n## Text Processing\\n\"\n",
 "            for rec in bronze_recs.text_processing:\n",
 "                bronze_doc += f\"- `{rec.target_column}`: {rec.action} - {rec.rationale}\\n\"\n",
-"
+"\n",
 "            safe_name = source_name.replace(\" \", \"_\").lower()\n",
 "            (docs_dir / f\"02_bronze_cleaning_{safe_name}.md\").write_text(bronze_doc)\n",
 "    else:\n",
-"        bronze_doc =
+"        bronze_doc = \"\"\"# Bronze Layer - Data Cleaning\n",
 "\n",
 "## Cleaning Recommendations\n",
 "\"\"\"\n",
 "        for rec in cleaning_recs:\n",
 "            bronze_doc += f\"\\n### {rec.column_name}\\n- **Strategy**: {rec.strategy}\\n- **Severity**: {rec.severity}\\n- **Rationale**: {rec.rationale}\\n\"\n",
 "        (docs_dir / \"02_bronze_cleaning.md\").write_text(bronze_doc)\n",
-"
+"\n",
 "    # 3. Silver layer\n",
 "    silver_doc = \"\"\"# Silver Layer - Feature Engineering\n",
 "\n",
@@ -580,18 +582,18 @@
 "        silver_doc += \"\\n### Joins\\n\"\n",
 "        for rec in registry.silver.joins:\n",
 "            silver_doc += f\"- {rec.parameters.get('left_source', '')} ⟷ {rec.parameters.get('right_source', '')} on `{rec.parameters.get('join_keys', [])}`\\n\"\n",
-"
+"\n",
 "        silver_doc += \"\\n### Aggregations\\n\"\n",
 "        for rec in registry.silver.aggregations:\n",
 "            silver_doc += f\"- `{rec.target_column}`: {rec.action} - windows: {rec.parameters.get('windows', [])}\\n\"\n",
-"
+"\n",
 "        silver_doc += \"\\n### Derived Columns\\n\"\n",
 "        for rec in registry.silver.derived_columns:\n",
 "            silver_doc += f\"- `{rec.target_column}`: {rec.parameters.get('expression', rec.action)}\\n\"\n",
 "    else:\n",
 "        silver_doc += \"\\nNo silver-layer recommendations found.\\n\"\n",
 "    (docs_dir / \"03_silver_features.md\").write_text(silver_doc)\n",
-"
+"\n",
 "    # 4. Gold layer\n",
 "    gold_doc = \"\"\"# Gold Layer - ML Features\n",
 "\n",
@@ -599,25 +601,25 @@
 "\"\"\"\n",
 "    for rec in feature_recs[:15]:\n",
 "        gold_doc += f\"\\n### {rec.feature_name}\\n- **Source**: {rec.source_column}\\n- **Type**: {rec.feature_type}\\n- **Description**: {rec.description}\\n\"\n",
-"
+"\n",
 "    if registry and registry.gold:\n",
 "        gold_doc += \"\\n## Encoding\\n\"\n",
 "        for rec in registry.gold.encoding:\n",
 "            gold_doc += f\"- `{rec.target_column}`: {rec.parameters.get('method', rec.action)}\\n\"\n",
-"
+"\n",
 "        gold_doc += \"\\n## Scaling\\n\"\n",
 "        for rec in registry.gold.scaling:\n",
 "            gold_doc += f\"- `{rec.target_column}`: {rec.parameters.get('method', rec.action)}\\n\"\n",
-"
+"\n",
 "        gold_doc += \"\\n## Feature Selection\\n\"\n",
 "        for rec in registry.gold.feature_selection:\n",
 "            gold_doc += f\"- `{rec.target_column}`: {rec.action} - {rec.rationale}\\n\"\n",
-"
+"\n",
 "        gold_doc += \"\\n## Transformations\\n\"\n",
 "        for rec in registry.gold.transformations:\n",
 "            gold_doc += f\"- `{rec.target_column}`: {rec.action} - {rec.parameters}\\n\"\n",
 "    (docs_dir / \"04_gold_ml_features.md\").write_text(gold_doc)\n",
-"
+"\n",
 "    # 5. Training\n",
 "    training_doc = f\"\"\"# Model Training\n",
 "\n",
@@ -636,7 +638,7 @@
 "- F1 Score\n",
 "\"\"\"\n",
 "    (docs_dir / \"05_training.md\").write_text(training_doc)\n",
-"
+"\n",
 "    print(\"Generated LLM documentation:\")\n",
 "    for f in sorted(docs_dir.glob(\"*.md\")):\n",
 "        print(f\" {f.name}\")\n",
@@ -687,11 +689,12 @@
 "source": [
 "import json\n",
 "\n",
+"\n",
 "def py_to_notebook(py_path: Path):\n",
 "    content = py_path.read_text()\n",
 "    cells = []\n",
 "    current_lines = []\n",
-"
+"\n",
 "    for line in content.split(\"\\n\"):\n",
 "        if line.startswith(\"# %% \") or line.startswith(\"# %%\\n\"):\n",
 "            if current_lines:\n",
@@ -702,16 +705,16 @@
 "            cells.append({\"cell_type\": \"markdown\", \"metadata\": {}, \"source\": [f\"## {title}\"]})\n",
 "        else:\n",
 "            current_lines.append(line + \"\\n\")\n",
-"
+"\n",
 "    if current_lines:\n",
 "        cells.append({\"cell_type\": \"code\", \"metadata\": {}, \"source\": current_lines, \"outputs\": [], \"execution_count\": None})\n",
-"
+"\n",
 "    notebook = {\n",
 "        \"cells\": cells,\n",
 "        \"metadata\": {\"kernelspec\": {\"display_name\": \"Python 3\", \"language\": \"python\", \"name\": \"python3\"}},\n",
 "        \"nbformat\": 4, \"nbformat_minor\": 4\n",
 "    }\n",
-"
+"\n",
 "    out_path = py_path.with_suffix(\".ipynb\")\n",
 "    out_path.write_text(json.dumps(notebook, indent=1))\n",
 "    return out_path\n",
@@ -783,12 +786,12 @@
 "        print(\"Pipeline will run Bronze → Silver → Gold → Training...\")\n",
 "        subprocess.run([\"python\", \"pipeline_runner.py\"], cwd=str(output_dir.resolve()))\n",
 "    else:\n",
-"        print(
+"        print(\"pipeline_runner.py not found. Generate first by running cells above.\")\n",
 "else:\n",
 "    print(\"To run the complete pipeline:\")\n",
 "    print(f\"\\n cd {output_dir}\")\n",
-"    print(
-"    print(
+"    print(\" python pipeline_runner.py\")\n",
+"    print(\"\\nThis will:\")\n",
 "    print(\" 1. Run Landing layers (event sources)\")\n",
 "    print(\" 2. Run Bronze layers (parallel)\")\n",
 "    print(\" 3. Run Silver merge\")\n",
@@ -916,7 +919,7 @@
 "    print(f\" - Scalings: {len(registry.gold.scaling) if registry.gold else 0}\")\n",
 "    print(f\" - Transformations: {len(registry.gold.transformations) if registry.gold else 0}\")\n",
 "    print(f\" - Feature selections: {len(registry.gold.feature_selection) if registry.gold else 0}\")\n",
-"
+"\n",
 "    # Show what's in each layer for debugging\n",
 "    print()\n",
 "    print(\"Recommendations by layer:\")\n",
@@ -928,13 +931,13 @@
 "            print(f\" - [{rec.category}] {rec.target_column}: {rec.action}\")\n",
 "        if len(recs) > 3:\n",
 "            print(f\" ... and {len(recs) - 3} more\")\n",
-"
+"\n",
 "    # Check if gold layer exists but is empty\n",
 "    if registry.gold:\n",
 "        print(f\"\\n✓ Gold layer initialized (target: {registry.gold.target_column})\")\n",
 "    else:\n",
 "        print(\"\\n⚠ Gold layer not initialized - run step 06 first\")\n",
-"
+"\n",
 "    print()\n",
 "    print(\"Use this hash to:\")\n",
 "    print(\" - Track MLflow experiments (tag: recommendations_hash)\")\n",
@@ -990,6 +993,7 @@
 "source": [
 "# Inspect Feast Feature Store contents\n",
 "import warnings\n",
+"\n",
 "warnings.filterwarnings(\"ignore\", category=DeprecationWarning, module=\"feast\")\n",
 "\n",
 "feast_repo_path = output_dir / \"feature_repo\"\n",
@@ -998,15 +1002,15 @@
 "    try:\n",
 "        from feast import FeatureStore\n",
 "        store = FeatureStore(repo_path=str(feast_repo_path))\n",
-"
+"\n",
 "        print(\"Feast Feature Store Contents\")\n",
 "        print(\"=\" * 60)\n",
-"
+"\n",
 "        # List entities\n",
 "        entities = store.list_entities()\n",
 "        feature_views = store.list_feature_views()\n",
 "        data_sources = store.list_data_sources()\n",
-"
+"\n",
 "        # Check if registry is empty (feast apply not run yet)\n",
 "        if not entities and not feature_views:\n",
 "            print(\"\\n⚠️ Feature store registry is empty.\")\n",
@@ -1021,7 +1025,7 @@
 "        print(f\"\\n📦 Entities ({len(entities)}):\")\n",
 "        for entity in entities:\n",
 "            print(f\" - {entity.name} (join_key: {entity.join_keys})\")\n",
-"
+"\n",
 "        print(f\"\\n📊 Feature Views ({len(feature_views)}):\")\n",
 "        for fv in feature_views:\n",
 "            print(f\" - {fv.name}: {len(fv.features)} features\")\n",
@@ -1029,13 +1033,13 @@
 "                print(f\" • {feat.name} ({feat.dtype})\")\n",
 "            if len(fv.features) > 5:\n",
 "                print(f\" ... and {len(fv.features) - 5} more\")\n",
-"
+"\n",
 "        print(f\"\\n💾 Data Sources ({len(data_sources)}):\")\n",
 "        for ds in data_sources:\n",
 "            print(f\" - {ds.name}\")\n",
-"
+"\n",
 "        # Try to show sample data from parquet files\n",
-"        print(
+"        print(\"\\n📄 Sample Feature Data:\")\n",
 "        data_dir = feast_repo_path / \"data\"\n",
 "        if data_dir.exists():\n",
 "            parquet_files = list(data_dir.glob(\"*.parquet\"))\n",
@@ -1043,14 +1047,14 @@
 "                sample_df = pd.read_parquet(parquet_files[0])\n",
 "                print(f\" Source: {parquet_files[0].name}\")\n",
 "                print(f\" Shape: {sample_df.shape[0]:,} rows x {sample_df.shape[1]} columns\")\n",
-"                print(
+"                print(\"\\n Head (first 5 rows):\")\n",
 "                display(sample_df.head())\n",
 "            else:\n",
 "                print(\" No parquet files found yet in data/ directory.\")\n",
 "                print(\" Features will be materialized when you run the pipeline.\")\n",
 "        else:\n",
 "            print(\" Data directory not created yet.\")\n",
-"
+"\n",
 "    except ImportError:\n",
 "        print(\"Feast not installed. Install with: pip install feast\")\n",
 "    except Exception as e:\n",
{churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb

@@ -54,12 +54,11 @@
 "outputs": [],
 "source": [
 "from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
+"\n",
 "track_and_export_previous(\"11_scoring_validation.ipynb\")\n",
 "\n",
 "import sys\n",
-"from pathlib import Path\n"
-"\n",
-"from customer_retention.core.config.experiments import EXPERIMENTS_DIR, FINDINGS_DIR"
+"from pathlib import Path\n"
 ]
 },
 {
@@ -95,10 +94,19 @@
 "sys.path.insert(0, str(PIPELINE_DIR))\n",
 "\n",
 "from config import (\n",
-"
-"
-"
-"
+"    ARTIFACTS_PATH,\n",
+"    FEAST_ENTITY_KEY,\n",
+"    FEAST_FEATURE_VIEW,\n",
+"    FEAST_REPO_PATH,\n",
+"    FEAST_TIMESTAMP_COL,\n",
+"    MLFLOW_TRACKING_URI,\n",
+"    PIPELINE_NAME,\n",
+"    PRODUCTION_DIR,\n",
+"    RECOMMENDATIONS_HASH,\n",
+"    TARGET_COLUMN,\n",
+")\n",
+"from config import (\n",
+"    EXPERIMENTS_DIR as GEN_EXPERIMENTS_DIR,\n",
 ")\n",
 "\n",
 "print(f\"Pipeline: {PIPELINE_NAME}\")\n",
@@ -146,18 +154,15 @@
 },
 "outputs": [],
 "source": [
-"import numpy as np\n",
-"import pandas as pd\n",
 "import mlflow\n",
 "import mlflow.sklearn\n",
 "import mlflow.xgboost\n",
+"import numpy as np\n",
+"import pandas as pd\n",
 "import xgboost as xgb\n",
 "from feast import FeatureStore\n",
-"
-"from customer_retention.
-"    PipelineTransformationType, TransformationStep,\n",
-")\n",
-"from config import EXCLUDED_SOURCES\n",
+"\n",
+"from customer_retention.transforms import ArtifactStore, TransformExecutor\n",
 "\n",
 "_registry = ArtifactStore.from_manifest(Path(ARTIFACTS_PATH) / \"manifest.yaml\")\n",
 "_executor = TransformExecutor()\n",
@@ -279,7 +284,7 @@
 "y_pred = (y_proba >= 0.5).astype(int)\n",
 "\n",
 "# --- Metrics ---\n",
-"from sklearn.metrics import
+"from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score\n",
 "\n",
 "metrics = {\n",
 "    \"accuracy\": accuracy_score(y_true, y_pred),\n",
@@ -346,8 +351,12 @@
 "outputs": [],
 "source": [
 "from sklearn.metrics import (\n",
-"    accuracy_score
-"
+"    accuracy_score,\n",
+"    confusion_matrix,\n",
+"    f1_score,\n",
+"    precision_score,\n",
+"    recall_score,\n",
+"    roc_auc_score,\n",
 ")\n",
 "\n",
 "y_true = predictions_df[\"actual\"]\n",
@@ -367,7 +376,7 @@
 "    print(f\" {name}: {value:.4f}\")\n",
 "\n",
 "cm = confusion_matrix(y_true, y_pred)\n",
-"print(
+"print(\"\\nConfusion Matrix:\")\n",
 "print(f\" TN={cm[0,0]:,} FP={cm[0,1]:,}\")\n",
 "print(f\" FN={cm[1,0]:,} TP={cm[1,1]:,}\")"
@@ -467,12 +476,18 @@
 },
 "outputs": [],
 "source": [
+"from IPython.display import display\n",
 "from sklearn.metrics import (\n",
-"
-"
-"
+"    accuracy_score,\n",
+"    average_precision_score,\n",
+"    confusion_matrix,\n",
+"    f1_score,\n",
+"    precision_recall_curve,\n",
+"    precision_score,\n",
+"    recall_score,\n",
+"    roc_auc_score,\n",
+"    roc_curve,\n",
 ")\n",
-"from IPython.display import display, HTML\n",
 "\n",
 "mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)\n",
 "client = mlflow.tracking.MlflowClient()\n",
@@ -1059,7 +1074,7 @@
 ")\n",
 "\n",
 "print(f\"Customer browser ready with {len(browser_df):,} records\")\n",
-"print(
+"print(\"\\nPrediction Distribution:\")\n",
 "print(f\" Predicted Positive: {(browser_df['prediction'] == 1).sum():,}\")\n",
 "print(f\" Predicted Negative: {(browser_df['prediction'] == 0).sum():,}\")\n",
 "print(f\"\\nCorrect Predictions: {browser_df['correct'].sum():,}/{len(browser_df):,} ({browser_df['correct'].mean():.1%})\")"
{churnkit-0.76.1a1.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb

@@ -44,6 +44,7 @@
 "outputs": [],
 "source": [
 "from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
+"\n",
 "track_and_export_previous(\"12_view_documentation.ipynb\")"
 ]
 },
@@ -70,8 +71,9 @@
 "outputs": [],
 "source": [
 "from pathlib import Path\n",
-"
+"\n",
 "from customer_retention.analysis.notebook_html_exporter import check_exported_html\n",
+"from customer_retention.core.config.experiments import get_notebook_experiments_dir\n",
 "\n",
 "docs_dir = get_notebook_experiments_dir() / \"docs\"\n",
 "notebook_dir = Path(\"exploration_notebooks\")\n",
{churnkit-0.76.1a1.dist-info → churnkit-0.76.1a2.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: churnkit
-Version: 0.76.1a1
+Version: 0.76.1a2
 Summary: Structured ML framework for customer churn prediction -- from exploration notebooks to production pipelines, locally or on Databricks.
 Project-URL: Homepage, https://github.com/aladjov/CR
 Project-URL: Documentation, https://github.com/aladjov/CR/wiki
|