churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,854 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "cell-0",
6
+ "metadata": {
7
+ "papermill": {
8
+ "duration": 0.008146,
9
+ "end_time": "2026-02-02T01:47:23.371490",
10
+ "exception": false,
11
+ "start_time": "2026-02-02T01:47:23.363344",
12
+ "status": "completed"
13
+ },
14
+ "tags": []
15
+ },
16
+ "source": [
17
+ "# Chapter 2a: Text Columns Deep Dive\n",
18
+ "\n",
19
+ "**Purpose:** Transform TEXT columns (tickets, emails, messages) into numeric features using embeddings and dimensionality reduction.\n",
20
+ "\n",
21
+ "**When to use this notebook:**\n",
22
+ "- Your dataset contains TEXT columns (unstructured text data)\n",
23
+ "- This is detected automatically: the notebook applies when any column in the findings has type ColumnType.TEXT\n",
24
+ "\n",
25
+ "**What you'll learn:**\n",
26
+ "- How text embeddings capture semantic meaning\n",
27
+ "- Why PCA reduces dimensions while preserving variance\n",
28
+ "- How to choose between fast and high-quality embedding models\n",
29
+ "\n",
30
+ "**Outputs:**\n",
31
+ "- PC features (text_pc1, text_pc2, ...) for each TEXT column\n",
32
+ "- TextProcessingMetadata in findings\n",
33
+ "- Recommendations for production pipeline\n",
34
+ "\n",
35
+ "---\n",
36
+ "\n",
37
+ "## Two Approaches to Text Feature Engineering\n",
38
+ "\n",
39
+ "| Approach | Method | When to Use |\n",
40
+ "|----------|--------|-------------|\n",
41
+ "| **1. Embeddings + PCA** (This notebook) | Sentence-transformers → PCA | General semantic features |\n",
42
+ "| **2. LLM Labeling** (Future) | LLM on samples → Train classifier | Specific categories needed |\n",
43
+ "\n",
44
+ "### Approach 1: Embeddings + Dimensionality Reduction (Current)\n",
45
+ "\n",
46
+ "```\n",
47
+ "TEXT Column → Embeddings → PCA → pc1, pc2, ..., pcN\n",
48
+ "```\n",
49
+ "\n",
50
+ "- **Embeddings**: Dense vectors capturing semantic meaning (similar texts = similar vectors)\n",
51
+ "- **PCA**: Reduces the embeddings to the N components needed to cover the target variance (default 95%)\n",
52
+ "- **Output**: Numeric features usable with standard ML models\n",
53
+ "\n",
54
+ "### Embedding Model Options\n",
55
+ "\n",
56
+ "| Model | Size | Embedding Dim | Speed | Quality | Best For |\n",
57
+ "|-------|------|---------------|-------|---------|----------|\n",
58
+ "| **MiniLM** (default) | 90 MB | 384 | Fast | Good | CPU, quick iteration, small datasets |\n",
59
+ "| **Qwen3-0.6B** | 1.2 GB | 1024 | Medium | Better | GPU available, production quality |\n",
60
+ "| **Qwen3-4B** | 8 GB | 2560 | Slow | High | 16GB+ GPU, multilingual, high accuracy |\n",
61
+ "| **Qwen3-8B** | 16 GB | 4096 | Slowest | Highest | 32GB+ GPU, research, max quality |\n",
62
+ "\n",
63
+ "**Note:** Models are downloaded on first use (lazy loading). Qwen3 models require a GPU for reasonable performance.\n",
64
+ "\n",
65
+ "### Approach 2: LLM Labeling (Future Enhancement)\n",
66
+ "\n",
67
+ "```\n",
68
+ "TEXT Column → Sample → LLM Labels → Train Classifier → Apply to All\n",
69
+ "```\n",
70
+ "\n",
71
+ "- Use when you need specific categorical labels (sentiment, topic, intent)\n",
72
+ "- More expensive but more interpretable"
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "markdown",
77
+ "id": "cell-1",
78
+ "metadata": {
79
+ "papermill": {
80
+ "duration": 0.005848,
81
+ "end_time": "2026-02-02T01:47:23.384304",
82
+ "exception": false,
83
+ "start_time": "2026-02-02T01:47:23.378456",
84
+ "status": "completed"
85
+ },
86
+ "tags": []
87
+ },
88
+ "source": [
89
+ "## 2a.1 Load Previous Findings"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": null,
95
+ "id": "cell-2",
96
+ "metadata": {
97
+ "execution": {
98
+ "iopub.execute_input": "2026-02-02T01:47:23.396604Z",
99
+ "iopub.status.busy": "2026-02-02T01:47:23.396464Z",
100
+ "iopub.status.idle": "2026-02-02T01:47:25.750830Z",
101
+ "shell.execute_reply": "2026-02-02T01:47:25.750340Z"
102
+ },
103
+ "papermill": {
104
+ "duration": 2.361354,
105
+ "end_time": "2026-02-02T01:47:25.751648",
106
+ "exception": false,
107
+ "start_time": "2026-02-02T01:47:23.390294",
108
+ "status": "completed"
109
+ },
110
+ "tags": []
111
+ },
112
+ "outputs": [],
113
+ "source": [
114
+ "from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
115
+ "track_and_export_previous(\"02a_text_columns_deep_dive.ipynb\")\n",
116
+ "\n",
117
+ "from customer_retention.analysis.auto_explorer import ExplorationFindings, TextProcessingMetadata\n",
118
+ "from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table, console\n",
119
+ "from customer_retention.core.config.column_config import ColumnType\n",
120
+ "from customer_retention.stages.profiling import (\n",
121
+ " TextColumnProcessor, TextProcessingConfig, TextColumnResult,\n",
122
+ " TextEmbedder, TextDimensionalityReducer,\n",
123
+ " EMBEDDING_MODELS, get_model_info, list_available_models\n",
124
+ ")\n",
125
+ "import pandas as pd\n",
126
+ "import numpy as np\n",
127
+ "import plotly.graph_objects as go\n",
128
+ "import plotly.express as px\n",
129
+ "from plotly.subplots import make_subplots\n",
130
+ "from customer_retention.core.config.experiments import FINDINGS_DIR, EXPERIMENTS_DIR, OUTPUT_DIR, setup_experiments_structure"
131
+ ]
132
+ },
133
+ {
134
+ "cell_type": "code",
135
+ "execution_count": null,
136
+ "id": "cell-3",
137
+ "metadata": {
138
+ "execution": {
139
+ "iopub.execute_input": "2026-02-02T01:47:25.761974Z",
140
+ "iopub.status.busy": "2026-02-02T01:47:25.761777Z",
141
+ "iopub.status.idle": "2026-02-02T01:47:25.956580Z",
142
+ "shell.execute_reply": "2026-02-02T01:47:25.955962Z"
143
+ },
144
+ "papermill": {
145
+ "duration": 0.20067,
146
+ "end_time": "2026-02-02T01:47:25.957216",
147
+ "exception": false,
148
+ "start_time": "2026-02-02T01:47:25.756546",
149
+ "status": "completed"
150
+ },
151
+ "tags": []
152
+ },
153
+ "outputs": [],
154
+ "source": [
155
+ "# === CONFIGURATION ===\n",
156
+ "from pathlib import Path\n",
157
+ "\n",
158
+ "# FINDINGS_DIR imported from customer_retention.core.config.experiments\n",
159
+ "\n",
160
+ "findings_files = [f for f in FINDINGS_DIR.glob(\"*_findings.yaml\") if \"multi_dataset\" not in f.name]\n",
161
+ "if not findings_files:\n",
162
+ " raise FileNotFoundError(f\"No findings files found in {FINDINGS_DIR}. Run notebook 01 first.\")\n",
163
+ "\n",
164
+ "findings_files.sort(key=lambda f: f.stat().st_mtime, reverse=True)\n",
165
+ "FINDINGS_PATH = str(findings_files[0])\n",
166
+ "\n",
167
+ "print(f\"Found {len(findings_files)} findings file(s)\")\n",
168
+ "print(f\"Using: {FINDINGS_PATH}\")\n",
169
+ "\n",
170
+ "findings = ExplorationFindings.load(FINDINGS_PATH)\n",
171
+ "print(f\"\\nLoaded findings for {findings.column_count} columns from {findings.source_path}\")"
172
+ ]
173
+ },
174
+ {
175
+ "cell_type": "code",
176
+ "execution_count": null,
177
+ "id": "cell-4",
178
+ "metadata": {
179
+ "execution": {
180
+ "iopub.execute_input": "2026-02-02T01:47:25.967854Z",
181
+ "iopub.status.busy": "2026-02-02T01:47:25.967561Z",
182
+ "iopub.status.idle": "2026-02-02T01:47:25.970428Z",
183
+ "shell.execute_reply": "2026-02-02T01:47:25.969940Z"
184
+ },
185
+ "papermill": {
186
+ "duration": 0.009088,
187
+ "end_time": "2026-02-02T01:47:25.971120",
188
+ "exception": false,
189
+ "start_time": "2026-02-02T01:47:25.962032",
190
+ "status": "completed"
191
+ },
192
+ "tags": []
193
+ },
194
+ "outputs": [],
195
+ "source": [
196
+ "# Identify TEXT columns\n",
197
+ "text_columns = [\n",
198
+ " name for name, col in findings.columns.items()\n",
199
+ " if col.inferred_type == ColumnType.TEXT\n",
200
+ "]\n",
201
+ "\n",
202
+ "if not text_columns:\n",
203
+ " print(\"\\u26a0\\ufe0f No TEXT columns detected in this dataset.\")\n",
204
+ " print(\" This notebook is only needed when TEXT columns are present.\")\n",
205
+ " print(\" Continue to notebook 03_quality_assessment.ipynb\")\n",
206
+ "else:\n",
207
+ " print(f\"\\u2705 Found {len(text_columns)} TEXT column(s):\")\n",
208
+ " for col in text_columns:\n",
209
+ " col_info = findings.columns[col]\n",
210
+ " print(f\" - {col} (Confidence: {col_info.confidence:.0%})\")"
211
+ ]
212
+ },
213
+ {
214
+ "cell_type": "markdown",
215
+ "id": "cell-5",
216
+ "metadata": {
217
+ "papermill": {
218
+ "duration": 0.004835,
219
+ "end_time": "2026-02-02T01:47:25.980918",
220
+ "exception": false,
221
+ "start_time": "2026-02-02T01:47:25.976083",
222
+ "status": "completed"
223
+ },
224
+ "tags": []
225
+ },
226
+ "source": [
227
+ "## 2a.2 Load Source Data"
228
+ ]
229
+ },
230
+ {
231
+ "cell_type": "code",
232
+ "execution_count": null,
233
+ "id": "cell-6",
234
+ "metadata": {
235
+ "execution": {
236
+ "iopub.execute_input": "2026-02-02T01:47:25.991285Z",
237
+ "iopub.status.busy": "2026-02-02T01:47:25.991167Z",
238
+ "iopub.status.idle": "2026-02-02T01:47:26.241129Z",
239
+ "shell.execute_reply": "2026-02-02T01:47:26.240166Z"
240
+ },
241
+ "papermill": {
242
+ "duration": 0.259568,
243
+ "end_time": "2026-02-02T01:47:26.245361",
244
+ "exception": false,
245
+ "start_time": "2026-02-02T01:47:25.985793",
246
+ "status": "completed"
247
+ },
248
+ "tags": []
249
+ },
250
+ "outputs": [],
251
+ "source": [
252
+ "from customer_retention.stages.temporal import load_data_with_snapshot_preference, TEMPORAL_METADATA_COLS\n",
253
+ "\n",
254
+ "df, data_source = load_data_with_snapshot_preference(findings, output_dir=str(FINDINGS_DIR))\n",
255
+ "charts = ChartBuilder()\n",
256
+ "\n",
257
+ "print(f\"Loaded {len(df):,} rows x {len(df.columns)} columns\")\n",
258
+ "print(f\"Data source: {data_source}\")"
259
+ ]
260
+ },
261
+ {
262
+ "cell_type": "markdown",
263
+ "id": "cell-7",
264
+ "metadata": {
265
+ "papermill": {
266
+ "duration": 0.005765,
267
+ "end_time": "2026-02-02T01:47:26.256952",
268
+ "exception": false,
269
+ "start_time": "2026-02-02T01:47:26.251187",
270
+ "status": "completed"
271
+ },
272
+ "tags": []
273
+ },
274
+ "source": [
275
+ "## 2a.3 Configuration\n",
276
+ "\n",
277
+ "### Available Embedding Models\n",
278
+ "\n",
279
+ "Run the cell below to see the available models and their specifications, then set `EMBEDDING_PRESET` in the configuration cell that follows."
280
+ ]
281
+ },
282
+ {
283
+ "cell_type": "code",
284
+ "execution_count": null,
285
+ "id": "cell-8",
286
+ "metadata": {
287
+ "execution": {
288
+ "iopub.execute_input": "2026-02-02T01:47:26.269516Z",
289
+ "iopub.status.busy": "2026-02-02T01:47:26.269372Z",
290
+ "iopub.status.idle": "2026-02-02T01:47:26.273517Z",
291
+ "shell.execute_reply": "2026-02-02T01:47:26.272670Z"
292
+ },
293
+ "papermill": {
294
+ "duration": 0.011181,
295
+ "end_time": "2026-02-02T01:47:26.274532",
296
+ "exception": false,
297
+ "start_time": "2026-02-02T01:47:26.263351",
298
+ "status": "completed"
299
+ },
300
+ "tags": []
301
+ },
302
+ "outputs": [],
303
+ "source": [
304
+ "# Display available embedding models\n",
305
+ "print(\"Available Embedding Models\")\n",
306
+ "print(\"=\" * 80)\n",
307
+ "print(f\"{'Preset':<15} {'Model':<35} {'Size':<10} {'Dim':<8} {'GPU?'}\")\n",
308
+ "print(\"-\" * 80)\n",
309
+ "\n",
310
+ "for preset in list_available_models():\n",
311
+ " info = get_model_info(preset)\n",
312
+ " size = f\"{info['size_mb']} MB\" if info['size_mb'] < 1000 else f\"{info['size_mb']/1000:.1f} GB\"\n",
313
+ " gpu = \"Yes\" if info['gpu_recommended'] else \"No\"\n",
314
+ " print(f\"{preset:<15} {info['model_name']:<35} {size:<10} {info['embedding_dim']:<8} {gpu}\")\n",
315
+ " print(f\" {info['description']}\")\n",
316
+ " print()\n",
317
+ "\n",
318
+ "print(\"\\nModels are downloaded on first use. Choose based on your hardware and quality needs.\")"
319
+ ]
320
+ },
321
+ {
322
+ "cell_type": "code",
323
+ "execution_count": null,
324
+ "id": "em1rk2p1n0m",
325
+ "metadata": {
326
+ "execution": {
327
+ "iopub.execute_input": "2026-02-02T01:47:26.288846Z",
328
+ "iopub.status.busy": "2026-02-02T01:47:26.288653Z",
329
+ "iopub.status.idle": "2026-02-02T01:47:26.292398Z",
330
+ "shell.execute_reply": "2026-02-02T01:47:26.291943Z"
331
+ },
332
+ "papermill": {
333
+ "duration": 0.011236,
334
+ "end_time": "2026-02-02T01:47:26.293108",
335
+ "exception": false,
336
+ "start_time": "2026-02-02T01:47:26.281872",
337
+ "status": "completed"
338
+ },
339
+ "tags": []
340
+ },
341
+ "outputs": [],
342
+ "source": [
343
+ "# === TEXT PROCESSING CONFIGURATION ===\n",
344
+ "# Choose your embedding model preset:\n",
345
+ "# \"minilm\" - Fast, CPU-friendly, good for exploration (default)\n",
346
+ "# \"qwen3-0.6b\" - Better quality, needs GPU\n",
347
+ "# \"qwen3-4b\" - High quality, needs 16GB+ GPU\n",
348
+ "# \"qwen3-8b\" - Highest quality, needs 32GB+ GPU\n",
349
+ "\n",
350
+ "EMBEDDING_PRESET = \"minilm\" # Change this to try different models\n",
351
+ "\n",
352
+ "# PCA configuration\n",
353
+ "VARIANCE_THRESHOLD = 0.95 # Keep components explaining 95% of variance\n",
354
+ "MIN_COMPONENTS = 2 # At least 2 features per text column\n",
355
+ "MAX_COMPONENTS = None # No upper limit (set to e.g., 20 to cap)\n",
356
+ "\n",
357
+ "# Get model info and create config\n",
358
+ "model_info = get_model_info(EMBEDDING_PRESET)\n",
359
+ "config = TextProcessingConfig(\n",
360
+ " embedding_model=model_info[\"model_name\"],\n",
361
+ " variance_threshold=VARIANCE_THRESHOLD,\n",
362
+ " max_components=MAX_COMPONENTS,\n",
363
+ " min_components=MIN_COMPONENTS,\n",
364
+ " batch_size=32\n",
365
+ ")\n",
366
+ "\n",
367
+ "print(\"Text Processing Configuration\")\n",
368
+ "print(\"=\" * 50)\n",
369
+ "print(f\" Preset: {EMBEDDING_PRESET}\")\n",
370
+ "print(f\" Model: {config.embedding_model}\")\n",
371
+ "print(f\" Model size: {model_info['size_mb']} MB\")\n",
372
+ "print(f\" Embedding dimension: {model_info['embedding_dim']}\")\n",
373
+ "print(f\" GPU recommended: {'Yes' if model_info['gpu_recommended'] else 'No'}\")\n",
374
+ "print()\n",
375
+ "print(f\" Variance threshold: {config.variance_threshold:.0%}\")\n",
376
+ "print(f\" Min components: {config.min_components}\")\n",
377
+ "print(f\" Max components: {config.max_components or 'unlimited'}\")\n",
378
+ "\n",
379
+ "if model_info['gpu_recommended']:\n",
380
+ " print()\n",
381
+ " print(\"Note: This model works best with GPU. Processing may be slow on CPU.\")"
382
+ ]
383
+ },
384
+ {
385
+ "cell_type": "markdown",
386
+ "id": "cell-9",
387
+ "metadata": {
388
+ "papermill": {
389
+ "duration": 0.005323,
390
+ "end_time": "2026-02-02T01:47:26.304338",
391
+ "exception": false,
392
+ "start_time": "2026-02-02T01:47:26.299015",
393
+ "status": "completed"
394
+ },
395
+ "tags": []
396
+ },
397
+ "source": [
398
+ "## 2a.4 Text Column Analysis\n",
399
+ "\n",
400
+ "Before processing, let's understand each TEXT column."
401
+ ]
402
+ },
403
+ {
404
+ "cell_type": "code",
405
+ "execution_count": null,
406
+ "id": "cell-10",
407
+ "metadata": {
408
+ "execution": {
409
+ "iopub.execute_input": "2026-02-02T01:47:26.316494Z",
410
+ "iopub.status.busy": "2026-02-02T01:47:26.316369Z",
411
+ "iopub.status.idle": "2026-02-02T01:47:26.320318Z",
412
+ "shell.execute_reply": "2026-02-02T01:47:26.319793Z"
413
+ },
414
+ "papermill": {
415
+ "duration": 0.011045,
416
+ "end_time": "2026-02-02T01:47:26.321049",
417
+ "exception": false,
418
+ "start_time": "2026-02-02T01:47:26.310004",
419
+ "status": "completed"
420
+ },
421
+ "tags": []
422
+ },
423
+ "outputs": [],
424
+ "source": [
425
+ "if text_columns:\n",
426
+ " for col_name in text_columns:\n",
427
+ " print(f\"\\n{'='*70}\")\n",
428
+ " print(f\"Column: {col_name}\")\n",
429
+ " print(f\"{'='*70}\")\n",
430
+ " \n",
431
+ " text_series = df[col_name].fillna(\"\")\n",
432
+ " \n",
433
+ " # Basic statistics\n",
434
+ " non_empty = (text_series.str.len() > 0).sum()\n",
435
+ " avg_length = text_series.str.len().mean()\n",
436
+ " max_length = text_series.str.len().max()\n",
437
+ " \n",
438
+ " print(f\"\\n\\U0001f4ca Statistics:\")\n",
439
+ " print(f\" Total rows: {len(text_series):,}\")\n",
440
+ " print(f\" Non-empty: {non_empty:,} ({non_empty/len(text_series)*100:.1f}%)\")\n",
441
+ " print(f\" Avg length: {avg_length:.0f} characters\")\n",
442
+ " print(f\" Max length: {max_length:,} characters\")\n",
443
+ " \n",
444
+ " # Sample texts\n",
445
+ " print(f\"\\n\\U0001f4dd Sample texts:\")\n",
446
+ " samples = text_series[text_series.str.len() > 10].head(3)\n",
447
+ " for i, sample in enumerate(samples, 1):\n",
448
+ " truncated = sample[:100] + \"...\" if len(sample) > 100 else sample\n",
449
+ " print(f\" {i}. {truncated}\")\n",
450
+ " \n",
451
+ " # Text length distribution\n",
452
+ " lengths = text_series.str.len()\n",
453
+ " fig = go.Figure()\n",
454
+ " fig.add_trace(go.Histogram(x=lengths[lengths > 0], nbinsx=50,\n",
455
+ " marker_color='steelblue', opacity=0.7))\n",
456
+ " fig.add_vline(x=lengths.median(), line_dash=\"solid\", line_color=\"green\",\n",
457
+ " annotation_text=f\"Median: {lengths.median():.0f}\")\n",
458
+ " fig.update_layout(\n",
459
+ " title=f\"Text Length Distribution: {col_name}\",\n",
460
+ " xaxis_title=\"Character Count\",\n",
461
+ " yaxis_title=\"Frequency\",\n",
462
+ " template=\"plotly_white\",\n",
463
+ " height=350\n",
464
+ " )\n",
465
+ " display_figure(fig)"
466
+ ]
467
+ },
468
+ {
469
+ "cell_type": "markdown",
470
+ "id": "cell-11",
471
+ "metadata": {
472
+ "papermill": {
473
+ "duration": 0.005005,
474
+ "end_time": "2026-02-02T01:47:26.331524",
475
+ "exception": false,
476
+ "start_time": "2026-02-02T01:47:26.326519",
477
+ "status": "completed"
478
+ },
479
+ "tags": []
480
+ },
481
+ "source": [
482
+ "## 2a.5 Process Text Columns\n",
483
+ "\n",
484
+ "This step:\n",
485
+ "1. Generates embeddings using sentence-transformers\n",
486
+ "2. Applies PCA to reduce dimensions\n",
487
+ "3. Creates PC feature columns"
488
+ ]
489
+ },
490
+ {
491
+ "cell_type": "code",
492
+ "execution_count": null,
493
+ "id": "cell-12",
494
+ "metadata": {
495
+ "execution": {
496
+ "iopub.execute_input": "2026-02-02T01:47:26.342884Z",
497
+ "iopub.status.busy": "2026-02-02T01:47:26.342734Z",
498
+ "iopub.status.idle": "2026-02-02T01:47:26.347577Z",
499
+ "shell.execute_reply": "2026-02-02T01:47:26.345847Z"
500
+ },
501
+ "papermill": {
502
+ "duration": 0.012844,
503
+ "end_time": "2026-02-02T01:47:26.349482",
504
+ "exception": false,
505
+ "start_time": "2026-02-02T01:47:26.336638",
506
+ "status": "completed"
507
+ },
508
+ "tags": []
509
+ },
510
+ "outputs": [],
511
+ "source": [
512
+ "if text_columns:\n",
513
+ " processor = TextColumnProcessor(config)\n",
514
+ " \n",
515
+ " print(\"Processing TEXT columns...\")\n",
516
+ " print(\"(This may take a moment for large datasets)\\n\")\n",
517
+ " \n",
518
+ " results = []\n",
519
+ " df_processed = df.copy()\n",
520
+ " \n",
521
+ " for col_name in text_columns:\n",
522
+ " print(f\"\\n{'='*70}\")\n",
523
+ " print(f\"Processing: {col_name}\")\n",
524
+ " print(f\"{'='*70}\")\n",
525
+ " \n",
526
+ " df_processed, result = processor.process_column(df_processed, col_name)\n",
527
+ " results.append(result)\n",
528
+ " \n",
529
+ " print(f\"\\n\\u2705 Processing complete:\")\n",
530
+ " print(f\" Embedding shape: {result.embeddings_shape}\")\n",
531
+ " print(f\" Components kept: {result.n_components}\")\n",
532
+ " print(f\" Explained variance: {result.explained_variance:.1%}\")\n",
533
+ " print(f\" Features created: {', '.join(result.component_columns)}\")\n",
534
+ " \n",
535
+ " print(f\"\\n\\n{'='*70}\")\n",
536
+ " print(\"PROCESSING SUMMARY\")\n",
537
+ " print(f\"{'='*70}\")\n",
538
+ " print(f\"\\nOriginal columns: {len(df.columns)}\")\n",
539
+ " print(f\"New columns added: {len(df_processed.columns) - len(df.columns)}\")\n",
540
+ " print(f\"Total columns: {len(df_processed.columns)}\")"
541
+ ]
542
+ },
543
+ {
544
+ "cell_type": "markdown",
545
+ "id": "cell-13",
546
+ "metadata": {
547
+ "papermill": {
548
+ "duration": 0.007613,
549
+ "end_time": "2026-02-02T01:47:26.363074",
550
+ "exception": false,
551
+ "start_time": "2026-02-02T01:47:26.355461",
552
+ "status": "completed"
553
+ },
554
+ "tags": []
555
+ },
556
+ "source": [
557
+ "## 2a.6 Visualize Results\n",
558
+ "\n",
559
+ "Understanding the PC features created from text embeddings."
560
+ ]
561
+ },
562
+ {
563
+ "cell_type": "code",
564
+ "execution_count": null,
565
+ "id": "cell-14",
566
+ "metadata": {
567
+ "execution": {
568
+ "iopub.execute_input": "2026-02-02T01:47:26.376406Z",
569
+ "iopub.status.busy": "2026-02-02T01:47:26.376286Z",
570
+ "iopub.status.idle": "2026-02-02T01:47:26.380333Z",
571
+ "shell.execute_reply": "2026-02-02T01:47:26.379711Z"
572
+ },
573
+ "papermill": {
574
+ "duration": 0.011065,
575
+ "end_time": "2026-02-02T01:47:26.381178",
576
+ "exception": false,
577
+ "start_time": "2026-02-02T01:47:26.370113",
578
+ "status": "completed"
579
+ },
580
+ "tags": []
581
+ },
582
+ "outputs": [],
583
+ "source": [
584
+ "if text_columns and results:\n",
585
+ " for result in results:\n",
586
+ " print(f\"\\n{'='*70}\")\n",
587
+ " print(f\"Results: {result.column_name}\")\n",
588
+ " print(f\"{'='*70}\")\n",
589
+ " \n",
590
+ " # Explained variance per component\n",
591
+ " reducer = processor._reducers[result.column_name]\n",
592
+ " var_ratios = reducer._pca.explained_variance_ratio_\n",
593
+ " cumulative = np.cumsum(var_ratios)\n",
594
+ " \n",
595
+ " fig = make_subplots(rows=1, cols=2,\n",
596
+ " subplot_titles=(\"Variance per Component\", \"Cumulative Variance\"))\n",
597
+ " \n",
598
+ " fig.add_trace(go.Bar(\n",
599
+ " x=[f\"PC{i+1}\" for i in range(len(var_ratios))],\n",
600
+ " y=var_ratios,\n",
601
+ " marker_color='steelblue'\n",
602
+ " ), row=1, col=1)\n",
603
+ " \n",
604
+ " fig.add_trace(go.Scatter(\n",
605
+ " x=[f\"PC{i+1}\" for i in range(len(cumulative))],\n",
606
+ " y=cumulative,\n",
607
+ " mode='lines+markers',\n",
608
+ " line_color='green'\n",
609
+ " ), row=1, col=2)\n",
610
+ " \n",
611
+ " fig.add_hline(y=config.variance_threshold, line_dash=\"dash\", line_color=\"red\",\n",
612
+ " annotation_text=f\"Target: {config.variance_threshold:.0%}\",\n",
613
+ " row=1, col=2)\n",
614
+ " \n",
615
+ " fig.update_layout(\n",
616
+ " title=f\"PCA Results: {result.column_name}\",\n",
617
+ " height=400,\n",
618
+ " template=\"plotly_white\",\n",
619
+ " showlegend=False\n",
620
+ " )\n",
621
+ " fig.update_yaxes(title_text=\"Variance Ratio\", row=1, col=1)\n",
622
+ " fig.update_yaxes(title_text=\"Cumulative Variance\", row=1, col=2)\n",
623
+ " display_figure(fig)\n",
624
+ " \n",
625
+ " # PC feature distributions\n",
626
+ " if len(result.component_columns) >= 2:\n",
627
+ " fig = px.scatter(\n",
628
+ " df_processed,\n",
629
+ " x=result.component_columns[0],\n",
630
+ " y=result.component_columns[1],\n",
631
+ " title=f\"PC1 vs PC2: {result.column_name}\",\n",
632
+ " opacity=0.5\n",
633
+ " )\n",
634
+ " fig.update_layout(template=\"plotly_white\", height=400)\n",
635
+ " display_figure(fig)"
636
+ ]
637
+ },
638
+ {
639
+ "cell_type": "markdown",
640
+ "id": "cell-15",
641
+ "metadata": {
642
+ "papermill": {
643
+ "duration": 0.005635,
644
+ "end_time": "2026-02-02T01:47:26.393531",
645
+ "exception": false,
646
+ "start_time": "2026-02-02T01:47:26.387896",
647
+ "status": "completed"
648
+ },
649
+ "tags": []
650
+ },
651
+ "source": [
652
+ "## 2a.7 Update Findings with Text Processing Metadata"
653
+ ]
654
+ },
655
+ {
656
+ "cell_type": "code",
657
+ "execution_count": null,
658
+ "id": "cell-16",
659
+ "metadata": {
660
+ "execution": {
661
+ "iopub.execute_input": "2026-02-02T01:47:26.409118Z",
662
+ "iopub.status.busy": "2026-02-02T01:47:26.408976Z",
663
+ "iopub.status.idle": "2026-02-02T01:47:29.401562Z",
664
+ "shell.execute_reply": "2026-02-02T01:47:29.401101Z"
665
+ },
666
+ "papermill": {
667
+ "duration": 3.000962,
668
+ "end_time": "2026-02-02T01:47:29.402237",
669
+ "exception": false,
670
+ "start_time": "2026-02-02T01:47:26.401275",
671
+ "status": "completed"
672
+ },
673
+ "tags": []
674
+ },
675
+ "outputs": [],
676
+ "source": [
677
+ "if text_columns and results:\n",
678
+ " for result in results:\n",
679
+ " metadata = TextProcessingMetadata(\n",
680
+ " column_name=result.column_name,\n",
681
+ " embedding_model=config.embedding_model,\n",
682
+ " embedding_dim=result.embeddings_shape[1],\n",
683
+ " n_components=result.n_components,\n",
684
+ " explained_variance=result.explained_variance,\n",
685
+ " component_columns=result.component_columns,\n",
686
+ " variance_threshold_used=config.variance_threshold,\n",
687
+ " processing_approach=\"pca\"\n",
688
+ " )\n",
689
+ " findings.text_processing[result.column_name] = metadata\n",
690
+ " \n",
691
+ " print(f\"\\u2705 Added metadata for {result.column_name}:\")\n",
692
+ " print(f\" Model: {metadata.embedding_model}\")\n",
693
+ " print(f\" Components: {metadata.n_components}\")\n",
694
+ " print(f\" Explained variance: {metadata.explained_variance:.1%}\")\n",
695
+ " \n",
696
+ " findings.save(FINDINGS_PATH)\n",
697
+ " print(f\"\\nFindings saved to: {FINDINGS_PATH}\")\n",
698
+ "\n",
699
+ "from customer_retention.analysis.notebook_html_exporter import export_notebook_html\n",
700
+ "export_notebook_html(Path(\"02a_text_columns_deep_dive.ipynb\"), EXPERIMENTS_DIR / \"docs\")\n"
701
+ ]
702
+ },
703
+ {
704
+ "cell_type": "markdown",
705
+ "id": "cell-17",
706
+ "metadata": {
707
+ "papermill": {
708
+ "duration": 0.006327,
709
+ "end_time": "2026-02-02T01:47:29.415379",
710
+ "exception": false,
711
+ "start_time": "2026-02-02T01:47:29.409052",
712
+ "status": "completed"
713
+ },
714
+ "tags": []
715
+ },
716
+ "source": [
717
+ "## 2a.8 Generate Recommendations"
718
+ ]
719
+ },
720
+ {
721
+ "cell_type": "code",
722
+ "execution_count": null,
723
+ "id": "cell-18",
724
+ "metadata": {
725
+ "execution": {
726
+ "iopub.execute_input": "2026-02-02T01:47:29.429066Z",
727
+ "iopub.status.busy": "2026-02-02T01:47:29.428925Z",
728
+ "iopub.status.idle": "2026-02-02T01:47:29.432197Z",
729
+ "shell.execute_reply": "2026-02-02T01:47:29.431402Z"
730
+ },
731
+ "papermill": {
732
+ "duration": 0.011237,
733
+ "end_time": "2026-02-02T01:47:29.433125",
734
+ "exception": false,
735
+ "start_time": "2026-02-02T01:47:29.421888",
736
+ "status": "completed"
737
+ },
738
+ "tags": []
739
+ },
740
+ "outputs": [],
741
+ "source": [
742
+ "if text_columns and results:\n",
743
+ " print(\"\\n\" + \"=\"*70)\n",
744
+ " print(\"PRODUCTION RECOMMENDATIONS\")\n",
745
+ " print(\"=\"*70)\n",
746
+ " \n",
747
+ " for result in results:\n",
748
+ " print(f\"\\n\\U0001f527 {result.column_name}:\")\n",
749
+ " print(f\" Action: embed_reduce (embeddings + PCA)\")\n",
750
+ " print(f\" Model: {config.embedding_model}\")\n",
751
+ " print(f\" Variance threshold: {config.variance_threshold:.0%}\")\n",
752
+ " print(f\" Expected features: {result.n_components}\")\n",
753
+ " print(f\" Feature names: {', '.join(result.component_columns[:3])}...\")\n",
754
+ " \n",
755
+ " print(\"\\n\\U0001f4a1 These recommendations will be used by the pipeline generator.\")\n",
756
+ " print(\" The same processing will be applied in production.\")"
757
+ ]
758
+ },
759
+ {
760
+ "cell_type": "markdown",
761
+ "id": "cell-19",
762
+ "metadata": {
763
+ "papermill": {
764
+ "duration": 0.007691,
765
+ "end_time": "2026-02-02T01:47:29.446230",
766
+ "exception": false,
767
+ "start_time": "2026-02-02T01:47:29.438539",
768
+ "status": "completed"
769
+ },
770
+ "tags": []
771
+ },
772
+ "source": [
773
+ "---\n",
774
+ "\n",
775
+ "## Summary\n",
776
+ "\n",
777
+ "In this notebook, we:\n",
778
+ "\n",
779
+ "1. **Analyzed** TEXT columns for length and content patterns\n",
780
+ "2. **Generated embeddings** using sentence-transformers\n",
781
+ "3. **Applied PCA** to reduce dimensions while preserving variance\n",
782
+ "4. **Created numeric features** (pc1, pc2, ...) for downstream ML\n",
783
+ "5. **Updated findings** with processing metadata\n",
784
+ "\n",
785
+ "## Key Results\n",
786
+ "\n",
787
+ "| Column | Components | Explained Variance |\n",
788
+ "|--------|------------|--------------------|\n",
789
+ "| (Filled by execution) | | |\n",
790
+ "\n",
791
+ "---\n",
792
+ "\n",
793
+ "## Next Steps\n",
794
+ "\n",
795
+ "Continue to **03_quality_assessment.ipynb** to:\n",
796
+ "- Analyze duplicate records and value conflicts\n",
797
+ "- Deep dive into missing value patterns\n",
798
+ "- Analyze outliers with the IQR method\n",
799
+ "- Get cleaning recommendations"
800
+ ]
801
+ },
802
+ {
803
+ "cell_type": "markdown",
804
+ "id": "g5gchegxiyl",
805
+ "metadata": {
806
+ "papermill": {
807
+ "duration": 0.006287,
808
+ "end_time": "2026-02-02T01:47:29.459193",
809
+ "exception": false,
810
+ "start_time": "2026-02-02T01:47:29.452906",
811
+ "status": "completed"
812
+ },
813
+ "tags": []
814
+ },
815
+ "source": [
816
+ "> **Save Reminder:** Save this notebook (Ctrl+S / Cmd+S) before running the next one.\n",
817
+ "> The next notebook will automatically export this notebook's HTML documentation from the saved file."
818
+ ]
819
+ }
820
+ ],
821
+ "metadata": {
822
+ "kernelspec": {
823
+ "display_name": "Python 3",
824
+ "language": "python",
825
+ "name": "python3"
826
+ },
827
+ "language_info": {
828
+ "codemirror_mode": {
829
+ "name": "ipython",
830
+ "version": 3
831
+ },
832
+ "file_extension": ".py",
833
+ "mimetype": "text/x-python",
834
+ "name": "python",
835
+ "nbconvert_exporter": "python",
836
+ "pygments_lexer": "ipython3",
837
+ "version": "3.12.4"
838
+ },
839
+ "papermill": {
840
+ "default_parameters": {},
841
+ "duration": 9.591376,
842
+ "end_time": "2026-02-02T01:47:32.083387",
843
+ "environment_variables": {},
844
+ "exception": null,
845
+ "input_path": "/Users/Vital/python/CustomerRetention/exploration_notebooks/02a_text_columns_deep_dive.ipynb",
846
+ "output_path": "/Users/Vital/python/CustomerRetention/exploration_notebooks/02a_text_columns_deep_dive.ipynb",
847
+ "parameters": {},
848
+ "start_time": "2026-02-02T01:47:22.492011",
849
+ "version": "2.6.0"
850
+ }
851
+ },
852
+ "nbformat": 4,
853
+ "nbformat_minor": 5
854
+ }