churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,961 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "cell-0",
6
+ "metadata": {
7
+ "papermill": {
8
+ "duration": 0.007095,
9
+ "end_time": "2026-02-02T01:44:16.460374",
10
+ "exception": false,
11
+ "start_time": "2026-02-02T01:44:16.453279",
12
+ "status": "completed"
13
+ },
14
+ "tags": []
15
+ },
16
+ "source": [
17
+ "# Chapter 1a.a: Temporal Text Columns Deep Dive\n",
18
+ "\n",
19
+ "**Purpose:** Transform TEXT columns in event-level data into numeric features, then aggregate across time windows.\n",
20
+ "\n",
21
+ "**When to use this notebook:**\n",
22
+ "- Your dataset is EVENT_LEVEL (time series)\n",
23
+ "- You have TEXT columns (tickets, messages, emails, etc.)\n",
24
+ "- Run after 01a_temporal_deep_dive.ipynb\n",
25
+ "\n",
26
+ "**Processing Flow:**\n",
27
+ "```\n",
28
+ "Event TEXT → Embeddings → PCA → pc1, pc2, ... → Time Window Aggregation\n",
29
+ "```\n",
30
+ "\n",
31
+ "**What you'll learn:**\n",
32
+ "- How to embed text at the event level\n",
33
+ "- How to choose between fast and high-quality embedding models\n",
34
+ "- How PCA features aggregate across time windows\n",
35
+ "- How to create features like `ticket_text_pc1_mean_30d`\n",
36
+ "\n",
37
+ "**Outputs:**\n",
38
+ "- PC features per event\n",
39
+ "- Aggregation plan for PC features\n",
40
+ "- Updated findings with text processing metadata\n",
41
+ "\n",
42
+ "---\n",
43
+ "\n",
44
+ "## Two Approaches to Text Feature Engineering\n",
45
+ "\n",
46
+ "| Approach | Method | When to Use |\n",
47
+ "|----------|--------|-------------|\n",
48
+ "| **1. Embeddings + PCA + Aggregation** (This notebook) | Per-event PCA → aggregate | Temporal patterns in text |\n",
49
+ "| **2. LLM Labeling** (Future) | LLM labels → categorical aggregation | Specific categories needed |\n",
50
+ "\n",
51
+ "### Embedding Model Options\n",
52
+ "\n",
53
+ "| Model | Size | Embedding Dim | Speed | Quality | Best For |\n",
54
+ "|-------|------|---------------|-------|---------|----------|\n",
55
+ "| **MiniLM** (default) | 90 MB | 384 | Fast | Good | CPU, quick iteration, small datasets |\n",
56
+ "| **Qwen3-0.6B** | 1.2 GB | 1024 | Medium | Better | GPU available, production quality |\n",
57
+ "| **Qwen3-4B** | 8 GB | 2560 | Slow | High | 16GB+ GPU, multilingual, high accuracy |\n",
58
+ "| **Qwen3-8B** | 16 GB | 4096 | Slowest | Highest | 32GB+ GPU, research, max quality |\n",
59
+ "\n",
60
+ "**Note:** Models are downloaded on first use. For event-level data with many rows, faster models (MiniLM) are recommended unless you have a powerful GPU.\n",
61
+ "\n",
62
+ "### Processing Flow\n",
63
+ "\n",
64
+ "```\n",
65
+ "Per Event: TEXT → Embedding → PCA → [pc1, pc2, pc3]\n",
66
+ "Aggregate: customer_id → ticket_text_pc1_mean_30d, ticket_text_pc2_std_7d, ...\n",
67
+ "```\n",
68
+ "\n",
69
+ "This captures how text semantics change over time windows."
70
+ ]
71
+ },
72
+ {
73
+ "cell_type": "markdown",
74
+ "id": "cell-1",
75
+ "metadata": {
76
+ "papermill": {
77
+ "duration": 0.005664,
78
+ "end_time": "2026-02-02T01:44:16.472356",
79
+ "exception": false,
80
+ "start_time": "2026-02-02T01:44:16.466692",
81
+ "status": "completed"
82
+ },
83
+ "tags": []
84
+ },
85
+ "source": [
86
+ "## 1a.a.1 Load Previous Findings"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": null,
92
+ "id": "cell-2",
93
+ "metadata": {
94
+ "execution": {
95
+ "iopub.execute_input": "2026-02-02T01:44:16.484376Z",
96
+ "iopub.status.busy": "2026-02-02T01:44:16.484246Z",
97
+ "iopub.status.idle": "2026-02-02T01:44:18.851167Z",
98
+ "shell.execute_reply": "2026-02-02T01:44:18.850589Z"
99
+ },
100
+ "papermill": {
101
+ "duration": 2.373857,
102
+ "end_time": "2026-02-02T01:44:18.852091",
103
+ "exception": false,
104
+ "start_time": "2026-02-02T01:44:16.478234",
105
+ "status": "completed"
106
+ },
107
+ "tags": []
108
+ },
109
+ "outputs": [],
110
+ "source": [
111
+ "from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
112
+ "track_and_export_previous(\"01a_a_temporal_text_deep_dive.ipynb\")\n",
113
+ "\n",
114
+ "from customer_retention.analysis.auto_explorer import ExplorationFindings, TextProcessingMetadata\n",
115
+ "from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table, console\n",
116
+ "from customer_retention.core.config.column_config import ColumnType, DatasetGranularity\n",
117
+ "from customer_retention.stages.profiling import (\n",
118
+ " TextColumnProcessor, TextProcessingConfig, TextColumnResult,\n",
119
+ " TimeWindowAggregator, AggregationPlan,\n",
120
+ " EMBEDDING_MODELS, get_model_info, list_available_models\n",
121
+ ")\n",
122
+ "import pandas as pd\n",
123
+ "import numpy as np\n",
124
+ "import plotly.graph_objects as go\n",
125
+ "import plotly.express as px\n",
126
+ "from plotly.subplots import make_subplots\n",
127
+ "from customer_retention.core.config.experiments import FINDINGS_DIR, EXPERIMENTS_DIR, OUTPUT_DIR, setup_experiments_structure"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": null,
133
+ "id": "cell-3",
134
+ "metadata": {
135
+ "execution": {
136
+ "iopub.execute_input": "2026-02-02T01:44:18.863269Z",
137
+ "iopub.status.busy": "2026-02-02T01:44:18.863126Z",
138
+ "iopub.status.idle": "2026-02-02T01:44:18.888705Z",
139
+ "shell.execute_reply": "2026-02-02T01:44:18.888264Z"
140
+ },
141
+ "papermill": {
142
+ "duration": 0.032282,
143
+ "end_time": "2026-02-02T01:44:18.889776",
144
+ "exception": false,
145
+ "start_time": "2026-02-02T01:44:18.857494",
146
+ "status": "completed"
147
+ },
148
+ "tags": []
149
+ },
150
+ "outputs": [],
151
+ "source": [
152
+ "# === CONFIGURATION ===\n",
153
+ "from pathlib import Path\n",
154
+ "\n",
155
+ "# FINDINGS_DIR imported from customer_retention.core.config.experiments\n",
156
+ "\n",
157
+ "findings_files = [f for f in FINDINGS_DIR.glob(\"*_findings.yaml\") if \"multi_dataset\" not in f.name]\n",
158
+ "if not findings_files:\n",
159
+ " raise FileNotFoundError(f\"No findings files found in {FINDINGS_DIR}. Run notebook 01 first.\")\n",
160
+ "\n",
161
+ "findings_files.sort(key=lambda f: f.stat().st_mtime, reverse=True)\n",
162
+ "FINDINGS_PATH = str(findings_files[0])\n",
163
+ "\n",
164
+ "print(f\"Found {len(findings_files)} findings file(s)\")\n",
165
+ "print(f\"Using: {FINDINGS_PATH}\")\n",
166
+ "\n",
167
+ "findings = ExplorationFindings.load(FINDINGS_PATH)\n",
168
+ "print(f\"\\nLoaded findings for {findings.column_count} columns from {findings.source_path}\")"
169
+ ]
170
+ },
171
+ {
172
+ "cell_type": "code",
173
+ "execution_count": null,
174
+ "id": "cell-4",
175
+ "metadata": {
176
+ "execution": {
177
+ "iopub.execute_input": "2026-02-02T01:44:18.900980Z",
178
+ "iopub.status.busy": "2026-02-02T01:44:18.900867Z",
179
+ "iopub.status.idle": "2026-02-02T01:44:18.904105Z",
180
+ "shell.execute_reply": "2026-02-02T01:44:18.903459Z"
181
+ },
182
+ "papermill": {
183
+ "duration": 0.009642,
184
+ "end_time": "2026-02-02T01:44:18.904756",
185
+ "exception": false,
186
+ "start_time": "2026-02-02T01:44:18.895114",
187
+ "status": "completed"
188
+ },
189
+ "tags": []
190
+ },
191
+ "outputs": [],
192
+ "source": [
193
+ "# Verify this is a time series dataset\n",
194
+ "# This notebook is ONLY for event-level (time series) data with multiple rows per entity\n",
195
+ "\n",
196
+ "if not findings.is_time_series:\n",
197
+ " print(\"=\" * 70)\n",
198
+ " print(\"WRONG NOTEBOOK FOR THIS DATASET\")\n",
199
+ " print(\"=\" * 70)\n",
200
+ " print()\n",
201
+ " print(\"This dataset is ENTITY-LEVEL (one row per entity), not event-level.\")\n",
202
+ " print()\n",
203
+ " print(\"For TEXT columns in entity-level data, use:\")\n",
204
+ " print(\" 02a_text_columns_deep_dive.ipynb\")\n",
205
+ " print()\n",
206
+ " print(\"This notebook (01a_a) is for TEXT columns in EVENT-LEVEL data where:\")\n",
207
+ " print(\" - Multiple events per entity (e.g., support tickets, transactions)\")\n",
208
+ " print(\" - Text is embedded per-event, then aggregated across time windows\")\n",
209
+ " print()\n",
210
+ " raise SystemExit(\"Please use 02a_text_columns_deep_dive.ipynb for entity-level data.\")\n",
211
+ "\n",
212
+ "ts_meta = findings.time_series_metadata\n",
213
+ "temporal_pattern = (ts_meta.temporal_pattern or \"unknown\").upper()\n",
214
+ "print(f\"Dataset confirmed as {temporal_pattern} (event-level)\")\n",
215
+ "ENTITY_COLUMN = ts_meta.entity_column\n",
216
+ "TIME_COLUMN = ts_meta.time_column\n",
217
+ "print(f\" Entity column: {ENTITY_COLUMN}\")\n",
218
+ "print(f\" Time column: {TIME_COLUMN}\")"
219
+ ]
220
+ },
221
+ {
222
+ "cell_type": "code",
223
+ "execution_count": null,
224
+ "id": "cell-5",
225
+ "metadata": {
226
+ "execution": {
227
+ "iopub.execute_input": "2026-02-02T01:44:18.917113Z",
228
+ "iopub.status.busy": "2026-02-02T01:44:18.916992Z",
229
+ "iopub.status.idle": "2026-02-02T01:44:18.919898Z",
230
+ "shell.execute_reply": "2026-02-02T01:44:18.919272Z"
231
+ },
232
+ "papermill": {
233
+ "duration": 0.009896,
234
+ "end_time": "2026-02-02T01:44:18.920517",
235
+ "exception": false,
236
+ "start_time": "2026-02-02T01:44:18.910621",
237
+ "status": "completed"
238
+ },
239
+ "tags": []
240
+ },
241
+ "outputs": [],
242
+ "source": [
243
+ "# Identify TEXT columns\n",
244
+ "text_columns = [\n",
245
+ " name for name, col in findings.columns.items()\n",
246
+ " if col.inferred_type == ColumnType.TEXT\n",
247
+ "]\n",
248
+ "\n",
249
+ "if not text_columns:\n",
250
+ " print(\"\\u26a0\\ufe0f No TEXT columns detected in this dataset.\")\n",
251
+ " print(\" This notebook is only needed when TEXT columns are present.\")\n",
252
+ " print(\" Continue to notebook 01b_temporal_quality.ipynb\")\n",
253
+ "else:\n",
254
+ " print(f\"\\u2705 Found {len(text_columns)} TEXT column(s):\")\n",
255
+ " for col in text_columns:\n",
256
+ " col_info = findings.columns[col]\n",
257
+ " print(f\" - {col} (Confidence: {col_info.confidence:.0%})\")"
258
+ ]
259
+ },
260
+ {
261
+ "cell_type": "markdown",
262
+ "id": "cell-6",
263
+ "metadata": {
264
+ "papermill": {
265
+ "duration": 0.005389,
266
+ "end_time": "2026-02-02T01:44:18.931448",
267
+ "exception": false,
268
+ "start_time": "2026-02-02T01:44:18.926059",
269
+ "status": "completed"
270
+ },
271
+ "tags": []
272
+ },
273
+ "source": [
274
+ "## 1a.a.2 Load Source Data"
275
+ ]
276
+ },
277
+ {
278
+ "cell_type": "code",
279
+ "execution_count": null,
280
+ "id": "cell-7",
281
+ "metadata": {
282
+ "execution": {
283
+ "iopub.execute_input": "2026-02-02T01:44:18.943724Z",
284
+ "iopub.status.busy": "2026-02-02T01:44:18.943564Z",
285
+ "iopub.status.idle": "2026-02-02T01:44:19.057178Z",
286
+ "shell.execute_reply": "2026-02-02T01:44:19.056794Z"
287
+ },
288
+ "papermill": {
289
+ "duration": 0.121646,
290
+ "end_time": "2026-02-02T01:44:19.058713",
291
+ "exception": false,
292
+ "start_time": "2026-02-02T01:44:18.937067",
293
+ "status": "completed"
294
+ },
295
+ "tags": []
296
+ },
297
+ "outputs": [],
298
+ "source": [
299
+ "from customer_retention.stages.temporal import load_data_with_snapshot_preference, TEMPORAL_METADATA_COLS\n",
300
+ "\n",
301
+ "df, data_source = load_data_with_snapshot_preference(findings, output_dir=str(FINDINGS_DIR))\n",
302
+ "charts = ChartBuilder()\n",
303
+ "\n",
304
+ "print(f\"Loaded {len(df):,} events x {len(df.columns)} columns\")\n",
305
+ "print(f\"Data source: {data_source}\")\n",
306
+ "print(f\"Unique entities: {df[ENTITY_COLUMN].nunique():,}\")"
307
+ ]
308
+ },
309
+ {
310
+ "cell_type": "markdown",
311
+ "id": "cell-8",
312
+ "metadata": {
313
+ "papermill": {
314
+ "duration": 0.005568,
315
+ "end_time": "2026-02-02T01:44:19.069912",
316
+ "exception": false,
317
+ "start_time": "2026-02-02T01:44:19.064344",
318
+ "status": "completed"
319
+ },
320
+ "tags": []
321
+ },
322
+ "source": [
323
+ "## 1a.a.3 Configuration\n",
324
+ "\n",
325
+ "### Available Embedding Models"
326
+ ]
327
+ },
328
+ {
329
+ "cell_type": "code",
330
+ "execution_count": null,
331
+ "id": "cell-9",
332
+ "metadata": {
333
+ "execution": {
334
+ "iopub.execute_input": "2026-02-02T01:44:19.081720Z",
335
+ "iopub.status.busy": "2026-02-02T01:44:19.081594Z",
336
+ "iopub.status.idle": "2026-02-02T01:44:19.085178Z",
337
+ "shell.execute_reply": "2026-02-02T01:44:19.084432Z"
338
+ },
339
+ "papermill": {
340
+ "duration": 0.011809,
341
+ "end_time": "2026-02-02T01:44:19.087109",
342
+ "exception": false,
343
+ "start_time": "2026-02-02T01:44:19.075300",
344
+ "status": "completed"
345
+ },
346
+ "tags": []
347
+ },
348
+ "outputs": [],
349
+ "source": [
350
+ "# Display available embedding models\n",
351
+ "print(\"Available Embedding Models\")\n",
352
+ "print(\"=\" * 80)\n",
353
+ "print(f\"{'Preset':<15} {'Model':<35} {'Size':<10} {'Dim':<8} {'GPU?'}\")\n",
354
+ "print(\"-\" * 80)\n",
355
+ "\n",
356
+ "for preset in list_available_models():\n",
357
+ " info = get_model_info(preset)\n",
358
+ " size = f\"{info['size_mb']} MB\" if info['size_mb'] < 1000 else f\"{info['size_mb']/1000:.1f} GB\"\n",
359
+ " gpu = \"Yes\" if info['gpu_recommended'] else \"No\"\n",
360
+ " print(f\"{preset:<15} {info['model_name']:<35} {size:<10} {info['embedding_dim']:<8} {gpu}\")\n",
361
+ "\n",
362
+ "print(\"\\nFor event-level data with many rows, MiniLM is recommended for faster processing.\")\n",
363
+ "print(\"Qwen3 models produce higher quality embeddings but require GPU for reasonable speed.\")"
364
+ ]
365
+ },
366
+ {
367
+ "cell_type": "code",
368
+ "execution_count": null,
369
+ "id": "bc5ekeprioi",
370
+ "metadata": {
371
+ "execution": {
372
+ "iopub.execute_input": "2026-02-02T01:44:19.101537Z",
373
+ "iopub.status.busy": "2026-02-02T01:44:19.101409Z",
374
+ "iopub.status.idle": "2026-02-02T01:44:19.105129Z",
375
+ "shell.execute_reply": "2026-02-02T01:44:19.104551Z"
376
+ },
377
+ "papermill": {
378
+ "duration": 0.01134,
379
+ "end_time": "2026-02-02T01:44:19.105873",
380
+ "exception": false,
381
+ "start_time": "2026-02-02T01:44:19.094533",
382
+ "status": "completed"
383
+ },
384
+ "tags": []
385
+ },
386
+ "outputs": [],
387
+ "source": [
388
+ "# === TEXT PROCESSING CONFIGURATION ===\n",
389
+ "# Choose your embedding model preset:\n",
390
+ "# \"minilm\" - Fast, CPU-friendly, recommended for event-level data (default)\n",
391
+ "# \"qwen3-0.6b\" - Better quality, needs GPU\n",
392
+ "# \"qwen3-4b\" - High quality, needs 16GB+ GPU\n",
393
+ "# \"qwen3-8b\" - Highest quality, needs 32GB+ GPU\n",
394
+ "\n",
395
+ "EMBEDDING_PRESET = \"minilm\" # Recommended for event-level data\n",
396
+ "\n",
397
+ "# PCA configuration (capped at 10 for manageability in aggregation)\n",
398
+ "VARIANCE_THRESHOLD = 0.95 # Keep components explaining 95% of variance\n",
399
+ "MIN_COMPONENTS = 2 # At least 2 features per text column\n",
400
+ "MAX_COMPONENTS = 10 # Cap at 10 to keep aggregation manageable\n",
401
+ "\n",
402
+ "# Aggregation configuration\n",
403
+ "AGGREGATION_WINDOWS = [\"7d\", \"30d\", \"90d\", \"all_time\"]\n",
404
+ "AGGREGATION_FUNCS = [\"mean\", \"std\", \"first\", \"last\"]\n",
405
+ "\n",
406
+ "# Create configuration\n",
407
+ "model_info = get_model_info(EMBEDDING_PRESET)\n",
408
+ "text_config = TextProcessingConfig(\n",
409
+ " embedding_model=model_info[\"model_name\"],\n",
410
+ " variance_threshold=VARIANCE_THRESHOLD,\n",
411
+ " max_components=MAX_COMPONENTS,\n",
412
+ " min_components=MIN_COMPONENTS,\n",
413
+ " batch_size=32\n",
414
+ ")\n",
415
+ "\n",
416
+ "print(\"Text Processing Configuration\")\n",
417
+ "print(\"=\" * 50)\n",
418
+ "print(f\" Preset: {EMBEDDING_PRESET}\")\n",
419
+ "print(f\" Model: {text_config.embedding_model}\")\n",
420
+ "print(f\" Model size: {model_info['size_mb']} MB\")\n",
421
+ "print(f\" Embedding dimension: {model_info['embedding_dim']}\")\n",
422
+ "print(f\" GPU recommended: {'Yes' if model_info['gpu_recommended'] else 'No'}\")\n",
423
+ "print()\n",
424
+ "print(f\" Variance threshold: {text_config.variance_threshold:.0%}\")\n",
425
+ "print(f\" Max components: {text_config.max_components}\")\n",
426
+ "print()\n",
427
+ "print(\"Aggregation Configuration\")\n",
428
+ "print(\"=\" * 50)\n",
429
+ "print(f\" Windows: {AGGREGATION_WINDOWS}\")\n",
430
+ "print(f\" Functions: {AGGREGATION_FUNCS}\")\n",
431
+ "\n",
432
+ "if model_info['gpu_recommended']:\n",
433
+ " print()\n",
434
+ " print(\"Warning: This model works best with GPU. Consider 'minilm' for faster processing.\")"
435
+ ]
436
+ },
437
+ {
438
+ "cell_type": "markdown",
439
+ "id": "cell-10",
440
+ "metadata": {
441
+ "papermill": {
442
+ "duration": 0.011381,
443
+ "end_time": "2026-02-02T01:44:19.123406",
444
+ "exception": false,
445
+ "start_time": "2026-02-02T01:44:19.112025",
446
+ "status": "completed"
447
+ },
448
+ "tags": []
449
+ },
450
+ "source": [
451
+ "## 1a.a.4 Text Column Analysis"
452
+ ]
453
+ },
454
+ {
455
+ "cell_type": "code",
456
+ "execution_count": null,
457
+ "id": "cell-11",
458
+ "metadata": {
459
+ "execution": {
460
+ "iopub.execute_input": "2026-02-02T01:44:19.143004Z",
461
+ "iopub.status.busy": "2026-02-02T01:44:19.142803Z",
462
+ "iopub.status.idle": "2026-02-02T01:44:19.148785Z",
463
+ "shell.execute_reply": "2026-02-02T01:44:19.148008Z"
464
+ },
465
+ "papermill": {
466
+ "duration": 0.014313,
467
+ "end_time": "2026-02-02T01:44:19.149575",
468
+ "exception": false,
469
+ "start_time": "2026-02-02T01:44:19.135262",
470
+ "status": "completed"
471
+ },
472
+ "tags": []
473
+ },
474
+ "outputs": [],
475
+ "source": [
476
+ "if text_columns:\n",
477
+ " for col_name in text_columns:\n",
478
+ " print(f\"\\n{'='*70}\")\n",
479
+ " print(f\"Column: {col_name}\")\n",
480
+ " print(f\"{'='*70}\")\n",
481
+ " \n",
482
+ " text_series = df[col_name].fillna(\"\")\n",
483
+ " \n",
484
+ " non_empty = (text_series.str.len() > 0).sum()\n",
485
+ " avg_length = text_series.str.len().mean()\n",
486
+ " \n",
487
+ " print(f\"\\n\\U0001f4ca Statistics:\")\n",
488
+ " print(f\" Total events: {len(text_series):,}\")\n",
489
+ " print(f\" Non-empty: {non_empty:,} ({non_empty/len(text_series)*100:.1f}%)\")\n",
490
+ " print(f\" Avg length: {avg_length:.0f} characters\")\n",
491
+ " \n",
492
+ " # Texts per entity\n",
493
+ " texts_per_entity = df.groupby(ENTITY_COLUMN)[col_name].apply(\n",
494
+ " lambda x: (x.fillna(\"\").str.len() > 0).sum()\n",
495
+ " )\n",
496
+ " print(f\"\\n\\U0001f465 Text events per entity:\")\n",
497
+ " print(f\" Mean: {texts_per_entity.mean():.1f}\")\n",
498
+ " print(f\" Median: {texts_per_entity.median():.0f}\")\n",
499
+ " print(f\" Max: {texts_per_entity.max():,}\")\n",
500
+ " \n",
501
+ " # Sample texts\n",
502
+ " print(f\"\\n\\U0001f4dd Sample texts:\")\n",
503
+ " samples = text_series[text_series.str.len() > 10].head(3)\n",
504
+ " for i, sample in enumerate(samples, 1):\n",
505
+ " truncated = sample[:80] + \"...\" if len(sample) > 80 else sample\n",
506
+ " print(f\" {i}. {truncated}\")"
507
+ ]
508
+ },
509
+ {
510
+ "cell_type": "markdown",
511
+ "id": "cell-12",
512
+ "metadata": {
513
+ "papermill": {
514
+ "duration": 0.007406,
515
+ "end_time": "2026-02-02T01:44:19.164113",
516
+ "exception": false,
517
+ "start_time": "2026-02-02T01:44:19.156707",
518
+ "status": "completed"
519
+ },
520
+ "tags": []
521
+ },
522
+ "source": [
523
+ "## 1a.a.5 Process Text Columns (Per-Event Embeddings)"
524
+ ]
525
+ },
526
+ {
527
+ "cell_type": "code",
528
+ "execution_count": null,
529
+ "id": "cell-13",
530
+ "metadata": {
531
+ "execution": {
532
+ "iopub.execute_input": "2026-02-02T01:44:19.177537Z",
533
+ "iopub.status.busy": "2026-02-02T01:44:19.177372Z",
534
+ "iopub.status.idle": "2026-02-02T01:44:19.180490Z",
535
+ "shell.execute_reply": "2026-02-02T01:44:19.179931Z"
536
+ },
537
+ "papermill": {
538
+ "duration": 0.011249,
539
+ "end_time": "2026-02-02T01:44:19.181334",
540
+ "exception": false,
541
+ "start_time": "2026-02-02T01:44:19.170085",
542
+ "status": "completed"
543
+ },
544
+ "tags": []
545
+ },
546
+ "outputs": [],
547
+ "source": [
548
+ "if text_columns and findings.is_time_series:\n",
549
+ " processor = TextColumnProcessor(text_config)\n",
550
+ " \n",
551
+ " print(\"Processing TEXT columns...\")\n",
552
+ " print(\"(This may take a moment for large datasets)\\n\")\n",
553
+ " \n",
554
+ " results = []\n",
555
+ " df_with_pcs = df.copy()\n",
556
+ " \n",
557
+ " for col_name in text_columns:\n",
558
+ " print(f\"\\n{'='*70}\")\n",
559
+ " print(f\"Processing: {col_name}\")\n",
560
+ " print(f\"{'='*70}\")\n",
561
+ " \n",
562
+ " df_with_pcs, result = processor.process_column(df_with_pcs, col_name)\n",
563
+ " results.append(result)\n",
564
+ " \n",
565
+ " print(f\"\\n\\u2705 Per-event processing complete:\")\n",
566
+ " print(f\" Components: {result.n_components}\")\n",
567
+ " print(f\" Explained variance: {result.explained_variance:.1%}\")\n",
568
+ " print(f\" Features: {', '.join(result.component_columns)}\")\n",
569
+ " \n",
570
+ " print(f\"\\n\\nDataFrame now has {len(df_with_pcs.columns)} columns (added {len(df_with_pcs.columns) - len(df.columns)} PC columns)\")"
571
+ ]
572
+ },
573
+ {
574
+ "cell_type": "markdown",
575
+ "id": "cell-14",
576
+ "metadata": {
577
+ "papermill": {
578
+ "duration": 0.006049,
579
+ "end_time": "2026-02-02T01:44:19.193144",
580
+ "exception": false,
581
+ "start_time": "2026-02-02T01:44:19.187095",
582
+ "status": "completed"
583
+ },
584
+ "tags": []
585
+ },
586
+ "source": [
587
+ "## 1a.a.6 Plan Time Window Aggregation\n",
588
+ "\n",
589
+ "PC features will be aggregated across time windows to create entity-level features.\n",
590
+ "\n",
591
+ "**Example output features:**\n",
592
+ "- `ticket_text_pc1_mean_7d` - Average of PC1 over last 7 days\n",
593
+ "- `ticket_text_pc2_std_30d` - Standard deviation of PC2 over last 30 days"
594
+ ]
595
+ },
596
+ {
597
+ "cell_type": "code",
598
+ "execution_count": null,
599
+ "id": "cell-15",
600
+ "metadata": {
601
+ "execution": {
602
+ "iopub.execute_input": "2026-02-02T01:44:19.206713Z",
603
+ "iopub.status.busy": "2026-02-02T01:44:19.206571Z",
604
+ "iopub.status.idle": "2026-02-02T01:44:19.209944Z",
605
+ "shell.execute_reply": "2026-02-02T01:44:19.209398Z"
606
+ },
607
+ "papermill": {
608
+ "duration": 0.011853,
609
+ "end_time": "2026-02-02T01:44:19.210725",
610
+ "exception": false,
611
+ "start_time": "2026-02-02T01:44:19.198872",
612
+ "status": "completed"
613
+ },
614
+ "tags": []
615
+ },
616
+ "outputs": [],
617
+ "source": [
618
+ "if text_columns and findings.is_time_series and results:\n",
619
+ " # Collect all PC columns\n",
620
+ " all_pc_columns = []\n",
621
+ " for result in results:\n",
622
+ " all_pc_columns.extend(result.component_columns)\n",
623
+ " \n",
624
+ " print(f\"\\n{'='*70}\")\n",
625
+ " print(\"AGGREGATION PLAN\")\n",
626
+ " print(f\"{'='*70}\")\n",
627
+ " \n",
628
+ " aggregator = TimeWindowAggregator(ENTITY_COLUMN, TIME_COLUMN)\n",
629
+ " plan = aggregator.generate_plan(\n",
630
+ " df_with_pcs,\n",
631
+ " windows=AGGREGATION_WINDOWS,\n",
632
+ " value_columns=all_pc_columns,\n",
633
+ " agg_funcs=AGGREGATION_FUNCS,\n",
634
+ " include_event_count=False,\n",
635
+ " include_recency=False\n",
636
+ " )\n",
637
+ " \n",
638
+ " print(f\"\\n\\U0001f4ca Plan Summary:\")\n",
639
+ " print(f\" Entity column: {plan.entity_column}\")\n",
640
+ " print(f\" Time column: {plan.time_column}\")\n",
641
+ " print(f\" Windows: {[w.name for w in plan.windows]}\")\n",
642
+ " print(f\" Value columns: {len(plan.value_columns)}\")\n",
643
+ " print(f\" Aggregation functions: {plan.agg_funcs}\")\n",
644
+ " print(f\" Total features to create: {len(plan.feature_columns)}\")\n",
645
+ " \n",
646
+ " print(f\"\\n\\U0001f4dd Sample feature names:\")\n",
647
+ " for feat in plan.feature_columns[:10]:\n",
648
+ " print(f\" - {feat}\")\n",
649
+ " if len(plan.feature_columns) > 10:\n",
650
+ " print(f\" ... and {len(plan.feature_columns) - 10} more\")"
651
+ ]
652
+ },
653
+ {
654
+ "cell_type": "markdown",
655
+ "id": "cell-16",
656
+ "metadata": {
657
+ "papermill": {
658
+ "duration": 0.006214,
659
+ "end_time": "2026-02-02T01:44:19.223400",
660
+ "exception": false,
661
+ "start_time": "2026-02-02T01:44:19.217186",
662
+ "status": "completed"
663
+ },
664
+ "tags": []
665
+ },
666
+ "source": [
667
+ "## 1a.a.7 Visualize PC Distributions"
668
+ ]
669
+ },
670
+ {
671
+ "cell_type": "code",
672
+ "execution_count": null,
673
+ "id": "cell-17",
674
+ "metadata": {
675
+ "execution": {
676
+ "iopub.execute_input": "2026-02-02T01:44:19.236409Z",
677
+ "iopub.status.busy": "2026-02-02T01:44:19.236281Z",
678
+ "iopub.status.idle": "2026-02-02T01:44:19.239879Z",
679
+ "shell.execute_reply": "2026-02-02T01:44:19.239304Z"
680
+ },
681
+ "papermill": {
682
+ "duration": 0.011111,
683
+ "end_time": "2026-02-02T01:44:19.240565",
684
+ "exception": false,
685
+ "start_time": "2026-02-02T01:44:19.229454",
686
+ "status": "completed"
687
+ },
688
+ "tags": []
689
+ },
690
+ "outputs": [],
691
+ "source": [
692
+ "if text_columns and results:\n",
693
+ " for result in results:\n",
694
+ " print(f\"\\n{'='*70}\")\n",
695
+ " print(f\"PC Feature Distributions: {result.column_name}\")\n",
696
+ " print(f\"{'='*70}\")\n",
697
+ " \n",
698
+ " # Distribution of PC1 and PC2\n",
699
+ " if len(result.component_columns) >= 2:\n",
700
+ " fig = make_subplots(rows=1, cols=2,\n",
701
+ " subplot_titles=(result.component_columns[0], result.component_columns[1]))\n",
702
+ " \n",
703
+ " fig.add_trace(go.Histogram(\n",
704
+ " x=df_with_pcs[result.component_columns[0]],\n",
705
+ " nbinsx=50, marker_color='steelblue', opacity=0.7\n",
706
+ " ), row=1, col=1)\n",
707
+ " \n",
708
+ " fig.add_trace(go.Histogram(\n",
709
+ " x=df_with_pcs[result.component_columns[1]],\n",
710
+ " nbinsx=50, marker_color='coral', opacity=0.7\n",
711
+ " ), row=1, col=2)\n",
712
+ " \n",
713
+ " fig.update_layout(\n",
714
+ " title=f\"PC Feature Distributions: {result.column_name}\",\n",
715
+ " height=350, template=\"plotly_white\", showlegend=False\n",
716
+ " )\n",
717
+ " display_figure(fig)\n",
718
+ " \n",
719
+ " # Scatter plot of PC1 vs PC2\n",
720
+ " if len(result.component_columns) >= 2:\n",
721
+ " fig = px.scatter(\n",
722
+ " df_with_pcs.sample(min(5000, len(df_with_pcs))),\n",
723
+ " x=result.component_columns[0],\n",
724
+ " y=result.component_columns[1],\n",
725
+ " title=f\"PC1 vs PC2 (sample): {result.column_name}\",\n",
726
+ " opacity=0.4\n",
727
+ " )\n",
728
+ " fig.update_layout(template=\"plotly_white\", height=400)\n",
729
+ " display_figure(fig)"
730
+ ]
731
+ },
732
+ {
733
+ "cell_type": "markdown",
734
+ "id": "cell-18",
735
+ "metadata": {
736
+ "papermill": {
737
+ "duration": 0.006033,
738
+ "end_time": "2026-02-02T01:44:19.252472",
739
+ "exception": false,
740
+ "start_time": "2026-02-02T01:44:19.246439",
741
+ "status": "completed"
742
+ },
743
+ "tags": []
744
+ },
745
+ "source": [
746
+ "## 1a.a.8 Update Findings"
747
+ ]
748
+ },
749
+ {
750
+ "cell_type": "code",
751
+ "execution_count": null,
752
+ "id": "cell-19",
753
+ "metadata": {
754
+ "execution": {
755
+ "iopub.execute_input": "2026-02-02T01:44:19.265553Z",
756
+ "iopub.status.busy": "2026-02-02T01:44:19.265406Z",
757
+ "iopub.status.idle": "2026-02-02T01:44:21.946608Z",
758
+ "shell.execute_reply": "2026-02-02T01:44:21.946004Z"
759
+ },
760
+ "papermill": {
761
+ "duration": 2.688722,
762
+ "end_time": "2026-02-02T01:44:21.947559",
763
+ "exception": false,
764
+ "start_time": "2026-02-02T01:44:19.258837",
765
+ "status": "completed"
766
+ },
767
+ "tags": []
768
+ },
769
+ "outputs": [],
770
+ "source": [
771
+ "if text_columns and results:\n",
772
+ " for result in results:\n",
773
+ " metadata = TextProcessingMetadata(\n",
774
+ " column_name=result.column_name,\n",
775
+ " embedding_model=text_config.embedding_model,\n",
776
+ " embedding_dim=result.embeddings_shape[1],\n",
777
+ " n_components=result.n_components,\n",
778
+ " explained_variance=result.explained_variance,\n",
779
+ " component_columns=result.component_columns,\n",
780
+ " variance_threshold_used=text_config.variance_threshold,\n",
781
+ " processing_approach=\"pca\"\n",
782
+ " )\n",
783
+ " findings.text_processing[result.column_name] = metadata\n",
784
+ " \n",
785
+ " print(f\"\\u2705 Added text processing metadata for {result.column_name}\")\n",
786
+ " \n",
787
+ " findings.save(FINDINGS_PATH)\n",
788
+ " print(f\"\\nFindings saved to: {FINDINGS_PATH}\")\n",
789
+ "\n",
790
+ "from customer_retention.analysis.notebook_html_exporter import export_notebook_html\n",
791
+ "export_notebook_html(Path(\"01a_a_temporal_text_deep_dive.ipynb\"), EXPERIMENTS_DIR / \"docs\")\n"
792
+ ]
793
+ },
794
+ {
795
+ "cell_type": "markdown",
796
+ "id": "cell-20",
797
+ "metadata": {
798
+ "papermill": {
799
+ "duration": 0.006498,
800
+ "end_time": "2026-02-02T01:44:21.960437",
801
+ "exception": false,
802
+ "start_time": "2026-02-02T01:44:21.953939",
803
+ "status": "completed"
804
+ },
805
+ "tags": []
806
+ },
807
+ "source": [
808
+ "## 1a.a.9 Production Recommendations"
809
+ ]
810
+ },
811
+ {
812
+ "cell_type": "code",
813
+ "execution_count": null,
814
+ "id": "cell-21",
815
+ "metadata": {
816
+ "execution": {
817
+ "iopub.execute_input": "2026-02-02T01:44:21.975562Z",
818
+ "iopub.status.busy": "2026-02-02T01:44:21.975429Z",
819
+ "iopub.status.idle": "2026-02-02T01:44:21.978458Z",
820
+ "shell.execute_reply": "2026-02-02T01:44:21.977893Z"
821
+ },
822
+ "papermill": {
823
+ "duration": 0.011763,
824
+ "end_time": "2026-02-02T01:44:21.979243",
825
+ "exception": false,
826
+ "start_time": "2026-02-02T01:44:21.967480",
827
+ "status": "completed"
828
+ },
829
+ "tags": []
830
+ },
831
+ "outputs": [],
832
+ "source": [
833
+ "if text_columns and results:\n",
834
+ " print(\"\\n\" + \"=\"*70)\n",
835
+ " print(\"PRODUCTION PIPELINE RECOMMENDATIONS\")\n",
836
+ " print(\"=\"*70)\n",
837
+ " \n",
838
+ " print(\"\\n\\U0001f527 Bronze Layer (per-event processing):\")\n",
839
+ " for result in results:\n",
840
+ " print(f\"\\n {result.column_name}:\")\n",
841
+ " print(f\" Action: embed_reduce\")\n",
842
+ " print(f\" Model: {text_config.embedding_model}\")\n",
843
+ " print(f\" Components: {result.n_components}\")\n",
844
+ " print(f\" Output: {', '.join(result.component_columns[:3])}...\")\n",
845
+ " \n",
846
+ " print(\"\\n\\U0001f527 Silver Layer (entity aggregation):\")\n",
847
+ " print(f\" Windows: {AGGREGATION_WINDOWS}\")\n",
848
+ " print(f\" Functions: {AGGREGATION_FUNCS}\")\n",
849
+ " print(f\" Example features:\")\n",
850
+ " for result in results[:1]:\n",
851
+ " pc1 = result.component_columns[0]\n",
852
+ " for window in AGGREGATION_WINDOWS[:2]:\n",
853
+ " for func in AGGREGATION_FUNCS[:2]:\n",
854
+ " print(f\" - {pc1}_{func}_{window}\")\n",
855
+ " \n",
856
+ " print(\"\\n\\U0001f4a1 The pipeline generator will create these transformations automatically.\")"
857
+ ]
858
+ },
859
+ {
860
+ "cell_type": "markdown",
861
+ "id": "cell-22",
862
+ "metadata": {
863
+ "papermill": {
864
+ "duration": 0.006461,
865
+ "end_time": "2026-02-02T01:44:21.994543",
866
+ "exception": false,
867
+ "start_time": "2026-02-02T01:44:21.988082",
868
+ "status": "completed"
869
+ },
870
+ "tags": []
871
+ },
872
+ "source": [
873
+ "---\n",
874
+ "\n",
875
+ "## Summary\n",
876
+ "\n",
877
+ "In this notebook, we:\n",
878
+ "\n",
879
+ "1. **Analyzed** TEXT columns in event-level data\n",
880
+ "2. **Generated per-event embeddings** using sentence-transformers\n",
881
+ "3. **Applied PCA** to reduce dimensions\n",
882
+ "4. **Planned aggregation** across time windows\n",
883
+ "5. **Updated findings** with processing metadata\n",
884
+ "\n",
885
+ "## Processing Flow\n",
886
+ "\n",
887
+ "```\n",
888
+ "Event TEXT → Embeddings (model-dependent dimension, e.g. 384 for MiniLM) → PCA (N components) → Aggregate by entity+window\n",
889
+ "```\n",
890
+ "\n",
891
+ "## Example Output Features\n",
892
+ "\n",
893
+ "For a `ticket_text` column with 3 principal components, 4 time windows, and 4 aggregation functions:\n",
894
+ "- `ticket_text_pc1_mean_7d`, `ticket_text_pc1_std_7d`, ...\n",
895
+ "- `ticket_text_pc2_mean_7d`, `ticket_text_pc2_std_7d`, ...\n",
896
+ "- Total: 3 PCs × 4 windows × 4 functions = 48 features\n",
897
+ "\n",
898
+ "---\n",
899
+ "\n",
900
+ "## Next Steps\n",
901
+ "\n",
902
+ "Continue with the **Event Bronze Track**:\n",
903
+ "\n",
904
+ "1. **01b_temporal_quality.ipynb** - Check for duplicate events, temporal gaps\n",
905
+ "2. **01c_temporal_patterns.ipynb** - Detect trends, seasonality\n",
906
+ "3. **01d_event_aggregation.ipynb** - Aggregate all features (including text PCs) to entity-level"
907
+ ]
908
+ },
909
+ {
910
+ "cell_type": "markdown",
911
+ "id": "48dncdcdryw",
912
+ "metadata": {
913
+ "papermill": {
914
+ "duration": 0.006469,
915
+ "end_time": "2026-02-02T01:44:22.007428",
916
+ "exception": false,
917
+ "start_time": "2026-02-02T01:44:22.000959",
918
+ "status": "completed"
919
+ },
920
+ "tags": []
921
+ },
922
+ "source": [
923
+ "> **Save Reminder:** Save this notebook (Ctrl+S / Cmd+S) before running the next one.\n",
924
+ "> The next notebook will automatically export this notebook's HTML documentation from the saved file."
925
+ ]
926
+ }
927
+ ],
928
+ "metadata": {
929
+ "kernelspec": {
930
+ "display_name": "Python 3",
931
+ "language": "python",
932
+ "name": "python3"
933
+ },
934
+ "language_info": {
935
+ "codemirror_mode": {
936
+ "name": "ipython",
937
+ "version": 3
938
+ },
939
+ "file_extension": ".py",
940
+ "mimetype": "text/x-python",
941
+ "name": "python",
942
+ "nbconvert_exporter": "python",
943
+ "pygments_lexer": "ipython3",
944
+ "version": "3.12.4"
945
+ },
946
+ "papermill": {
947
+ "default_parameters": {},
948
+ "duration": 8.956491,
949
+ "end_time": "2026-02-02T01:44:24.631716",
950
+ "environment_variables": {},
951
+ "exception": null,
952
+ "input_path": "/Users/Vital/python/CustomerRetention/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb",
953
+ "output_path": "/Users/Vital/python/CustomerRetention/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb",
954
+ "parameters": {},
955
+ "start_time": "2026-02-02T01:44:15.675225",
956
+ "version": "2.6.0"
957
+ }
958
+ },
959
+ "nbformat": 4,
960
+ "nbformat_minor": 5
961
+ }