churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,115 @@
1
+ from dataclasses import dataclass
2
+ from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
3
+
4
+ from customer_retention.core.compat import DataFrame
5
+
6
+ from .text_embedder import TextEmbedder
7
+ from .text_reducer import TextDimensionalityReducer
8
+
9
+ if TYPE_CHECKING:
10
+ from customer_retention.artifacts import FitArtifactRegistry
11
+
12
+
13
@dataclass
class TextProcessingConfig:
    """Tunable settings read by TextColumnProcessor."""

    # Model name handed to TextEmbedder when the embedder is first built.
    embedding_model: str = "all-MiniLM-L6-v2"
    # Forwarded to TextDimensionalityReducer as ``variance_threshold``.
    variance_threshold: float = 0.95
    # Forwarded to the reducer as ``max_components``.
    max_components: Optional[int] = None
    # Forwarded to the reducer as ``min_components``.
    min_components: int = 2
    # Passed as ``batch_size`` to TextEmbedder.embed_column.
    batch_size: int = 32
20
+
21
+
22
@dataclass
class TextColumnResult:
    """Summary returned next to the output frame for one processed text column."""

    column_name: str  # text column that was processed
    n_components: int  # component count reported by the reducer
    explained_variance: float  # the reducer's cumulative_variance
    component_columns: List[str]  # names of the component columns added to the frame
    embeddings_shape: Tuple[int, int]  # ``.shape`` of the embedding matrix before reduction
    sample_size: int  # ``len()`` of the input frame
30
+
31
+
32
class TextColumnProcessor:
    """Embed text columns and compress the embeddings into component columns.

    A single TextEmbedder is built lazily and shared by every column; each
    column gets its own TextDimensionalityReducer, cached in ``_reducers``.
    When a registry is supplied, the fitted PCA of a column is registered after
    fitting and looked up again when transforming without a cached reducer.
    """

    def __init__(self, config: Optional[TextProcessingConfig] = None,
                 registry: Optional["FitArtifactRegistry"] = None):
        """Store the configuration (defaults when omitted) and optional registry."""
        self.config = config or TextProcessingConfig()
        self.registry = registry
        self._embedder: Optional[TextEmbedder] = None
        self._reducers: Dict[str, TextDimensionalityReducer] = {}

    @property
    def embedder(self) -> TextEmbedder:
        """Return the shared embedder, creating it on first access."""
        if self._embedder is None:
            self._embedder = TextEmbedder(self.config.embedding_model)
        return self._embedder

    def process_column(self, df: DataFrame, column: str,
                       fit: bool = True) -> Tuple[DataFrame, TextColumnResult]:
        """Embed ``column``, reduce the embeddings and append the components.

        Args:
            df: Frame holding the text column; it is copied, not modified.
            column: Name of the text column to process.
            fit: When True a new reducer is fitted on this data (and registered
                if a registry is set).  When False a cached or registry-loaded
                reducer is applied instead.

        Returns:
            A copy of ``df`` with one extra column per component, and a
            TextColumnResult describing the reduction.

        Raises:
            ValueError: Raised by the reducer when ``fit`` is False and no
                fitted reducer could be found for ``column``.
        """
        embeddings = self.embedder.embed_column(df, column, batch_size=self.config.batch_size)
        reducer = self._get_or_create_reducer(column, fit)
        if fit:
            result = reducer.fit_transform(embeddings, column)
            self._register_reducer(column, reducer)
        else:
            result = reducer.transform(embeddings, column)
        output_df = self._add_components_to_df(df, result.components, result.component_names)
        return output_df, TextColumnResult(
            column_name=column,
            n_components=result.n_components,
            explained_variance=result.cumulative_variance,
            component_columns=result.component_names,
            embeddings_shape=embeddings.shape,
            sample_size=len(df),
        )

    def _register_reducer(self, column: str, reducer: TextDimensionalityReducer) -> None:
        """Hand the reducer's fitted PCA to the registry, if there is one."""
        if self.registry is None or reducer._pca is None:
            return
        # NOTE(review): _get_or_create_reducer looks the artifact up as
        # "<column>_reducer"; this assumes the registry derives that id from
        # artifact_type/target_column - confirm against FitArtifactRegistry.
        self.registry.register(
            artifact_type="reducer",
            target_column=column,
            transformer=reducer._pca,
        )

    def process_all_text_columns(self, df: DataFrame, text_columns: List[str],
                                 fit: bool = True) -> Tuple[DataFrame, List[TextColumnResult]]:
        """Run ``process_column`` over several columns, accumulating the output.

        Args:
            df: Input frame; it is copied, not modified.
            text_columns: Columns to process, in order.
            fit: Forwarded to ``process_column``.  The default (True) keeps the
                previous behaviour of fitting every column; pass False to
                apply already-fitted reducers, e.g. at scoring time.

        Returns:
            The frame with every column's components appended, and one
            TextColumnResult per processed column in the same order.
        """
        results: List[TextColumnResult] = []
        output_df = df.copy()
        for column in text_columns:
            output_df, result = self.process_column(output_df, column, fit=fit)
            results.append(result)
        return output_df, results

    def _new_reducer(self) -> TextDimensionalityReducer:
        """Build an unfitted reducer from the current configuration."""
        return TextDimensionalityReducer(
            variance_threshold=self.config.variance_threshold,
            max_components=self.config.max_components,
            min_components=self.config.min_components,
        )

    def _get_or_create_reducer(self, column: str, fit: bool) -> TextDimensionalityReducer:
        """Return the reducer to use for ``column`` and cache it.

        With ``fit`` True a fresh reducer always replaces any cached one.
        With ``fit`` False the cached reducer is reused; failing that, a PCA
        stored in the registry under "<column>_reducer" is wrapped in a new
        reducer; failing that, an unfitted reducer is returned, whose
        ``transform`` raises ValueError.
        """
        if not fit and column in self._reducers:
            return self._reducers[column]
        reducer = self._new_reducer()
        if not fit and self.registry is not None:
            artifact_id = f"{column}_reducer"
            if self.registry.has_artifact(artifact_id):
                # Re-attach the stored PCA so transform() works without refitting.
                reducer._pca = self.registry.load(artifact_id)
                reducer._fitted = True
        self._reducers[column] = reducer
        return reducer

    def _add_components_to_df(self, df: DataFrame, components, names: List[str]) -> DataFrame:
        """Return a copy of ``df`` with ``components[:, i]`` stored under ``names[i]``."""
        output_df = df.copy()
        for i, name in enumerate(names):
            output_df[name] = components[:, i]
        return output_df
@@ -0,0 +1,60 @@
1
+ from dataclasses import dataclass
2
+ from typing import List, Optional
3
+
4
+ import numpy as np
5
+
6
+
7
@dataclass
class ReductionResult:
    """Values produced by TextDimensionalityReducer.transform."""

    components: np.ndarray  # output of the fitted PCA's transform()
    n_components: int  # the PCA's n_components_
    explained_variance_ratio: np.ndarray  # the PCA's explained_variance_ratio_
    cumulative_variance: float  # sum of explained_variance_ratio
    component_names: List[str]  # "<prefix>_pc1", "<prefix>_pc2", ...
14
+
15
+
16
class TextDimensionalityReducer:
    """PCA wrapper that turns an embedding matrix into named components.

    The component count is the smallest number whose cumulative explained
    variance reaches ``variance_threshold``, raised to ``min_components``,
    capped by ``max_components`` when that is truthy, and finally limited to
    ``min(n_samples, n_features)``.
    """

    def __init__(self, variance_threshold: float = 0.95,
                 max_components: Optional[int] = None, min_components: int = 2):
        """Record the selection settings; no PCA exists until ``fit`` runs."""
        self.variance_threshold = variance_threshold
        self.max_components = max_components
        self.min_components = min_components
        self._pca = None
        self._fitted = False

    def fit(self, embeddings: np.ndarray) -> "TextDimensionalityReducer":
        """Fit a PCA with the computed component count and return ``self``."""
        from sklearn.decomposition import PCA

        self._pca = PCA(n_components=self._compute_n_components(embeddings))
        self._pca.fit(embeddings)
        self._fitted = True
        return self

    def transform(self, embeddings: np.ndarray, column_prefix: str) -> ReductionResult:
        """Project ``embeddings`` with the fitted PCA.

        Component names are ``"<column_prefix>_pc<k>"`` with ``k`` starting at 1.

        Raises:
            ValueError: If the reducer has not been fitted.
        """
        if not self._fitted:
            raise ValueError("Must call fit() before transform()")
        pca = self._pca
        projected = pca.transform(embeddings)
        kept = pca.n_components_
        ratios = pca.explained_variance_ratio_
        return ReductionResult(
            components=projected,
            n_components=kept,
            explained_variance_ratio=ratios,
            cumulative_variance=float(np.sum(ratios)),
            component_names=[f"{column_prefix}_pc{rank}" for rank in range(1, kept + 1)],
        )

    def fit_transform(self, embeddings: np.ndarray, column_prefix: str) -> ReductionResult:
        """Fit on ``embeddings`` and return their projection."""
        return self.fit(embeddings).transform(embeddings, column_prefix)

    def _compute_n_components(self, embeddings: np.ndarray) -> int:
        """Pick the component count from a full-rank probe PCA."""
        from sklearn.decomposition import PCA

        n_rows, n_dims = embeddings.shape
        rank_limit = min(n_rows, n_dims)
        probe = PCA(n_components=rank_limit)
        probe.fit(embeddings)
        running_total = np.cumsum(probe.explained_variance_ratio_)
        # searchsorted yields the index of the first running total that is
        # >= the threshold; adding 1 converts that index into a count.
        wanted = int(np.searchsorted(running_total, self.variance_threshold) + 1)
        wanted = max(wanted, self.min_components)
        if self.max_components:
            wanted = min(wanted, self.max_components)
        return min(wanted, rank_limit)
@@ -0,0 +1,303 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional
3
+
4
+ import numpy as np
5
+
6
+ from customer_retention.core.compat import DataFrame, pd
7
+
8
+
9
@dataclass
class DistributionStats:
    """Summary statistics describing a numeric distribution."""

    # Extremes.
    min: float
    max: float
    # Central tendency.
    mean: float
    median: float
    # Spread.
    std: float
    # Lower / upper quartiles; optional and default to None.
    q25: Optional[float] = None
    q75: Optional[float] = None
18
+
19
+
20
@dataclass
class LifecycleQuadrantResult:
    """Output of ``classify_lifecycle_quadrants``."""

    # Copy of the input frame with "intensity" and "lifecycle_quadrant" columns added.
    lifecycles: DataFrame
    # Median of "duration_days", used as the long/short tenure cut-off.
    tenure_threshold: float
    # Median of "intensity", used as the high/low intensity cut-off.
    intensity_threshold: float
    # One row per observed quadrant: counts, share and guidance text.
    recommendations: DataFrame
26
+
27
+
28
+ _QUADRANT_RECOMMENDATIONS = {
29
+ "Steady & Loyal": {
30
+ "Windows": "All available windows",
31
+ "Feature Strategy": "Trend/seasonality features, engagement decay",
32
+ "Risk": "Low churn risk; monitor for engagement decline",
33
+ },
34
+ "Occasional & Loyal": {
35
+ "Windows": "Wider windows (capture sparse events)",
36
+ "Feature Strategy": "Long-window aggregations, recency gap",
37
+ "Risk": "May churn silently; long gaps are normal",
38
+ },
39
+ "Intense & Brief": {
40
+ "Windows": "Narrower windows (capture recency)",
41
+ "Feature Strategy": "Recency features, burst detection",
42
+ "Risk": "High churn risk; may be early churners",
43
+ },
44
+ "One-shot": {
45
+ "Windows": "N/A (insufficient history)",
46
+ "Feature Strategy": "Cold-start fallback, population-level stats",
47
+ "Risk": "Cannot build temporal features; consider separate handling",
48
+ },
49
+ }
50
+
51
+
52
+ def _assign_lifecycle_quadrant(duration_days: np.ndarray, intensity: np.ndarray,
53
+ tenure_threshold: float, intensity_threshold: float) -> np.ndarray:
54
+ long = duration_days >= tenure_threshold
55
+ high = intensity >= intensity_threshold
56
+ result = np.where(long & high, "Steady & Loyal",
57
+ np.where(long, "Occasional & Loyal",
58
+ np.where(high, "Intense & Brief", "One-shot")))
59
+ return result
60
+
61
+
62
def classify_lifecycle_quadrants(entity_lifecycles: DataFrame) -> LifecycleQuadrantResult:
    """Split entities into tenure/intensity quadrants around the medians.

    Works on a copy of ``entity_lifecycles``, which must provide
    "duration_days" and "event_count" columns. Intensity is events per day,
    with durations below one day treated as one day. Both cut-offs are the
    medians of the respective columns.

    Returns:
        LifecycleQuadrantResult holding the annotated copy, both thresholds and
        a recommendations table with one row per quadrant that occurs.
    """
    frame = entity_lifecycles.copy()

    tenure_cut = float(frame["duration_days"].median())
    # Clip so zero-length lifecycles do not divide by zero.
    frame["intensity"] = frame["event_count"] / frame["duration_days"].clip(lower=1)
    intensity_cut = float(frame["intensity"].median())

    frame["lifecycle_quadrant"] = _assign_lifecycle_quadrant(
        frame["duration_days"].values,
        frame["intensity"].values,
        tenure_cut,
        intensity_cut,
    )

    n_entities = len(frame)
    quadrant_sizes = frame["lifecycle_quadrant"].value_counts()
    summary_rows = [
        {
            "Quadrant": label,
            "Entities": size,
            "Share": f"{size / n_entities * 100:.1f}%",
            "Windows": _QUADRANT_RECOMMENDATIONS[label]["Windows"],
            "Feature Strategy": _QUADRANT_RECOMMENDATIONS[label]["Feature Strategy"],
            "Risk": _QUADRANT_RECOMMENDATIONS[label]["Risk"],
        }
        for label, size in quadrant_sizes.items()
    ]

    return LifecycleQuadrantResult(
        lifecycles=frame,
        tenure_threshold=tenure_cut,
        intensity_threshold=intensity_cut,
        recommendations=pd.DataFrame(summary_rows),
    )
94
+
95
+
96
@dataclass
class ActivitySegmentResult:
    """Output of ``classify_activity_segments``."""

    # Copy of the input frame with an "activity_segment" column added.
    lifecycles: DataFrame
    # 25th percentile of "event_count" (Low / Medium boundary).
    q25_threshold: float
    # 75th percentile of "event_count" (Medium / High boundary).
    q75_threshold: float
    # One row per observed segment: counts, share, average events and guidance text.
    recommendations: DataFrame
102
+
103
+
104
+ _SEGMENT_RECOMMENDATIONS = {
105
+ "One-time": {
106
+ "Feature Approach": "No temporal features possible; use event-level attributes only",
107
+ "Modeling Implication": "Cold-start problem; consider population-level fallback or separate model",
108
+ },
109
+ "Low Activity": {
110
+ "Feature Approach": "Wider windows with count/recency; sparse aggregations",
111
+ "Modeling Implication": "Features will be noisy; log-transform counts, handle many zeros",
112
+ },
113
+ "Medium Activity": {
114
+ "Feature Approach": "Standard windows; mean/std aggregations reliable",
115
+ "Modeling Implication": "Core modeling population; most features well-populated",
116
+ },
117
+ "High Activity": {
118
+ "Feature Approach": "All windows including narrower; trends and velocity meaningful",
119
+ "Modeling Implication": "Rich feature space; watch for dominance in training set",
120
+ },
121
+ }
122
+
123
+
124
+ def _assign_activity_segment(event_count: np.ndarray, q25: float, q75: float) -> np.ndarray:
125
+ return np.where(event_count <= 1, "One-time",
126
+ np.where(event_count <= q25, "Low Activity",
127
+ np.where(event_count <= q75, "Medium Activity", "High Activity")))
128
+
129
+
130
def classify_activity_segments(entity_lifecycles: DataFrame) -> ActivitySegmentResult:
    """Bucket entities by event count using the 25th and 75th percentiles.

    Works on a copy of ``entity_lifecycles``, which must provide an
    "event_count" column.

    Returns:
        ActivitySegmentResult holding the annotated copy, both quantile
        thresholds and a recommendations table with one row per segment that
        occurs.
    """
    frame = entity_lifecycles.copy()
    event_counts = frame["event_count"]
    lower_cut = float(event_counts.quantile(0.25))
    upper_cut = float(event_counts.quantile(0.75))

    frame["activity_segment"] = _assign_activity_segment(
        frame["event_count"].values, lower_cut, upper_cut
    )

    n_entities = len(frame)
    summary_rows = []
    for label, size in frame["activity_segment"].value_counts().items():
        members = frame[frame["activity_segment"] == label]
        guidance = _SEGMENT_RECOMMENDATIONS[label]
        summary_rows.append({
            "Segment": label,
            "Entities": size,
            "Share": f"{size / n_entities * 100:.1f}%",
            "Avg Events": f"{members['event_count'].mean():.1f}",
            "Feature Approach": guidance["Feature Approach"],
            "Modeling Implication": guidance["Modeling Implication"],
        })

    return ActivitySegmentResult(
        lifecycles=frame,
        q25_threshold=lower_cut,
        q75_threshold=upper_cut,
        recommendations=pd.DataFrame(summary_rows),
    )
159
+
160
+
161
@dataclass
class EntityLifecycle:
    """Lifecycle summary for a single entity.

    Field names match the columns of the lifecycles frame built by
    ``TimeSeriesProfiler``.
    """

    # Entity identifier.
    entity: str
    # Earliest and latest event timestamps.
    first_event: pd.Timestamp
    last_event: pd.Timestamp
    # Whole days between first_event and last_event.
    duration_days: int
    # Number of events recorded for the entity.
    event_count: int
168
+
169
+
170
@dataclass
class TimeSeriesProfile:
    """Dataset-level summary produced by ``TimeSeriesProfiler.profile``."""

    # Names of the columns that were profiled.
    entity_column: str
    time_column: str
    # Row count and number of distinct entities.
    total_events: int
    unique_entities: int
    # Whole days between the earliest and latest timestamp.
    time_span_days: int
    # Distribution of per-entity event counts.
    events_per_entity: DistributionStats
    # One row per entity: first/last event, event count and duration.
    entity_lifecycles: DataFrame
    # Mean gap in days between consecutive events of one entity; None when
    # no entity has two or more events.
    avg_inter_event_days: Optional[float] = None
    # Earliest and latest timestamps; left as None for an empty profile.
    first_event_date: Optional[pd.Timestamp] = None
    last_event_date: Optional[pd.Timestamp] = None
182
+
183
+
184
class TimeSeriesProfiler:
    """Profile an event log identified by an entity column and a time column.

    ``profile`` returns a ``TimeSeriesProfile`` with dataset totals, the
    distribution of events per entity, a per-entity lifecycle table and the
    mean gap between consecutive events of the same entity.
    """

    SECONDS_PER_DAY = 86400

    def __init__(self, entity_column: str, time_column: str):
        self.entity_column = entity_column
        self.time_column = time_column

    def profile(self, df: DataFrame) -> "TimeSeriesProfile":
        """Build the profile for ``df``.

        An empty frame yields an empty profile without any column validation.

        Raises:
            KeyError: if a non-empty ``df`` lacks the entity or time column.
        """
        if len(df) == 0:
            return self._empty_profile()

        self._validate_columns(df)
        df = self._prepare_dataframe(df)

        total_events = len(df)
        unique_entities = df[self.entity_column].nunique()

        lifecycles = self._compute_entity_lifecycles(df)
        events_per_entity = self._compute_events_distribution(lifecycles)
        time_span = self._compute_time_span(df)
        avg_inter_event = self._compute_avg_inter_event_time(df)

        return TimeSeriesProfile(
            entity_column=self.entity_column,
            time_column=self.time_column,
            total_events=total_events,
            unique_entities=unique_entities,
            time_span_days=time_span,
            events_per_entity=events_per_entity,
            entity_lifecycles=lifecycles,
            avg_inter_event_days=avg_inter_event,
            first_event_date=df[self.time_column].min(),
            last_event_date=df[self.time_column].max(),
        )

    def _validate_columns(self, df: DataFrame) -> None:
        """Raise ``KeyError`` when a configured column is missing from ``df``."""
        if self.entity_column not in df.columns:
            raise KeyError(f"Entity column '{self.entity_column}' not found")
        if self.time_column not in df.columns:
            raise KeyError(f"Time column '{self.time_column}' not found")

    def _prepare_dataframe(self, df: DataFrame) -> DataFrame:
        """Return a copy of ``df`` whose time column has a datetime dtype."""
        df = df.copy()
        if not pd.api.types.is_datetime64_any_dtype(df[self.time_column]):
            df[self.time_column] = pd.to_datetime(df[self.time_column])
        return df

    def _compute_entity_lifecycles(self, df: DataFrame) -> DataFrame:
        """Return one row per entity with first/last event, count and duration.

        Columns, in order: "entity", "first_event", "last_event",
        "event_count", "duration_days".
        """
        grouped = df.groupby(self.entity_column)[self.time_column]
        first_seen = grouped.min()
        last_seen = grouped.max()

        # The entity labels are read from the min() result, so no extra
        # aggregation pass is needed just to obtain the group index.
        # `.values` drops the group index so the new frame gets a fresh
        # positional index.
        # NOTE(review): for tz-aware timestamps `.values` yields UTC-naive
        # datetime64 values - confirm callers expect that.
        lifecycles = pd.DataFrame({
            "entity": first_seen.index.tolist(),
            "first_event": first_seen.values,
            "last_event": last_seen.values,
            "event_count": grouped.count().values,
        })

        lifecycles["duration_days"] = (
            (lifecycles["last_event"] - lifecycles["first_event"]).dt.days
        )

        return lifecycles

    def _compute_events_distribution(self, lifecycles: DataFrame) -> "DistributionStats":
        """Summarise the "event_count" column of ``lifecycles``."""
        counts = lifecycles["event_count"]

        if len(counts) == 0:
            return DistributionStats(
                min=0, max=0, mean=0, median=0, std=0, q25=0, q75=0
            )

        return DistributionStats(
            min=float(counts.min()),
            max=float(counts.max()),
            mean=float(counts.mean()),
            median=float(counts.median()),
            # Sample std of a single value is NaN; report 0.0 instead.
            std=float(counts.std()) if len(counts) > 1 else 0.0,
            q25=float(counts.quantile(0.25)),
            q75=float(counts.quantile(0.75)),
        )

    def _compute_time_span(self, df: DataFrame) -> int:
        """Return whole days between the earliest and latest timestamp."""
        if len(df) == 0:
            return 0
        min_date = df[self.time_column].min()
        max_date = df[self.time_column].max()
        return (max_date - min_date).days

    def _compute_avg_inter_event_time(self, df: DataFrame) -> Optional[float]:
        """Return the mean gap in days between consecutive events of an entity.

        Gaps are pooled across all entities before averaging. Returns ``None``
        when ``df`` has fewer than two rows or no entity has two or more
        timestamped events.
        """
        if len(df) < 2:
            return None

        # One global sort by time keeps every entity's rows in chronological
        # order, so a grouped diff gives consecutive gaps without a Python
        # loop over entities.
        ordered = df.sort_values(self.time_column)
        gaps = ordered.groupby(self.entity_column)[self.time_column].diff().dropna()

        if gaps.empty:
            return None

        gap_days = gaps.dt.total_seconds() / self.SECONDS_PER_DAY
        return float(gap_days.mean())

    def _empty_profile(self) -> "TimeSeriesProfile":
        """Return the all-zero profile used for an empty input frame."""
        return TimeSeriesProfile(
            entity_column=self.entity_column,
            time_column=self.time_column,
            total_events=0,
            unique_entities=0,
            time_span_days=0,
            events_per_entity=DistributionStats(
                min=0, max=0, mean=0, median=0, std=0, q25=0, q75=0
            ),
            entity_lifecycles=pd.DataFrame(columns=[
                "entity", "first_event", "last_event", "duration_days", "event_count"
            ]),
            avg_inter_event_days=None,
        )