churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,505 @@
1
+ """
2
+ Customer segmentation module for feature engineering.
3
+
4
+ This module provides functions for creating customer segments based on
5
+ value, engagement, recency, and other behavioral patterns.
6
+ """
7
+
8
+ from dataclasses import dataclass, field
9
+ from enum import Enum
10
+ from typing import Any, Dict, List, Optional
11
+
12
+ from customer_retention.core.compat import DataFrame, pd
13
+
14
+
15
class SegmentationType(Enum):
    """Segmentation schemes a customer segment can belong to."""

    # Two-axis split on a value column and a frequency column.
    VALUE_FREQUENCY = "value_frequency"
    # Buckets keyed on days since last activity.
    RECENCY = "recency"
    # Buckets keyed on an engagement score.
    ENGAGEMENT = "engagement"
    # NOTE(review): no producer for the two schemes below is visible in this
    # part of the file - presumably implemented further down; confirm.
    LIFECYCLE = "lifecycle"
    RFM = "rfm"
22
+
23
+
24
@dataclass
class SegmentDefinition:
    """One named customer segment together with its size."""

    # Label written into the segment column (e.g. "High_Value_Frequent").
    name: str
    # Segmentation scheme that produced this segment.
    segment_type: SegmentationType
    # Human-readable explanation of who belongs to the segment.
    description: str
    # Membership rule(s) as display strings; stays empty when none are recorded.
    criteria: Dict[str, Any] = field(default_factory=dict)
    # Number of rows assigned to this segment.
    count: int = 0
    # Share of all segmented rows, on a 0-100 scale.
    percentage: float = 0.0
33
+
34
+
35
@dataclass
class SegmentationResult:
    """Outcome of one segmentation run over a DataFrame."""

    # Name of the column that holds the assigned segment labels.
    segment_column: str
    # Scheme used for this run.
    segment_type: SegmentationType
    # Number of rows that were segmented.
    total_customers: int
    # Per-segment definitions with counts and percentages.
    segments: List[SegmentDefinition] = field(default_factory=list)
    # Raw label -> row-count mapping taken from the segment column.
    segment_distribution: Dict[str, int] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """
        Build a plain-dict summary suitable for display.

        Returns
        -------
        Dict[str, Any]
            Column name, scheme value, row total, the label distribution
            and one compact entry per segment (percentage rounded to two
            decimals). Segment criteria are not included.
        """
        segment_rows = []
        for segment in self.segments:
            segment_rows.append(
                {
                    "name": segment.name,
                    "description": segment.description,
                    "count": segment.count,
                    "percentage": round(segment.percentage, 2),
                }
            )

        return {
            "segment_column": self.segment_column,
            "segment_type": self.segment_type.value,
            "total_customers": self.total_customers,
            "segment_distribution": self.segment_distribution,
            "segments": segment_rows,
        }
61
+
62
+
63
+ class CustomerSegmenter:
64
+ """
65
+ Creates customer segments based on various behavioral patterns.
66
+
67
+ Provides methods for value-based, recency-based, engagement-based,
68
+ and RFM segmentation.
69
+ """
70
+
71
def segment_by_value_frequency(
    self,
    df: DataFrame,
    value_column: str,
    frequency_column: str,
    value_threshold: Optional[float] = None,
    frequency_threshold: Optional[float] = None,
    output_column: str = "customer_segment"
) -> tuple[DataFrame, SegmentationResult]:
    """
    Segment customers by value and purchase frequency.

    Each row lands in one of four quadrants:
    - High_Value_Frequent: value >= threshold and frequency >= threshold
    - High_Value_Infrequent: value >= threshold and frequency < threshold
    - Low_Value_Frequent: value < threshold and frequency >= threshold
    - Low_Value_Infrequent: value < threshold and frequency < threshold

    A missing (NaN) value or frequency compares as "not high", so such rows
    fall on the Low_Value / Infrequent side of the respective axis.

    Parameters
    ----------
    df : DataFrame
        Data to segment. It is copied; the input is not modified.
    value_column : str
        Column representing customer value (e.g., total revenue)
    frequency_column : str
        Column representing purchase frequency
    value_threshold : float, optional
        Inclusive lower bound for "high value". Default: column median
    frequency_threshold : float, optional
        Inclusive lower bound for "high frequency". Default: column median
    output_column : str
        Name of the output segment column

    Returns
    -------
    tuple[DataFrame, SegmentationResult]
        Copy of ``df`` with the segment column added, and the
        segmentation summary. An empty ``df`` yields an empty segment
        column and zero counts/percentages.
    """
    df_result = df.copy()

    # Fall back to the column medians when no explicit cut-offs are given.
    if value_threshold is None:
        value_threshold = df[value_column].median()
    if frequency_threshold is None:
        frequency_threshold = df[frequency_column].median()

    # Column-wise comparison instead of a row-wise apply: a row-wise apply
    # on an empty frame hands back a DataFrame rather than a Series, which
    # cannot be assigned to a single column, and it is also much slower.
    high_value = df_result[value_column] >= value_threshold
    high_freq = df_result[frequency_column] >= frequency_threshold

    # Encode the two flags as a 2-bit quadrant code (value bit, frequency bit).
    quadrant_labels = {
        3: "High_Value_Frequent",
        2: "High_Value_Infrequent",
        1: "Low_Value_Frequent",
        0: "Low_Value_Infrequent",
    }
    quadrant_code = high_value.astype(int) * 2 + high_freq.astype(int)
    df_result[output_column] = quadrant_code.map(quadrant_labels)

    # Build result
    distribution = df_result[output_column].value_counts().to_dict()
    total = len(df_result)

    value_high_rule = f">= {value_threshold:.2f}"
    value_low_rule = f"< {value_threshold:.2f}"
    freq_high_rule = f">= {frequency_threshold:.2f}"
    freq_low_rule = f"< {frequency_threshold:.2f}"

    # (segment name, description, value rule, frequency rule)
    segment_specs = [
        (
            "High_Value_Frequent",
            "Best customers - high value and frequent purchases",
            value_high_rule,
            freq_high_rule,
        ),
        (
            "High_Value_Infrequent",
            "Potential for increased frequency - high value but low frequency",
            value_high_rule,
            freq_low_rule,
        ),
        (
            "Low_Value_Frequent",
            "Potential for upselling - frequent but low value",
            value_low_rule,
            freq_high_rule,
        ),
        (
            "Low_Value_Infrequent",
            "Needs activation - low value and low frequency",
            value_low_rule,
            freq_low_rule,
        ),
    ]

    segments = []
    for name, description, value_rule, frequency_rule in segment_specs:
        count = distribution.get(name, 0)
        segments.append(
            SegmentDefinition(
                name=name,
                segment_type=SegmentationType.VALUE_FREQUENCY,
                description=description,
                criteria={"value": value_rule, "frequency": frequency_rule},
                count=count,
                # Guard against division by zero for an empty frame.
                percentage=(count / total * 100) if total > 0 else 0
            )
        )

    result = SegmentationResult(
        segment_column=output_column,
        segment_type=SegmentationType.VALUE_FREQUENCY,
        total_customers=total,
        segments=segments,
        segment_distribution=distribution
    )

    return df_result, result
180
+
181
def segment_by_recency(
    self,
    df: DataFrame,
    days_since_column: str,
    thresholds: Optional[Dict[str, int]] = None,
    output_column: str = "recency_segment"
) -> tuple[DataFrame, SegmentationResult]:
    """
    Segment customers by recency (days since last activity).

    With the default thresholds the labels are:
    - Active_30d: at most 30 days
    - Recent_90d: more than 30 and at most 90 days
    - Lapsing_180d: more than 90 and at most 180 days
    - Dormant_180d+: more than 180 days
    - Unknown: the day count is missing

    Day counts are truncated with ``int()`` before being compared, and the
    numbers in the labels follow the thresholds actually used.

    Parameters
    ----------
    df : DataFrame
        Data to segment. It is copied; the input is not modified.
    days_since_column : str
        Column with days since last activity
    thresholds : Dict[str, int], optional
        Custom upper bounds under the keys "active", "recent" and
        "lapsing"; any missing key falls back to 30 / 90 / 180.
    output_column : str
        Name of the output segment column

    Returns
    -------
    tuple[DataFrame, SegmentationResult]
        Copy of ``df`` with the segment column added, and the
        segmentation summary. "Unknown" rows appear in the distribution
        but get no entry in ``segments``.
    """
    df_result = df.copy()

    limits = {"active": 30, "recent": 90, "lapsing": 180} if thresholds is None else thresholds
    active_days = limits.get("active", 30)
    recent_days = limits.get("recent", 90)
    lapsing_days = limits.get("lapsing", 180)

    active_label = f"Active_{active_days}d"
    recent_label = f"Recent_{recent_days}d"
    lapsing_label = f"Lapsing_{lapsing_days}d"
    dormant_label = f"Dormant_{lapsing_days}d+"

    # Ordered (inclusive upper bound, label) pairs; the first match wins.
    ladder = (
        (active_days, active_label),
        (recent_days, recent_label),
        (lapsing_days, lapsing_label),
    )

    def bucket_for(raw_days):
        if pd.isna(raw_days):
            return "Unknown"
        whole_days = int(raw_days)
        for ceiling, label in ladder:
            if whole_days <= ceiling:
                return label
        # Beyond the last bound.
        return dormant_label

    df_result[output_column] = df_result[days_since_column].apply(bucket_for)

    # Summarise the assignment.
    distribution = df_result[output_column].value_counts().to_dict()
    total = len(df_result)

    def describe(label, blurb):
        members = distribution.get(label, 0)
        return SegmentDefinition(
            name=label,
            segment_type=SegmentationType.RECENCY,
            description=blurb,
            count=members,
            percentage=(members / total * 100) if total > 0 else 0
        )

    segments = [
        describe(active_label, "Recently active customers"),
        describe(recent_label, "Customers with recent activity"),
        describe(lapsing_label, "Customers at risk of churning"),
        describe(dormant_label, "Inactive customers needing re-engagement"),
    ]

    result = SegmentationResult(
        segment_column=output_column,
        segment_type=SegmentationType.RECENCY,
        total_customers=total,
        segments=segments,
        segment_distribution=distribution
    )

    return df_result, result
270
+
271
def segment_by_engagement(
    self,
    df: DataFrame,
    engagement_column: str,
    low_threshold: float = 0.3,
    high_threshold: float = 0.7,
    output_column: str = "engagement_segment"
) -> tuple[DataFrame, SegmentationResult]:
    """
    Label every row as high, medium or low engagement.

    Parameters
    ----------
    df : DataFrame
        Data to segment. It is copied; the caller's frame is not modified.
    engagement_column : str
        Column holding the engagement score that is compared against
        the two thresholds.
    low_threshold : float
        Scores below this value are labelled ``Low_Engagement``.
    high_threshold : float
        Scores at or above this value are labelled ``High_Engagement``.
    output_column : str
        Name of the column that receives the segment label.

    Returns
    -------
    tuple[DataFrame, SegmentationResult]
        The copied frame with the label column added, and a summary.
        Rows with a missing score are labelled ``Unknown``; they appear in
        ``segment_distribution`` and ``total_customers`` but get no
        ``SegmentDefinition`` of their own.
    """
    segmented = df.copy()

    def _label_for(score):
        # Missing scores must not silently fall into the "low" bucket.
        if pd.isna(score):
            return "Unknown"
        # The high threshold is checked first, so it wins if the two
        # thresholds overlap.
        if score >= high_threshold:
            return "High_Engagement"
        if score >= low_threshold:
            return "Medium_Engagement"
        return "Low_Engagement"

    segmented[output_column] = segmented[engagement_column].apply(_label_for)

    # Summarise how many rows landed in each label.
    counts = segmented[output_column].value_counts().to_dict()
    n_rows = len(segmented)

    def _share(label):
        # Percentage of all rows carrying this label; 0 for an empty frame.
        hits = counts.get(label, 0)
        if n_rows > 0:
            return hits / n_rows * 100
        return 0

    # (label, human-readable description, criteria text) per reported segment.
    blueprint = [
        (
            "High_Engagement",
            f"Highly engaged customers (score >= {high_threshold})",
            f">= {high_threshold}",
        ),
        (
            "Medium_Engagement",
            f"Moderately engaged customers ({low_threshold} <= score < {high_threshold})",
            f"{low_threshold} - {high_threshold}",
        ),
        (
            "Low_Engagement",
            f"Low engagement customers (score < {low_threshold})",
            f"< {low_threshold}",
        ),
    ]

    segment_defs = [
        SegmentDefinition(
            name=label,
            segment_type=SegmentationType.ENGAGEMENT,
            description=text,
            criteria={"score": rule},
            count=counts.get(label, 0),
            percentage=_share(label)
        )
        for label, text, rule in blueprint
    ]

    summary = SegmentationResult(
        segment_column=output_column,
        segment_type=SegmentationType.ENGAGEMENT,
        total_customers=n_rows,
        segments=segment_defs,
        segment_distribution=counts
    )

    return segmented, summary
354
+
355
def create_engagement_score(
    self,
    df: DataFrame,
    open_rate_column: str,
    click_rate_column: str,
    open_weight: float = 0.6,
    click_weight: float = 0.4,
    output_column: str = "engagement_score"
) -> DataFrame:
    """
    Combine email open and click rates into one weighted score.

    Parameters
    ----------
    df : DataFrame
        Data to process. It is copied; the caller's frame is not modified.
    open_rate_column : str
        Column with email open rate. If its maximum exceeds 1 the column
        is treated as a 0-100 percentage and divided by 100.
    click_rate_column : str
        Column with email click rate. The same scale check is applied
        independently of the open-rate column.
    open_weight : float
        Weight for open rate (default: 0.6)
    click_weight : float
        Weight for click rate (default: 0.4)
    output_column : str
        Name of the output column

    Returns
    -------
    DataFrame
        Copy of ``df`` with the weighted score stored in ``output_column``.
    """
    scored = df.copy()

    # Fetch both inputs before touching either one.
    raw_open = scored[open_rate_column]
    raw_click = scored[click_rate_column]

    def _as_fraction(rates):
        # A maximum above 1 means the column is expressed as a percentage.
        return rates / 100 if rates.max() > 1 else rates

    open_fraction = _as_fraction(raw_open)
    click_fraction = _as_fraction(raw_click)

    scored[output_column] = (open_weight * open_fraction + click_weight * click_fraction)

    return scored
401
+
402
def create_tenure_features(
    self,
    df: DataFrame,
    created_column: str,
    reference_date: Optional[Any] = None,
    output_prefix: str = ""
) -> DataFrame:
    """
    Derive tenure in days, tenure in months and a tenure bucket.

    Parameters
    ----------
    df : DataFrame
        Data to process. It is copied; the caller's frame is not modified.
    created_column : str
        Column with account creation date. If it is not already a datetime
        column it is parsed in the returned copy, and values that cannot
        be parsed become missing.
    reference_date : datetime-like, optional
        Date that tenure is measured up to. Default: the latest value in
        ``created_column``.
    output_prefix : str
        When non-empty, ``"<prefix>_"`` is prepended to each output column
        name.

    Returns
    -------
    DataFrame
        Copy of ``df`` with ``tenure_days``, ``tenure_months`` (days
        divided by 30.44) and ``tenure_bucket`` added. Missing or negative
        tenures are bucketed as ``"Unknown"``.
    """
    enriched = df.copy()

    # Parse the creation column only when it is not already datetime-typed.
    needs_parsing = not pd.api.types.is_datetime64_any_dtype(enriched[created_column])
    if needs_parsing:
        enriched[created_column] = pd.to_datetime(enriched[created_column], errors='coerce', format='mixed')

    created = enriched[created_column]
    anchor = created.max() if reference_date is None else pd.to_datetime(reference_date)

    prefix = f"{output_prefix}_" if output_prefix else ""
    days_col = f"{prefix}tenure_days"
    months_col = f"{prefix}tenure_months"
    bucket_col = f"{prefix}tenure_bucket"

    enriched[days_col] = (anchor - created).dt.days
    enriched[months_col] = enriched[days_col] / 30.44

    # Inclusive upper bound in days paired with its label, in ascending order.
    bucket_limits = (
        (90, "New_0_3m"),
        (180, "Growing_3_6m"),
        (365, "Established_6_12m"),
    )

    def _bucket_for(days):
        # Creation dates after the reference date give negative tenure.
        if pd.isna(days) or days < 0:
            return "Unknown"
        for upper_bound, label in bucket_limits:
            if days <= upper_bound:
                return label
        return "Mature_12m+"

    enriched[bucket_col] = enriched[days_col].apply(_bucket_for)

    return enriched
464
+
465
def create_recency_features(
    self,
    df: DataFrame,
    last_activity_column: str,
    reference_date: Optional[Any] = None,
    output_column: str = "days_since_last_activity"
) -> DataFrame:
    """
    Compute the number of days between each last-activity date and a reference date.

    Parameters
    ----------
    df : DataFrame
        Data to process. It is copied; the caller's frame is not modified.
    last_activity_column : str
        Column with last activity date. If it is not already a datetime
        column it is parsed in the returned copy, and values that cannot
        be parsed become missing.
    reference_date : datetime-like, optional
        Date that recency is measured up to. Default: the latest value in
        ``last_activity_column``.
    output_column : str
        Name of the output column

    Returns
    -------
    DataFrame
        Copy of ``df`` with the day count stored in ``output_column``.
    """
    enriched = df.copy()

    # Parse the activity column only when it is not already datetime-typed.
    already_datetime = pd.api.types.is_datetime64_any_dtype(enriched[last_activity_column])
    if not already_datetime:
        enriched[last_activity_column] = pd.to_datetime(
            enriched[last_activity_column], errors='coerce', format='mixed'
        )

    last_seen = enriched[last_activity_column]
    anchor = last_seen.max() if reference_date is None else pd.to_datetime(reference_date)

    elapsed = anchor - last_seen
    enriched[output_column] = elapsed.dt.days

    return enriched