churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,632 @@
1
+ from dataclasses import dataclass, field
2
+ from enum import Enum
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from sklearn.cluster import DBSCAN, AgglomerativeClustering, KMeans
8
+ from sklearn.decomposition import PCA
9
+ from sklearn.manifold import TSNE
10
+ from sklearn.metrics import silhouette_score
11
+ from sklearn.preprocessing import StandardScaler
12
+
13
+ from customer_retention.core.compat import DataFrame, to_pandas
14
+
15
+
16
class SegmentationMethod(Enum):
    """Clustering algorithms that can be requested for customer segmentation."""

    # Centroid-based clustering with a fixed number of clusters.
    KMEANS = "kmeans"
    # Agglomerative (bottom-up) clustering with a fixed number of clusters.
    HIERARCHICAL = "hierarchical"
    # Density-based clustering.
    DBSCAN = "dbscan"
20
+
21
+
22
class DimensionReductionMethod(Enum):
    """Projection techniques available for drawing clusters in two dimensions."""

    PCA = "pca"
    TSNE = "tsne"
    UMAP = "umap"
26
+
27
+
28
@dataclass
class SegmentProfile:
    """Descriptive summary of a single segment found by clustering."""

    # Cluster label that identifies the segment.
    segment_id: int
    # Number of rows assigned to the segment.
    size: int
    # Segment size as a percentage (0-100) of all rows that received a label.
    size_pct: float
    # Mean of the binary target inside the segment; None when not available.
    target_rate: Optional[float]
    # Per-feature summary statistics, keyed by feature column name.
    defining_features: Dict[str, Any]
35
+
36
+
37
@dataclass
class SegmentationResult:
    """Outcome of one clustering run together with the resulting advice."""

    # Number of segments the analysis settled on.
    n_segments: int
    # Clustering algorithm that produced the labels.
    method: SegmentationMethod
    # Cluster quality score (the analyzer stores a silhouette rescaled to 0-1).
    quality_score: float
    # One profile per discovered segment.
    profiles: List[SegmentProfile]
    # Target-rate variance measure across segments; None when not computed.
    target_variance_ratio: Optional[float]
    # Recommendation code (e.g. "strong_segmentation").
    recommendation: str
    # Confidence attached to the recommendation.
    confidence: float
    # Human-readable reasons backing the recommendation.
    rationale: List[str]
    # Per-row cluster labels; left out of repr() because the array can be large.
    labels: np.ndarray = field(repr=False)
48
+
49
+
50
@dataclass
class ClusterVisualizationResult:
    """Two-dimensional coordinates and labels for plotting clusters."""

    # First plotting coordinate for each point.
    x: np.ndarray
    # Second plotting coordinate for each point.
    y: np.ndarray
    # Cluster label for each point.
    labels: np.ndarray
    # Projection technique used to obtain the coordinates.
    method: DimensionReductionMethod
    # Explained variance of the projection, when the producer supplies one.
    explained_variance_ratio: Optional[float] = None
57
+
58
+
59
@dataclass
class SegmentationDecisionMetrics:
    """Headline numbers for a segmentation decision, each with a text reading."""

    # Silhouette on its native [-1, 1] scale.
    silhouette_score: float
    silhouette_interpretation: str
    # Target-rate variance measure across segments; None when unavailable.
    target_variance_ratio: Optional[float]
    target_variance_interpretation: str
    n_segments: int
    segments_interpretation: str
    confidence: float
    confidence_interpretation: str
    # Recommendation code and its supporting reasons, copied from the result.
    recommendation: str
    rationale: List[str]

    @classmethod
    def from_segmentation_result(cls, result: "SegmentationResult") -> "SegmentationDecisionMetrics":
        """Derive decision metrics and their interpretations from ``result``.

        Args:
            result: Segmentation outcome providing ``quality_score``,
                ``target_variance_ratio``, ``n_segments``, ``confidence``,
                ``recommendation`` and ``rationale``.

        Returns:
            A populated ``SegmentationDecisionMetrics``.
        """

        def grade(value, tiers, fallback):
            # Tiers are ordered from the highest threshold down; the first
            # threshold strictly exceeded wins.
            return next((label for floor, label in tiers if value > floor), fallback)

        # quality_score is the silhouette rescaled to [0, 1]; undo the rescale.
        raw_silhouette = result.quality_score * 2 - 1
        structure_label = grade(
            raw_silhouette,
            ((0.5, "Strong structure"), (0.25, "Reasonable"), (0, "Weak structure")),
            "No structure",
        )

        variance_ratio = result.target_variance_ratio
        if variance_ratio is None:
            variance_label = "N/A"
        else:
            variance_label = grade(
                variance_ratio,
                ((0.3, "High separation"), (0.15, "Moderate")),
                "Low separation",
            )

        if result.n_segments <= 4:
            complexity_label = "Manageable"
        else:
            complexity_label = "Complex"

        confidence_label = grade(
            result.confidence,
            ((0.6, "High"), (0.3, "Medium")),
            "Low",
        )

        return cls(
            silhouette_score=raw_silhouette,
            silhouette_interpretation=structure_label,
            target_variance_ratio=variance_ratio,
            target_variance_interpretation=variance_label,
            n_segments=result.n_segments,
            segments_interpretation=complexity_label,
            confidence=result.confidence,
            confidence_interpretation=confidence_label,
            recommendation=result.recommendation,
            rationale=result.rationale,
        )
124
+
125
+
126
@dataclass
class FullSegmentationResult:
    """Everything a dashboard needs to present one segmentation analysis."""

    # Headline metrics with their textual interpretations.
    metrics: SegmentationDecisionMetrics
    # One profile per discovered segment.
    profiles: List[SegmentProfile]
    # Segment size breakdown; its layout is set by the producer (not shown here).
    size_distribution: Dict[str, Any]
    # Optional 2-D projection of the clusters.
    visualization: Optional[ClusterVisualizationResult]
    # Optional underlying raw segmentation result.
    segmentation_result: Optional[SegmentationResult]

    @property
    def has_visualization(self) -> bool:
        """Tell whether a cluster projection is attached to this result."""
        return self.visualization is not None

    def get_decision_summary(self) -> str:
        """Return a plain-text verdict matching the recommendation code.

        ``"strong_segmentation"`` and ``"consider_segmentation"`` each have a
        dedicated message; any other code yields the single-model message.
        """
        verdict = self.metrics.recommendation

        if verdict == "strong_segmentation":
            return (
                "STRONG EVIDENCE FOR SEGMENTATION\n\n"
                "The data shows clear cluster structure with meaningful target rate "
                "differences across segments. Consider building separate models per "
                "segment if EPV requirements are met."
            )

        if verdict == "consider_segmentation":
            return (
                "MODERATE EVIDENCE FOR SEGMENTATION\n\n"
                "Some cluster structure exists but may not be strong enough to justify "
                "separate models. Consider:\n"
                "- Using segments as a feature in a single model\n"
                "- Segment-specific preprocessing but unified modeling"
            )

        # Every other recommendation code falls back to the single-model advice.
        return (
            "SINGLE MODEL RECOMMENDED\n\n"
            "The data does not show sufficient cluster structure or target rate "
            "variation to justify segmentation. A single unified model is likely "
            "the best approach."
        )
164
+
165
+
166
+ class SegmentAnalyzer:
167
def __init__(self, default_method: SegmentationMethod = SegmentationMethod.KMEANS):
    """Set up the analyzer.

    Args:
        default_method: Clustering method used when ``analyze`` is called
            without an explicit ``method``.
    """
    # One scaler instance is kept on the analyzer and re-fitted on each use.
    self._scaler = StandardScaler()
    self.default_method = default_method
170
+
171
def analyze(
    self,
    df: DataFrame,
    target_col: Optional[str] = None,
    feature_cols: Optional[List[str]] = None,
    max_segments: int = 5,
    method: Optional[SegmentationMethod] = None,
) -> SegmentationResult:
    """Cluster the rows of ``df`` and judge whether segmentation is worthwhile.

    Args:
        df: Input data; converted with ``to_pandas`` before use.
        target_col: Optional target column used for per-segment target
            rates and the target-variance measure.
        feature_cols: Columns to cluster on. When empty or ``None`` the
            columns are chosen by ``_select_features``.
        max_segments: Upper bound handed to ``find_optimal_segments``.
        method: Clustering method; defaults to ``self.default_method``.

    Returns:
        A ``SegmentationResult`` whose ``labels`` array has one entry per
        row of ``df`` in row order. Rows with a missing value in any
        feature column are not clustered and carry the label ``-1``.
    """
    df = to_pandas(df)
    method = method or self.default_method

    feature_cols = self._select_features(df, feature_cols, target_col)
    if not feature_cols:
        return self._empty_result(df, method)

    # Positional mask of rows whose features are all present. A boolean
    # mask is used rather than the frame's index labels because the labels
    # are written into a NumPy array below, which is addressed by position:
    # index labels only coincide with positions for a default RangeIndex.
    complete_rows = df[feature_cols].notna().all(axis=1).to_numpy()
    features_df = df.loc[complete_rows, feature_cols]

    if len(features_df) < 10:
        return self._single_segment_result(df, method, target_col)

    n_segments = self.find_optimal_segments(
        features_df, feature_cols, max_k=max_segments
    )

    scaled_features = self._scaler.fit_transform(features_df)
    labels = self._fit_clusters(scaled_features, n_segments, method)

    # Rows excluded from clustering keep the sentinel label -1.
    full_labels = np.full(len(df), -1)
    full_labels[complete_rows] = labels

    quality_score = self._calculate_quality(scaled_features, labels)
    profiles = self.profile_segments(df, full_labels, feature_cols, target_col)
    target_variance = self._calculate_target_variance(df, full_labels, target_col)
    recommendation, confidence, rationale = self._make_recommendation(
        quality_score, target_variance, n_segments, profiles
    )

    return SegmentationResult(
        n_segments=n_segments,
        method=method,
        quality_score=quality_score,
        profiles=profiles,
        target_variance_ratio=target_variance,
        recommendation=recommendation,
        confidence=confidence,
        rationale=rationale,
        labels=full_labels,
    )
221
+
222
def find_optimal_segments(
    self,
    df: DataFrame,
    feature_cols: List[str],
    max_k: int = 10,
) -> int:
    """Choose the cluster count whose KMeans fit has the best silhouette.

    Rows with missing feature values are dropped first. Candidate counts run
    from 2 up to the smallest of ``max_k``, a third of the remaining rows,
    and 10.

    Args:
        df: Input data; converted with ``to_pandas`` before use.
        feature_cols: Columns to cluster on.
        max_k: Requested upper bound on the number of clusters.

    Returns:
        The best-scoring cluster count, or ``1`` when fewer than 10 complete
        rows remain, the bound falls below 2, or no candidate could be
        scored.
    """
    complete = to_pandas(df)[feature_cols].dropna()
    n_rows = len(complete)
    if n_rows < 10:
        return 1

    upper_k = min(max_k, n_rows // 3, 10)
    if upper_k < 2:
        return 1

    standardized = self._scaler.fit_transform(complete)

    score_by_k = {}
    for candidate_k in range(2, upper_k + 1):
        clusterer = KMeans(n_clusters=candidate_k, random_state=42, n_init=10)
        assignments = clusterer.fit_predict(standardized)
        # A silhouette needs at least two distinct clusters.
        if len(set(assignments)) <= 1:
            continue
        score_by_k[candidate_k] = silhouette_score(standardized, assignments)

    if not score_by_k:
        return 1

    # Keys are visited in insertion order, so ties resolve to the smallest k.
    return max(score_by_k, key=score_by_k.get)
253
+
254
def profile_segments(
    self,
    df: DataFrame,
    labels: np.ndarray,
    feature_cols: List[str],
    target_col: Optional[str] = None,
) -> List[SegmentProfile]:
    """Summarise every segment's size, target rate and feature statistics.

    Args:
        df: Input data; converted with ``to_pandas`` before use.
        labels: One cluster label per row of ``df``, in row order. Negative
            labels mark unassigned rows and are ignored.
        feature_cols: Columns to summarise. Only numeric, non-boolean
            columns present in ``df`` are reported.
        target_col: Optional target column. A target rate is reported only
            when the column as a whole is numeric with exactly the two
            values 0 and 1.

    Returns:
        One ``SegmentProfile`` per non-negative label, ordered by label.
        ``size_pct`` is relative to the number of assigned rows.
    """
    df = to_pandas(df)
    labels = np.asarray(labels)

    def is_plain_numeric(series) -> bool:
        # Accepts NumPy and pandas extension numerics; booleans are excluded.
        return pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series)

    assigned = labels >= 0
    total_valid = int(np.count_nonzero(assigned))

    # Decide once, on the whole column, whether the target is binary.
    # Checking inside each segment would reject pure segments (all 0 or all
    # 1), which hold a single distinct value but have a valid 0.0/1.0 rate.
    binary_target = None
    if target_col and target_col in df.columns:
        candidate = df[target_col]
        if is_plain_numeric(candidate):
            observed = set(candidate.dropna().unique())
            if len(observed) == 2 and observed.issubset({0, 1}):
                binary_target = candidate

    profiles = []
    for seg_id in np.unique(labels[assigned]):
        mask = labels == seg_id
        segment_df = df.loc[mask]
        size = len(segment_df)

        target_rate = None
        if binary_target is not None:
            segment_target = binary_target.loc[mask].dropna()
            # No observed target values in the segment: leave the rate unset
            # instead of reporting NaN.
            if len(segment_target) > 0:
                target_rate = float(segment_target.mean())

        defining_features = {}
        for col in feature_cols:
            if col not in segment_df.columns:
                continue
            col_data = segment_df[col].dropna()
            if len(col_data) == 0 or not is_plain_numeric(col_data):
                continue
            defining_features[col] = {
                "mean": float(col_data.mean()),
                "std": float(col_data.std()),
                "min": float(col_data.min()),
                "max": float(col_data.max()),
            }

        profiles.append(SegmentProfile(
            segment_id=int(seg_id),
            size=size,
            size_pct=round(size / total_valid * 100, 1) if total_valid > 0 else 0.0,
            target_rate=target_rate,
            defining_features=defining_features,
        ))

    return profiles
300
+
301
+ def _select_features(
302
+ self,
303
+ df: pd.DataFrame,
304
+ feature_cols: Optional[List[str]],
305
+ target_col: Optional[str],
306
+ ) -> List[str]:
307
+ if feature_cols:
308
+ return [c for c in feature_cols if c in df.columns]
309
+
310
+ numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
311
+ if target_col and target_col in numeric_cols:
312
+ numeric_cols.remove(target_col)
313
+ return numeric_cols
314
+
315
def _fit_clusters(
    self,
    scaled_features: np.ndarray,
    n_clusters: int,
    method: SegmentationMethod,
) -> np.ndarray:
    """Fit the requested clustering model and return its labels.

    HIERARCHICAL uses ``AgglomerativeClustering(n_clusters)``. DBSCAN
    ignores ``n_clusters`` and runs with the fixed settings ``eps=0.5``
    and ``min_samples=5``. KMEANS, and any other method value, uses
    ``KMeans`` with ``random_state=42`` and ``n_init=10``.
    """
    if method == SegmentationMethod.HIERARCHICAL:
        clusterer = AgglomerativeClustering(n_clusters=n_clusters)
    elif method == SegmentationMethod.DBSCAN:
        clusterer = DBSCAN(eps=0.5, min_samples=5)
    else:
        # Covers KMEANS explicitly and doubles as the default.
        clusterer = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)

    assignments = clusterer.fit_predict(scaled_features)
    return assignments
331
+
332
+ def _calculate_quality(self, scaled_features: np.ndarray, labels: np.ndarray) -> float:
333
+ unique_labels = set(labels)
334
+ if len(unique_labels) < 2:
335
+ return 0.0
336
+ try:
337
+ score = silhouette_score(scaled_features, labels)
338
+ return float(max(0, (score + 1) / 2))
339
+ except Exception:
340
+ return 0.0
341
+
342
+ def _calculate_target_variance(
343
+ self,
344
+ df: pd.DataFrame,
345
+ labels: np.ndarray,
346
+ target_col: Optional[str],
347
+ ) -> Optional[float]:
348
+ if not target_col or target_col not in df.columns:
349
+ return None
350
+
351
+ target = df[target_col]
352
+ if not np.issubdtype(target.dtype, np.number):
353
+ return None
354
+
355
+ unique_vals = target.dropna().unique()
356
+ if not (len(unique_vals) == 2 and set(unique_vals).issubset({0, 1, 0.0, 1.0})):
357
+ return None
358
+
359
+ segment_rates = []
360
+ for seg_id in set(labels[labels >= 0]):
361
+ mask = labels == seg_id
362
+ seg_rate = target[mask].mean()
363
+ if not np.isnan(seg_rate):
364
+ segment_rates.append(seg_rate)
365
+
366
+ if len(segment_rates) < 2:
367
+ return 0.0
368
+
369
+ variance = np.var(segment_rates)
370
+ max_possible_variance = 0.25
371
+ return float(min(1.0, variance / max_possible_variance))
372
+
373
+ def _make_recommendation(
374
+ self,
375
+ quality_score: float,
376
+ target_variance: Optional[float],
377
+ n_segments: int,
378
+ profiles: List[SegmentProfile],
379
+ ) -> tuple:
380
+ rationale = []
381
+ score = 0.0
382
+
383
+ if quality_score > 0.7:
384
+ rationale.append(f"High cluster quality (silhouette: {quality_score:.2f})")
385
+ score += 0.3
386
+ elif quality_score > 0.5:
387
+ rationale.append(f"Moderate cluster quality (silhouette: {quality_score:.2f})")
388
+ score += 0.15
389
+
390
+ if target_variance is not None:
391
+ if target_variance > 0.3:
392
+ rationale.append(f"High target rate variation across segments ({target_variance:.2f})")
393
+ score += 0.4
394
+ elif target_variance > 0.15:
395
+ rationale.append(f"Moderate target rate variation ({target_variance:.2f})")
396
+ score += 0.2
397
+ else:
398
+ rationale.append(f"Low target rate variation ({target_variance:.2f})")
399
+
400
+ min_segment_pct = min(p.size_pct for p in profiles) if profiles else 0
401
+ if min_segment_pct >= 10:
402
+ rationale.append(f"All segments have sufficient size (min: {min_segment_pct:.1f}%)")
403
+ score += 0.2
404
+ elif min_segment_pct >= 5:
405
+ rationale.append(f"Some segments are small (min: {min_segment_pct:.1f}%)")
406
+ score += 0.1
407
+ else:
408
+ rationale.append(f"Very small segments detected (min: {min_segment_pct:.1f}%)")
409
+
410
+ if n_segments > 5:
411
+ rationale.append(f"Many segments ({n_segments}) may complicate maintenance")
412
+ score -= 0.1
413
+
414
+ confidence = min(1.0, max(0.0, score))
415
+
416
+ if score >= 0.6:
417
+ recommendation = "strong_segmentation"
418
+ elif score >= 0.3:
419
+ recommendation = "consider_segmentation"
420
+ else:
421
+ recommendation = "single_model"
422
+
423
+ return recommendation, confidence, rationale
424
+
425
def _empty_result(self, df: pd.DataFrame, method: SegmentationMethod) -> SegmentationResult:
    """Return the single-segment result reported when no numeric features exist.

    All rows are placed in segment 0, quality and confidence are 0.0, no
    target information is reported and the recommendation is
    "single_model".
    """
    n_rows = len(df)
    whole_population = SegmentProfile(
        segment_id=0,
        size=n_rows,
        size_pct=100.0,
        target_rate=None,
        defining_features={},
    )
    return SegmentationResult(
        n_segments=1,
        method=method,
        quality_score=0.0,
        profiles=[whole_population],
        target_variance_ratio=None,
        recommendation="single_model",
        confidence=0.0,
        rationale=["No numeric features available for segmentation"],
        labels=np.zeros(n_rows, dtype=int),
    )
443
+
444
def _single_segment_result(
    self,
    df: pd.DataFrame,
    method: SegmentationMethod,
    target_col: Optional[str],
) -> SegmentationResult:
    """Return the one-segment result reported when data is insufficient.

    Every row lands in segment 0. The segment's target rate is the mean
    of ``target_col`` when that column exists and is numeric, otherwise
    None. Confidence is fixed at 0.5 and the recommendation is
    "single_model".
    """
    overall_rate = None
    has_target = bool(target_col) and target_col in df.columns
    if has_target:
        target_values = df[target_col]
        if np.issubdtype(target_values.dtype, np.number):
            overall_rate = float(target_values.mean())

    n_rows = len(df)
    only_segment = SegmentProfile(
        segment_id=0,
        size=n_rows,
        size_pct=100.0,
        target_rate=overall_rate,
        defining_features={},
    )
    return SegmentationResult(
        n_segments=1,
        method=method,
        quality_score=0.0,
        profiles=[only_segment],
        target_variance_ratio=0.0,
        recommendation="single_model",
        confidence=0.5,
        rationale=["Insufficient data for meaningful segmentation"],
        labels=np.zeros(n_rows, dtype=int),
    )
473
+
474
def get_cluster_visualization(
    self,
    df: DataFrame,
    labels: np.ndarray,
    feature_cols: List[str],
    method: DimensionReductionMethod = DimensionReductionMethod.PCA,
) -> ClusterVisualizationResult:
    """Reduce features to 2D for cluster visualization.

    Args:
        df: DataFrame with features.
        labels: Cluster labels from analyze(); passed through unchanged.
        feature_cols: Feature columns to use for dimensionality reduction.
            Names missing from ``df`` are ignored.
        method: Dimensionality reduction method (PCA, TSNE, UMAP). UMAP
            falls back to PCA when the ``umap`` package cannot be
            imported; any other unrecognised value also uses PCA.

    Returns:
        ClusterVisualizationResult with 2D coordinates and labels. ``x``
        and ``y`` have one entry per row of ``df``; rows with a missing
        feature value stay NaN. When no feature column is usable or
        fewer than two rows are complete, all coordinates are NaN.
        ``explained_variance_ratio`` is set only when PCA was run.
    """
    df = to_pandas(df)
    feature_cols = [c for c in feature_cols if c in df.columns]

    # Output arrays are positional (one slot per row), NaN by default.
    x = np.full(len(df), np.nan)
    y = np.full(len(df), np.nan)

    features_df = df[feature_cols]
    # Positional boolean mask of complete rows. Index *labels* must not
    # be used to address the NumPy output arrays: with a non-default
    # index they point at the wrong slots or fall out of bounds.
    valid_mask = (~features_df.isna().any(axis=1)).to_numpy()
    n_valid = int(valid_mask.sum())

    if not feature_cols or n_valid < 2:
        return ClusterVisualizationResult(
            x=x, y=y, labels=labels,
            method=method, explained_variance_ratio=None
        )

    # Scale features
    scaled = self._scaler.fit_transform(features_df.loc[valid_mask])

    # Apply dimensionality reduction
    explained_variance = None
    coords = None

    if method == DimensionReductionMethod.TSNE:
        # scikit-learn requires perplexity < n_samples, so the cap of
        # n_valid - 1 must win over any lower bound.
        reducer = TSNE(
            n_components=2,
            perplexity=min(30, n_valid - 1),
            random_state=42,
        )
        coords = reducer.fit_transform(scaled)

    elif method == DimensionReductionMethod.UMAP:
        try:
            import umap
            n_neighbors = min(15, n_valid - 1)
            reducer = umap.UMAP(
                n_components=2,
                n_neighbors=max(2, n_neighbors),
                random_state=42,
            )
            coords = reducer.fit_transform(scaled)
        except ImportError:
            # Fall back to PCA if UMAP not installed; report PCA as the
            # method actually used.
            coords = None
            method = DimensionReductionMethod.PCA

    if coords is None:
        # PCA: requested explicitly, UMAP fallback, or unknown method.
        reducer = PCA(n_components=2, random_state=42)
        coords = reducer.fit_transform(scaled)
        explained_variance = float(sum(reducer.explained_variance_ratio_))

    # Fill in valid coordinates
    x[valid_mask] = coords[:, 0]
    y[valid_mask] = coords[:, 1]

    return ClusterVisualizationResult(
        x=x,
        y=y,
        labels=labels,
        method=method,
        explained_variance_ratio=explained_variance,
    )
563
+
564
def run_full_analysis(
    self,
    df: DataFrame,
    feature_cols: List[str],
    target_col: Optional[str] = None,
    max_segments: int = 5,
    method: Optional[SegmentationMethod] = None,
    dim_reduction: DimensionReductionMethod = DimensionReductionMethod.PCA,
) -> FullSegmentationResult:
    """Run complete segmentation analysis for dashboard display.

    Args:
        df: DataFrame with features.
        feature_cols: Feature columns for clustering.
        target_col: Optional target column for variance analysis.
        max_segments: Maximum segments to consider.
        method: Clustering method.
        dim_reduction: Dimensionality reduction for visualization.

    Returns:
        FullSegmentationResult with metrics, profiles, and visualization.
        ``size_distribution`` holds the total, smallest and largest
        segment sizes, their percentage shares and the min/max balance
        ratio; all of them are 0 when the segmentation produced no
        profiles. ``visualization`` is None unless there is more than
        one segment and at least two feature columns exist in ``df``.
    """
    df = to_pandas(df)

    # Run segmentation
    seg_result = self.analyze(
        df,
        target_col=target_col,
        feature_cols=feature_cols,
        max_segments=max_segments,
        method=method,
    )

    # Create decision metrics
    metrics = SegmentationDecisionMetrics.from_segmentation_result(seg_result)

    # Calculate size distribution. The profile list can be empty (for
    # instance when every row carries a negative label), so min()/max()
    # need a default instead of raising ValueError.
    sizes = [p.size for p in seg_result.profiles]
    total = sum(sizes)
    min_size = min(sizes, default=0)
    max_size = max(sizes, default=0)
    balance_ratio = min_size / max_size if max_size > 0 else 0

    size_distribution = {
        "total": total,
        "min_size": min_size,
        "max_size": max_size,
        "min_pct": min_size / total * 100 if total > 0 else 0,
        "max_pct": max_size / total * 100 if total > 0 else 0,
        "balance_ratio": balance_ratio,
    }

    # Get visualization if multiple segments and enough features
    visualization = None
    valid_features = [c for c in feature_cols if c in df.columns]
    if seg_result.n_segments > 1 and len(valid_features) >= 2:
        visualization = self.get_cluster_visualization(
            df,
            labels=seg_result.labels,
            feature_cols=valid_features,
            method=dim_reduction,
        )

    return FullSegmentationResult(
        metrics=metrics,
        profiles=seg_result.profiles,
        size_distribution=size_distribution,
        visualization=visualization,
        segmentation_result=seg_result,
    )