churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,265 @@
1
+ """Segment-aware outlier analysis that considers natural data clusters."""
2
+ from dataclasses import dataclass
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ import numpy as np
6
+
7
+ from customer_retention.core.compat import DataFrame, pd, to_pandas
8
+ from customer_retention.stages.cleaning.outlier_handler import OutlierDetectionMethod, OutlierHandler, OutlierResult
9
+
10
+ from .segment_analyzer import SegmentAnalyzer, SegmentationResult
11
+
12
+
13
@dataclass
class SegmentAwareOutlierResult:
    """Results from segment-aware outlier analysis."""
    # Number of segments the analysis worked with (0 for the empty result).
    n_segments: int
    # Per-column outlier detection run on the whole dataset.
    global_analysis: Dict[str, OutlierResult]
    # Segment id -> per-column outlier detection run on that segment's rows only.
    segment_analysis: Dict[Any, Dict[str, OutlierResult]]
    # Per-column count of global outliers that are not outliers within their segment.
    false_outliers: Dict[str, int]
    # Whether segment-specific outlier treatment is advised.
    segmentation_recommended: bool
    # Human-readable suggested actions.
    recommendations: List[str]
    # Human-readable justification for the recommendations.
    rationale: List[str]
    # Per-row segment label, when available.
    segment_labels: Optional[np.ndarray] = None
    # Result object from automatic segment detection, when it was used.
    segmentation_result: Optional[SegmentationResult] = None
25
+
26
+
27
class SegmentAwareOutlierAnalyzer:
    """Analyzes outliers considering natural data segments.

    Addresses the problem where global outliers may actually be valid data
    points from a different segment (e.g., enterprise vs retail customers).

    ``analyze`` runs outlier detection once on the whole dataset and once per
    segment, then counts the "false outliers": rows flagged globally but not
    flagged inside their own segment.
    """

    # Share of a column's global outliers that must be segment-normal (>=)
    # before segment-specific treatment is recommended.
    FALSE_OUTLIER_THRESHOLD = 0.5
    # Segments with fewer rows than this are not analysed on their own.
    MIN_SEGMENT_SIZE = 10

    def __init__(
        self,
        detection_method: OutlierDetectionMethod = OutlierDetectionMethod.IQR,
        iqr_multiplier: float = 1.5,
        zscore_threshold: float = 3.0,
        max_segments: int = 5
    ):
        """Store the detection settings and create the segment analyzer.

        Args:
            detection_method: Outlier detection method passed to ``OutlierHandler``.
            iqr_multiplier: IQR multiplier passed to ``OutlierHandler``.
            zscore_threshold: Z-score threshold passed to ``OutlierHandler``.
            max_segments: Upper bound handed to automatic segment detection.
        """
        self.detection_method = detection_method
        self.iqr_multiplier = iqr_multiplier
        self.zscore_threshold = zscore_threshold
        self.max_segments = max_segments
        self._segment_analyzer = SegmentAnalyzer()

    def analyze(
        self,
        df: DataFrame,
        feature_cols: List[str],
        segment_col: Optional[str] = None,
        target_col: Optional[str] = None
    ) -> SegmentAwareOutlierResult:
        """Run global and per-segment outlier detection and compare them.

        Args:
            df: Input data; converted with ``to_pandas`` before use.
            feature_cols: Columns to analyse. Names missing from ``df`` are ignored.
            segment_col: Optional column holding explicit segment values. When it
                is absent or not in ``df``, segments are detected automatically.
            target_col: Optional target column forwarded to segment detection.

        Returns:
            A ``SegmentAwareOutlierResult``. An "insufficient data" result is
            returned when ``df`` is empty, no requested column exists, or every
            requested column is entirely null.
        """
        df = to_pandas(df)

        valid_cols = [c for c in feature_cols if c in df.columns]
        if len(df) == 0 or not valid_cols or all(df[c].isna().all() for c in valid_cols):
            return self._empty_result(feature_cols)

        global_analysis = self._analyze_global(df, valid_cols)

        if segment_col and segment_col in df.columns:
            segment_labels, n_segments = self._use_explicit_segments(df, segment_col)
            segmentation_result = None
        else:
            segment_labels, n_segments, segmentation_result = self._detect_segments(
                df, valid_cols, target_col
            )

        segment_analysis = self._analyze_by_segment(df, valid_cols, segment_labels, n_segments)
        false_outliers = self._identify_false_outliers(
            df, valid_cols, global_analysis, segment_analysis, segment_labels
        )

        segmentation_recommended, recommendations, rationale = self._make_recommendations(
            global_analysis, segment_analysis, false_outliers, n_segments
        )

        return SegmentAwareOutlierResult(
            n_segments=n_segments,
            global_analysis=global_analysis,
            segment_analysis=segment_analysis,
            false_outliers=false_outliers,
            segmentation_recommended=segmentation_recommended,
            recommendations=recommendations,
            rationale=rationale,
            segment_labels=segment_labels,
            segmentation_result=segmentation_result
        )

    def _build_handler(self) -> OutlierHandler:
        """Create an ``OutlierHandler`` configured with this analyzer's settings."""
        return OutlierHandler(
            detection_method=self.detection_method,
            iqr_multiplier=self.iqr_multiplier,
            zscore_threshold=self.zscore_threshold
        )

    def _analyze_global(self, df: DataFrame, feature_cols: List[str]) -> Dict[str, OutlierResult]:
        """Detect outliers for each column over the whole dataset."""
        handler = self._build_handler()
        return {col: handler.detect(df[col]) for col in feature_cols}

    def _use_explicit_segments(self, df: DataFrame, segment_col: str) -> tuple:
        """Turn an explicit segment column into integer labels.

        Returns:
            ``(labels, n_segments)``: labels are numbered 0..n-1 in order of
            first appearance and rows with a missing segment value get -1.
        """
        # factorize() yields first-appearance codes with -1 for missing values,
        # and unlike map(...).fillna(-1) it also works on categorical columns.
        codes, uniques = df[segment_col].factorize()
        return np.asarray(codes).astype(int), len(uniques)

    def _detect_segments(
        self, df: DataFrame, feature_cols: List[str], target_col: Optional[str]
    ) -> tuple:
        """Detect segments automatically, falling back to one segment.

        Returns:
            ``(labels, n_segments, segmentation_result)``. When there are fewer
            than ``2 * MIN_SEGMENT_SIZE`` rows, or detection raises, every row is
            labelled 0, ``n_segments`` is 1 and the result object is ``None``.
        """
        if len(df) < self.MIN_SEGMENT_SIZE * 2:
            return np.zeros(len(df), dtype=int), 1, None

        try:
            result = self._segment_analyzer.analyze(
                df,
                target_col=target_col,
                feature_cols=feature_cols,
                max_segments=self.max_segments
            )
            return result.labels, result.n_segments, result
        except Exception:
            # Segmentation is best-effort: keep the single-segment fallback, but
            # record the failure instead of hiding it completely.
            import logging

            logging.getLogger(__name__).warning(
                "Automatic segment detection failed; treating data as a single segment",
                exc_info=True,
            )
            return np.zeros(len(df), dtype=int), 1, None

    def _analyze_by_segment(
        self,
        df: DataFrame,
        feature_cols: List[str],
        segment_labels: np.ndarray,
        n_segments: int
    ) -> Dict[Any, Dict[str, OutlierResult]]:
        """Detect outliers per column inside each sufficiently large segment.

        Segment ids ``0 .. n_segments - 1`` are considered; segments with fewer
        than ``MIN_SEGMENT_SIZE`` rows are left out of the returned mapping.
        """
        segment_analysis = {}
        handler = self._build_handler()

        for seg_id in range(n_segments):
            mask = segment_labels == seg_id
            if mask.sum() < self.MIN_SEGMENT_SIZE:
                continue

            segment_df = df.loc[mask]
            segment_analysis[seg_id] = {
                col: handler.detect(segment_df[col]) for col in feature_cols
            }

        return segment_analysis

    def _identify_false_outliers(
        self,
        df: DataFrame,
        feature_cols: List[str],
        global_analysis: Dict[str, OutlierResult],
        segment_analysis: Dict[Any, Dict[str, OutlierResult]],
        segment_labels: np.ndarray
    ) -> Dict[str, int]:
        """Identify global outliers that are normal within their segment.

        Returns:
            Column name -> number of rows flagged by the global detection but
            not by the detection run on the row's own segment. Rows with a
            negative label, or whose segment was not analysed, are never counted.
        """
        labels = np.asarray(segment_labels)
        assigned = labels >= 0

        # Row positions of every analysed segment, computed once rather than
        # once per outlier. The i-th position corresponds to the i-th entry of
        # that segment's outlier mask (the segment frame was built by row mask).
        positions_by_segment = {
            seg_id: np.flatnonzero((labels == seg_id) & assigned)
            for seg_id in segment_analysis
        }

        false_outliers = {}
        for col in feature_cols:
            raw_global_mask = global_analysis[col].outlier_mask
            if raw_global_mask is None:
                false_outliers[col] = 0
                continue

            global_mask = np.asarray(raw_global_mask, dtype=bool)
            false_count = 0

            for seg_id, positions in positions_by_segment.items():
                seg_result = segment_analysis[seg_id].get(col)
                if seg_result is None or seg_result.outlier_mask is None:
                    continue

                seg_mask = np.asarray(seg_result.outlier_mask, dtype=bool)
                # Only rows covered by both masks can be compared; positions are
                # ascending, so both trims keep an aligned prefix.
                pos = positions[: len(seg_mask)]
                pos = pos[pos < len(global_mask)]
                flagged_globally_only = global_mask[pos] & ~seg_mask[: len(pos)]
                false_count += int(flagged_globally_only.sum())

            false_outliers[col] = false_count

        return false_outliers

    def _make_recommendations(
        self,
        global_analysis: Dict[str, OutlierResult],
        segment_analysis: Dict[Any, Dict[str, OutlierResult]],
        false_outliers: Dict[str, int],
        n_segments: int
    ) -> tuple:
        """Derive the recommendation flag and its explanatory messages.

        Returns:
            ``(segmentation_recommended, recommendations, rationale)``.
        """
        recommendations = []
        rationale = []
        segmentation_recommended = False

        for col, false_count in false_outliers.items():
            global_count = global_analysis[col].outliers_detected
            if global_count == 0:
                continue

            false_ratio = false_count / global_count

            if false_ratio >= self.FALSE_OUTLIER_THRESHOLD:
                segmentation_recommended = True
                rationale.append(
                    f"{col}: {false_count}/{global_count} ({false_ratio:.0%}) global outliers "
                    f"are normal within their segment"
                )
                recommendations.append(
                    f"Consider segment-specific outlier treatment for '{col}' - "
                    f"global outliers may be valid data from different customer segments"
                )
            elif false_ratio > 0.2:
                rationale.append(
                    f"{col}: {false_count}/{global_count} ({false_ratio:.0%}) false outliers detected"
                )

        if n_segments > 1 and not segmentation_recommended:
            total_global = sum(r.outliers_detected for r in global_analysis.values())
            # NOTE(review): segments skipped for being too small contribute no
            # outliers here, so the reduction may be overstated - confirm intent.
            total_segment = sum(
                sum(r.outliers_detected for r in seg.values())
                for seg in segment_analysis.values()
            )

            if total_global > 0:
                reduction = (total_global - total_segment) / total_global
                if reduction > 0.3:
                    rationale.append(
                        f"Segment-aware analysis reduces outliers by {reduction:.0%} "
                        f"({total_global} global → {total_segment} segment-level)"
                    )

        if not segmentation_recommended and n_segments <= 1:
            rationale.append("Data appears homogeneous - global outlier treatment is appropriate")
            recommendations.append("Use standard global outlier detection methods")

        return segmentation_recommended, recommendations, rationale

    def _empty_result(self, feature_cols: List[str]) -> SegmentAwareOutlierResult:
        """Build the result returned when there is nothing to analyse.

        Every requested column maps to the detection result for an empty
        float series and to a false-outlier count of 0.
        """
        empty_handler = OutlierHandler()
        empty_series = pd.Series([], dtype=float)
        empty_result = empty_handler.detect(empty_series)

        return SegmentAwareOutlierResult(
            n_segments=0,
            global_analysis={col: empty_result for col in feature_cols},
            segment_analysis={},
            false_outliers={col: 0 for col in feature_cols},
            segmentation_recommended=False,
            recommendations=["Insufficient data for outlier analysis"],
            rationale=["Empty or all-null dataset"]
        )
@@ -0,0 +1,217 @@
1
+ from dataclasses import dataclass, field
2
+ from enum import Enum
3
+ from typing import Dict, List, Optional, Tuple
4
+
5
+ from customer_retention.core.compat import DataFrame
6
+
7
+
8
class TargetLevel(Enum):
    """Granularity at which a target column is defined relative to its entities."""

    # Target is (nearly) constant within each entity.
    ENTITY_LEVEL = "entity_level"
    # Target varies between rows belonging to the same entity.
    EVENT_LEVEL = "event_level"
    # Level could not be determined (e.g. target or entity column not specified).
    UNKNOWN = "unknown"
    # Requested target column is not present in the data.
    MISSING = "missing"
13
+
14
+
15
class AggregationMethod(Enum):
    """Ways of collapsing per-row target values into one value per entity."""

    MAX = "max"      # largest target value seen for the entity
    MEAN = "mean"    # average of the entity's target values
    SUM = "sum"      # total of the entity's target values
    LAST = "last"    # final value per entity (by time column when given, else row order)
    FIRST = "first"  # initial value per entity (by time column when given, else row order)
21
+
22
+
23
@dataclass
class TargetDistribution:
    """Occurrence counts of each target value plus the denominator for percentages.

    Attributes:
        value_counts: Mapping of target value to the number of times it occurs.
        total: Number of observations the counts are expressed against.
    """

    value_counts: Dict[int, int]
    total: int

    @property
    def as_percentages(self) -> Dict[int, float]:
        """Return every count as a percentage of ``total``.

        When ``total`` is zero each value maps to ``0.0`` instead of raising
        ``ZeroDivisionError``.
        """
        if self.total == 0:
            # Nothing was counted: report 0% rather than dividing by zero.
            return {value: 0.0 for value in self.value_counts}
        return {value: count / self.total * 100 for value, count in self.value_counts.items()}

    def get_label(self, value: int) -> str:
        """Return "Churned" for 1, "Retained" for 0, otherwise ``str(value)``."""
        return {1: "Churned", 0: "Retained"}.get(value, str(value))
34
+
35
+
36
@dataclass
class TargetLevelResult:
    """Outcome of a target-level detection, optionally enriched by an aggregation step."""

    # Name of the analysed target column ("" when none was supplied).
    target_column: str
    # Name of the column identifying the entity ("" when none was supplied).
    entity_column: str
    # Detected granularity of the target.
    level: TargetLevel
    # Aggregation proposed for event-level targets; None otherwise.
    suggested_aggregation: Optional[AggregationMethod]
    # Distribution of target values counted over rows.
    event_distribution: Optional[TargetDistribution] = None
    # Distribution of target values counted over entities.
    entity_distribution: Optional[TargetDistribution] = None
    # Percentage of entities holding more than one distinct target value.
    variation_pct: float = 0.0
    # True when exactly two distinct target values were observed.
    is_binary: bool = False
    # Column that carries the entity-level target once it is known.
    entity_target_column: Optional[str] = None
    # Aggregation that was actually applied, if any.
    aggregation_used: Optional[AggregationMethod] = None
    # Human-readable notes collected during analysis.
    messages: List[str] = field(default_factory=list)
49
+
50
+
51
class TargetLevelAnalyzer:
    """Classify a target column as entity-level or event-level and aggregate it per entity.

    A target is treated as entity-level when the percentage of entities that
    hold more than one distinct target value is below ``variation_threshold``.
    """

    ENTITY_LEVEL_THRESHOLD = 5.0
    TARGET_KEYWORDS = ['churn', 'unsub', 'cancel', 'retain', 'active', 'lost', 'leave', 'target']

    def __init__(self, variation_threshold: float = ENTITY_LEVEL_THRESHOLD):
        """Store the variation percentage below which a target counts as entity-level."""
        self.variation_threshold = variation_threshold

    def detect_level(self, df: DataFrame, target_column: str, entity_column: str) -> TargetLevelResult:
        """Determine whether ``target_column`` is constant within each entity.

        Args:
            df: Data containing the target and entity columns.
            target_column: Name of the target column.
            entity_column: Name of the column (or index level) identifying entities.

        Returns:
            A ``TargetLevelResult`` whose ``level`` is UNKNOWN when either name
            is ``None`` or the entity column cannot be found, MISSING when the
            target column is absent, and ENTITY_LEVEL / EVENT_LEVEL otherwise.
        """
        if target_column is None or entity_column is None:
            return TargetLevelResult(
                target_column=target_column or "", entity_column=entity_column or "",
                level=TargetLevel.UNKNOWN, suggested_aggregation=None,
                messages=["Target or entity column not specified"])

        if target_column not in df.columns:
            return TargetLevelResult(
                target_column=target_column, entity_column=entity_column,
                level=TargetLevel.MISSING, suggested_aggregation=None,
                messages=[f"Target column '{target_column}' not found in data"])

        # groupby also accepts index level names, so only bail out when the
        # entity key is neither a column nor an index level.
        if entity_column not in df.columns and entity_column not in df.index.names:
            return TargetLevelResult(
                target_column=target_column, entity_column=entity_column,
                level=TargetLevel.UNKNOWN, suggested_aggregation=None,
                messages=[f"Entity column '{entity_column}' not found in data"])

        event_counts = df[target_column].value_counts().to_dict()
        event_dist = TargetDistribution(value_counts=event_counts, total=len(df))

        distinct_per_entity = df.groupby(entity_column)[target_column].nunique()
        total_entities = len(distinct_per_entity)
        if total_entities > 0:
            variation_pct = float((distinct_per_entity > 1).sum() / total_entities * 100)
        else:
            variation_pct = 0.0
        is_binary = len(event_counts) == 2

        if variation_pct < self.variation_threshold:
            entity_target = df.groupby(entity_column)[target_column].first()
            entity_dist = TargetDistribution(
                value_counts=entity_target.value_counts().to_dict(), total=len(entity_target))
            return TargetLevelResult(
                target_column=target_column, entity_column=entity_column, level=TargetLevel.ENTITY_LEVEL,
                suggested_aggregation=None, event_distribution=event_dist, entity_distribution=entity_dist,
                variation_pct=variation_pct, is_binary=is_binary,
                messages=["Target is consistent within entities (entity-level)"])

        # Compute the suggestion once and reuse it for both the field and the message.
        suggested = self._suggest_aggregation(event_counts, is_binary)
        return TargetLevelResult(
            target_column=target_column, entity_column=entity_column, level=TargetLevel.EVENT_LEVEL,
            suggested_aggregation=suggested,
            event_distribution=event_dist, variation_pct=variation_pct, is_binary=is_binary,
            messages=[f"Target varies within entities ({variation_pct:.1f}% have variation)",
                      f"Suggested aggregation: {suggested.value}"])

    def aggregate_to_entity(self, df: DataFrame, target_column: str, entity_column: str,
                            time_column: Optional[str] = None,
                            method: AggregationMethod = AggregationMethod.MAX) -> Tuple[DataFrame, TargetLevelResult]:
        """Attach an entity-level version of an event-level target to ``df``.

        Args:
            df: Event-level data.
            target_column: Name of the target column.
            entity_column: Name of the column identifying entities.
            time_column: Optional column used to order rows for FIRST / LAST.
            method: Aggregation applied per entity.

        Returns:
            ``(df, result)`` unchanged when the target is already entity-level
            or could not be analysed; otherwise a new frame with an added
            ``"<target_column>_entity"`` column and the updated result.
        """
        result = self.detect_level(df, target_column, entity_column)

        if result.level == TargetLevel.ENTITY_LEVEL:
            result.entity_target_column = target_column
            return df, result

        if result.level in (TargetLevel.MISSING, TargetLevel.UNKNOWN):
            return df, result

        entity_target_col = f"{target_column}_entity"
        entity_target = self._compute_entity_target(df, target_column, entity_column, time_column, method, result)

        entity_dist = TargetDistribution(
            value_counts=entity_target.value_counts().to_dict(), total=len(entity_target))
        entity_target_map = entity_target.reset_index()
        entity_target_map.columns = [entity_column, entity_target_col]

        # A stale column of the same name (e.g. from a previous run) would make
        # merge emit "_x"/"_y" suffixed columns, leaving entity_target_column
        # pointing at a name that does not exist. Replace it instead.
        base = df
        if entity_target_col in df.columns:
            base = df.drop(columns=[entity_target_col])
            result.messages.append(f"Replaced existing column '{entity_target_col}'")
        df_result = base.merge(entity_target_map, on=entity_column, how="left")

        result.entity_distribution = entity_dist
        result.entity_target_column = entity_target_col
        result.aggregation_used = method
        result.messages.append(f"Created entity-level target: {entity_target_col}")
        return df_result, result

    def _compute_entity_target(self, df: DataFrame, target_column: str, entity_column: str,
                               time_column: Optional[str], method: AggregationMethod,
                               result: TargetLevelResult):
        """Return one aggregated target value per entity, indexed by entity.

        For FIRST / LAST the rows are ordered by ``time_column`` when given;
        otherwise row order is used and a warning is appended to
        ``result.messages``. Unrecognised methods fall back to the maximum.
        """
        if method in (AggregationMethod.FIRST, AggregationMethod.LAST):
            if time_column is None:
                # Both order-dependent methods warn, not just LAST.
                result.messages.append(
                    f"Warning: '{method.value}' aggregation without time_column uses row order")
                ordered = df
            else:
                ordered = df.sort_values(time_column)
            grouped = ordered.groupby(entity_column)[target_column]
            return grouped.last() if method == AggregationMethod.LAST else grouped.first()

        grouped = df.groupby(entity_column)[target_column]
        if method == AggregationMethod.MEAN:
            return grouped.mean()
        if method == AggregationMethod.SUM:
            return grouped.sum()
        # MAX, and the fallback for anything unrecognised.
        return grouped.max()

    def _suggest_aggregation(self, value_counts: Dict[int, int], is_binary: bool) -> AggregationMethod:
        """Return the suggested aggregation; currently always MAX regardless of the arguments."""
        return AggregationMethod.MAX

    @staticmethod
    def _print_entity_distribution(distribution: TargetDistribution, column: str) -> None:
        """Print one labelled line per target value of an entity-level distribution."""
        percentages = distribution.as_percentages  # computed once, not per row
        for val, count in sorted(distribution.value_counts.items()):
            label = distribution.get_label(val)
            print(f" {label} ({column}={val}): {count:,} entities ({percentages[val]:.1f}%)")

    def print_analysis(self, result: TargetLevelResult):
        """Print a human-readable report of ``result`` to stdout."""
        print("=" * 70 + "\nTARGET LEVEL ANALYSIS\n" + "=" * 70)
        print(f"\nColumn: {result.target_column}\nLevel: {result.level.value.upper()}")

        if result.level == TargetLevel.EVENT_LEVEL:
            print(f"\n⚠️ EVENT-LEVEL TARGET DETECTED\n {result.variation_pct:.1f}% of entities have varying target values")
            if result.event_distribution:
                print("\n Event-level distribution:")
                percentages = result.event_distribution.as_percentages
                for val, count in sorted(result.event_distribution.value_counts.items()):
                    print(f" {result.target_column}={val}: {count:,} events ({percentages[val]:.1f}%)")
            if result.suggested_aggregation:
                print(f"\n Suggested aggregation: {result.suggested_aggregation.value}")

        elif result.level == TargetLevel.ENTITY_LEVEL:
            print("\n✓ Target is already at entity-level")
            if result.entity_distribution:
                print("\n Entity-level distribution:")
                self._print_entity_distribution(result.entity_distribution, result.target_column)

        if result.aggregation_used:
            print(f"\n Aggregation applied: {result.aggregation_used.value}")
            print(f" Entity target column: {result.entity_target_column}")
            if result.entity_distribution:
                print("\n Entity-level distribution (after aggregation):")
                self._print_entity_distribution(result.entity_distribution, result.entity_target_column)
        print()
180
+
181
+
182
class TargetColumnDetector:
    """Pick the target column from exploration findings and report how it was chosen."""

    TARGET_KEYWORDS = ['churn', 'unsub', 'cancel', 'retain', 'active', 'lost', 'leave', 'target']

    @staticmethod
    def _first_match(findings, predicate):
        """Return ``(True, name)`` for the first column satisfying ``predicate``, else ``(False, None)``."""
        for name, info in findings.columns.items():
            if predicate(name, info):
                return True, name
        return False, None

    def detect(self, findings, df: DataFrame, override: Optional[str] = None) -> Tuple[Optional[str], str]:
        """Return ``(target_column, method)`` describing the selected target.

        Resolution order: the ``"DEFER_TO_MULTI_DATASET"`` override, any other
        override, the first column inferred as TARGET, then the first BINARY
        column whose lower-cased name contains one of ``TARGET_KEYWORDS``.
        ``method`` is one of "deferred", "override", "auto-detected",
        "binary-candidate" or "not-found". ``df`` is not consulted.
        """
        from customer_retention.core.config.column_config import ColumnType

        if override is not None:
            if override == "DEFER_TO_MULTI_DATASET":
                return None, "deferred"
            return override, "override"

        def is_explicit_target(_name, info):
            return info.inferred_type == ColumnType.TARGET

        def is_keyword_binary(name, info):
            if info.inferred_type != ColumnType.BINARY:
                return False
            lowered = name.lower()
            return any(keyword in lowered for keyword in self.TARGET_KEYWORDS)

        found, column = self._first_match(findings, is_explicit_target)
        if found:
            return column, "auto-detected"

        found, column = self._first_match(findings, is_keyword_binary)
        if found:
            return column, "binary-candidate"

        return None, "not-found"

    def print_detection(self, target_column: Optional[str], method: str,
                        other_candidates: Optional[List[str]] = None):
        """Print a one-off message describing how the target was determined.

        Unrecognised ``method`` values print an empty line.
        """
        if method == "binary-candidate":
            print(f"\n🔍 No explicit target detected, using binary candidate: {target_column}")
            if other_candidates:
                print(f" Other candidates: {other_candidates}")
            return

        if method == "deferred":
            text = "\n⏳ Target deferred to multi-dataset notebook (05)\n Analysis will proceed without target-based comparisons"
        elif method == "override":
            text = f"\n🔧 Using override target: {target_column}"
        elif method == "auto-detected":
            text = f"\n🔍 Auto-detected target: {target_column}"
        elif method == "not-found":
            text = "\n🔍 No target column detected"
        else:
            text = ""
        print(text)