churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,454 @@
1
+ """RelationshipRecommender - generates actionable recommendations from relationship analysis."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from enum import Enum
5
+ from typing import Any, Dict, List, Optional
6
+
7
+ import numpy as np
8
+
9
+ from customer_retention.core.compat import pd
10
+
11
+
12
class RecommendationCategory(Enum):
    """Closed set of buckets a relationship recommendation can belong to.

    The member values are plain lowercase strings so they can be emitted
    directly when a recommendation is serialised.
    """

    # Which features to keep, prioritise, or drop.
    FEATURE_SELECTION = "feature_selection"
    # New or derived features worth building.
    FEATURE_ENGINEERING = "feature_engineering"
    # How to split/sample the data so segments stay represented.
    STRATIFICATION = "stratification"
    # Guidance on the family of model to use.
    MODEL_SELECTION = "model_selection"
17
+
18
+
19
@dataclass
class RelationshipRecommendation:
    """A single actionable recommendation together with its supporting evidence."""

    category: RecommendationCategory
    title: str
    description: str
    action: str
    priority: str  # "high", "medium", "low"
    affected_features: List[str] = field(default_factory=list)
    evidence: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return the recommendation as a plain dictionary.

        The category is emitted as its enum value; every other field is
        passed through as-is (lists and dicts are not copied).
        """
        payload: Dict[str, Any] = {"category": self.category.value}
        passthrough = (
            "title",
            "description",
            "action",
            "priority",
            "affected_features",
            "evidence",
        )
        for attr_name in passthrough:
            payload[attr_name] = getattr(self, attr_name)
        return payload
39
+
40
+
41
@dataclass
class RelationshipAnalysisSummary:
    """Container for everything a relationship analysis run produced."""

    recommendations: List[RelationshipRecommendation]
    correlation_matrix: Optional[pd.DataFrame] = None
    strong_predictors: List[Dict[str, Any]] = field(default_factory=list)
    weak_predictors: List[str] = field(default_factory=list)
    multicollinear_pairs: List[Dict[str, Any]] = field(default_factory=list)
    high_risk_segments: List[Dict[str, Any]] = field(default_factory=list)
    categorical_associations: List[Dict[str, Any]] = field(default_factory=list)

    @property
    def recommendations_by_category(self) -> Dict[RecommendationCategory, List[RelationshipRecommendation]]:
        """Group the recommendations by category.

        Every category gets a key, even when no recommendation falls into it.
        """
        buckets: Dict[RecommendationCategory, List[RelationshipRecommendation]] = {}
        for category in RecommendationCategory:
            buckets[category] = []
        for item in self.recommendations:
            buckets[item.category].append(item)
        return buckets

    @property
    def high_priority_actions(self) -> List[RelationshipRecommendation]:
        """Return, in original order, the recommendations whose priority is "high"."""
        urgent: List[RelationshipRecommendation] = []
        for item in self.recommendations:
            if item.priority == "high":
                urgent.append(item)
        return urgent
61
+
62
+
63
+ class RelationshipRecommender:
64
+ HIGH_CORRELATION_THRESHOLD = 0.7
65
+ STRONG_PREDICTOR_THRESHOLD = 0.3
66
+ WEAK_PREDICTOR_THRESHOLD = 0.1
67
+ HIGH_RISK_LIFT_THRESHOLD = 0.85
68
+ MIN_CATEGORY_SIZE = 10
69
+
70
def analyze(
    self,
    df: pd.DataFrame,
    numeric_cols: Optional[List[str]] = None,
    categorical_cols: Optional[List[str]] = None,
    target_col: Optional[str] = None,
) -> RelationshipAnalysisSummary:
    """Run all relationship analyses on ``df`` and bundle the results.

    Numeric correlation analysis runs only when ``numeric_cols`` is
    non-empty, and predictive-power analysis additionally requires
    ``target_col``. Categorical analysis requires both ``categorical_cols``
    and ``target_col``. Model-selection and feature-engineering
    recommendations are always requested.

    Recommendations are accumulated in this order: multicollinearity,
    predictive power, categorical, model selection, feature engineering.
    """
    numeric_cols = numeric_cols if numeric_cols else []
    categorical_cols = categorical_cols if categorical_cols else []

    collected = []
    matrix = None
    strong, weak = [], []
    collinear, risky, associations = [], [], []

    # Numeric features: pairwise correlations, then relation to the target.
    if numeric_cols:
        numeric_report = self._analyze_numeric_correlations(df, numeric_cols, target_col)
        matrix = numeric_report["correlation_matrix"]
        collinear = numeric_report["multicollinear_pairs"]
        collected.extend(numeric_report["recommendations"])

        if target_col:
            power_report = self._analyze_predictive_power(df, numeric_cols, target_col)
            strong = power_report["strong"]
            weak = power_report["weak"]
            collected.extend(power_report["recommendations"])

    # Categorical features only make sense relative to a target.
    if categorical_cols and target_col:
        categorical_report = self._analyze_categorical_relationships(df, categorical_cols, target_col)
        risky = categorical_report["high_risk_segments"]
        associations = categorical_report["associations"]
        collected.extend(categorical_report["recommendations"])

    # Model selection advice is derived from the findings gathered above.
    collected.extend(self._generate_model_recommendations(collinear, strong, associations))

    # Feature engineering advice works off the raw frame.
    collected.extend(self._generate_engineering_recommendations(df, numeric_cols, target_col))

    return RelationshipAnalysisSummary(
        recommendations=collected,
        correlation_matrix=matrix,
        strong_predictors=strong,
        weak_predictors=weak,
        multicollinear_pairs=collinear,
        high_risk_segments=risky,
        categorical_associations=associations,
    )
127
+
128
+ def _analyze_numeric_correlations(
129
+ self, df: pd.DataFrame, numeric_cols: List[str], target_col: Optional[str]
130
+ ) -> Dict[str, Any]:
131
+ recommendations = []
132
+ multicollinear_pairs = []
133
+
134
+ cols_to_analyze = [c for c in numeric_cols if c in df.columns]
135
+ if len(cols_to_analyze) < 2:
136
+ return {
137
+ "correlation_matrix": None,
138
+ "multicollinear_pairs": [],
139
+ "recommendations": [],
140
+ }
141
+
142
+ correlation_matrix = df[cols_to_analyze].corr()
143
+
144
+ # Find multicollinear pairs
145
+ for i, col1 in enumerate(cols_to_analyze):
146
+ for col2 in cols_to_analyze[i + 1:]:
147
+ if col1 == target_col or col2 == target_col:
148
+ continue
149
+ corr_val = correlation_matrix.loc[col1, col2]
150
+ if abs(corr_val) >= self.HIGH_CORRELATION_THRESHOLD:
151
+ multicollinear_pairs.append({
152
+ "feature1": col1,
153
+ "feature2": col2,
154
+ "correlation": float(corr_val),
155
+ })
156
+
157
+ # Generate recommendations for multicollinearity
158
+ if multicollinear_pairs:
159
+ for pair in multicollinear_pairs:
160
+ recommendations.append(RelationshipRecommendation(
161
+ category=RecommendationCategory.FEATURE_SELECTION,
162
+ title="Remove multicollinear feature",
163
+ description=f"{pair['feature1']} and {pair['feature2']} are highly correlated (r={pair['correlation']:.2f})",
164
+ action="Consider dropping one of these features. Keep the one with stronger business meaning or higher target correlation.",
165
+ priority="high" if abs(pair["correlation"]) >= 0.85 else "medium",
166
+ affected_features=[pair["feature1"], pair["feature2"]],
167
+ evidence={"correlation": pair["correlation"]},
168
+ ))
169
+
170
+ return {
171
+ "correlation_matrix": correlation_matrix,
172
+ "multicollinear_pairs": multicollinear_pairs,
173
+ "recommendations": recommendations,
174
+ }
175
+
176
+ def _analyze_predictive_power(
177
+ self, df: pd.DataFrame, numeric_cols: List[str], target_col: str
178
+ ) -> Dict[str, Any]:
179
+ recommendations = []
180
+ strong = []
181
+ weak = []
182
+
183
+ if target_col not in df.columns:
184
+ return {"strong": [], "weak": [], "recommendations": []}
185
+
186
+ df[target_col]
187
+
188
+ for col in numeric_cols:
189
+ if col == target_col or col not in df.columns:
190
+ continue
191
+
192
+ corr = df[[col, target_col]].corr().iloc[0, 1]
193
+ effect_size = self._calculate_effect_size(df, col, target_col)
194
+
195
+ predictor_info = {
196
+ "feature": col,
197
+ "correlation": float(corr),
198
+ "effect_size": effect_size,
199
+ }
200
+
201
+ if abs(effect_size) >= 0.5 or abs(corr) >= self.STRONG_PREDICTOR_THRESHOLD:
202
+ strong.append(predictor_info)
203
+ elif abs(effect_size) < 0.2 and abs(corr) < self.WEAK_PREDICTOR_THRESHOLD:
204
+ weak.append(col)
205
+
206
+ # Recommendations for strong predictors
207
+ if strong:
208
+ top_predictors = sorted(strong, key=lambda x: abs(x["effect_size"]), reverse=True)[:3]
209
+ features_list = [p["feature"] for p in top_predictors]
210
+ recommendations.append(RelationshipRecommendation(
211
+ category=RecommendationCategory.FEATURE_SELECTION,
212
+ title="Prioritize strong predictors",
213
+ description=f"Top predictive features: {', '.join(features_list)}",
214
+ action="Ensure these features are included in your model and check for data quality issues.",
215
+ priority="high",
216
+ affected_features=features_list,
217
+ evidence={"predictors": top_predictors},
218
+ ))
219
+
220
+ # Recommendations for weak predictors
221
+ if weak:
222
+ recommendations.append(RelationshipRecommendation(
223
+ category=RecommendationCategory.FEATURE_SELECTION,
224
+ title="Consider removing weak predictors",
225
+ description=f"Features with low predictive power: {', '.join(weak[:5])}",
226
+ action="These features may add noise. Consider removing or combining with other features.",
227
+ priority="low",
228
+ affected_features=weak[:5],
229
+ evidence={"weak_features": weak},
230
+ ))
231
+
232
+ return {"strong": strong, "weak": weak, "recommendations": recommendations}
233
+
234
+ def _calculate_effect_size(self, df: pd.DataFrame, col: str, target_col: str) -> float:
235
+ """Calculate Cohen's d effect size."""
236
+ group0 = df[df[target_col] == 0][col].dropna()
237
+ group1 = df[df[target_col] == 1][col].dropna()
238
+
239
+ if len(group0) < 2 or len(group1) < 2:
240
+ return 0.0
241
+
242
+ pooled_std = np.sqrt(
243
+ ((len(group0) - 1) * group0.std() ** 2 + (len(group1) - 1) * group1.std() ** 2)
244
+ / (len(group0) + len(group1) - 2)
245
+ )
246
+
247
+ if pooled_std == 0:
248
+ return 0.0
249
+
250
+ return float((group1.mean() - group0.mean()) / pooled_std)
251
+
252
+ def _analyze_categorical_relationships(
253
+ self, df: pd.DataFrame, categorical_cols: List[str], target_col: str
254
+ ) -> Dict[str, Any]:
255
+ recommendations = []
256
+ high_risk_segments = []
257
+ associations = []
258
+
259
+ if target_col not in df.columns:
260
+ return {"high_risk_segments": [], "associations": [], "recommendations": []}
261
+
262
+ overall_rate = df[target_col].mean()
263
+
264
+ for col in categorical_cols:
265
+ if col not in df.columns:
266
+ continue
267
+
268
+ # Calculate retention rates by category
269
+ cat_stats = df.groupby(col)[target_col].agg(["mean", "count"]).reset_index()
270
+ cat_stats.columns = [col, "retention_rate", "count"]
271
+ cat_stats["lift"] = cat_stats["retention_rate"] / overall_rate
272
+
273
+ # Calculate Cramér's V
274
+ cramers_v = self._calculate_cramers_v(df, col, target_col)
275
+ associations.append({"feature": col, "cramers_v": cramers_v})
276
+
277
+ # Identify high-risk segments
278
+ for _, row in cat_stats.iterrows():
279
+ if row["count"] >= self.MIN_CATEGORY_SIZE and row["lift"] < self.HIGH_RISK_LIFT_THRESHOLD:
280
+ high_risk_segments.append({
281
+ "feature": col,
282
+ "segment": row[col],
283
+ "retention_rate": float(row["retention_rate"]),
284
+ "lift": float(row["lift"]),
285
+ "count": int(row["count"]),
286
+ })
287
+
288
+ # Check if category sizes are imbalanced
289
+ size_ratio = cat_stats["count"].max() / cat_stats["count"].min() if cat_stats["count"].min() > 0 else float("inf")
290
+ rate_spread = cat_stats["retention_rate"].max() - cat_stats["retention_rate"].min()
291
+
292
+ if rate_spread > 0.15 or size_ratio > 10:
293
+ recommendations.append(RelationshipRecommendation(
294
+ category=RecommendationCategory.STRATIFICATION,
295
+ title=f"Stratify by {col}",
296
+ description=f"Significant variation in retention rates across {col} categories (spread: {rate_spread:.1%})",
297
+ action=f"Use stratified sampling by {col} in train/test split to ensure all segments are represented.",
298
+ priority="high" if rate_spread > 0.25 else "medium",
299
+ affected_features=[col],
300
+ evidence={"rate_spread": rate_spread, "size_ratio": size_ratio, "cramers_v": cramers_v},
301
+ ))
302
+
303
+ # High risk segment recommendations
304
+ if high_risk_segments:
305
+ segment_names = list(set(s["segment"] for s in high_risk_segments[:3]))
306
+ recommendations.append(RelationshipRecommendation(
307
+ category=RecommendationCategory.STRATIFICATION,
308
+ title="Monitor high-risk segments",
309
+ description=f"Segments with below-average retention: {', '.join(str(s) for s in segment_names)}",
310
+ action="Target these segments for intervention campaigns and ensure adequate representation in training data.",
311
+ priority="high",
312
+ affected_features=[s["feature"] for s in high_risk_segments[:3]],
313
+ evidence={"high_risk_segments": high_risk_segments[:5]},
314
+ ))
315
+
316
+ return {
317
+ "high_risk_segments": high_risk_segments,
318
+ "associations": associations,
319
+ "recommendations": recommendations,
320
+ }
321
+
322
+ def _calculate_cramers_v(self, df: pd.DataFrame, col: str, target_col: str) -> float:
323
+ """Calculate Cramér's V for categorical association."""
324
+ try:
325
+ from scipy.stats import chi2_contingency
326
+ contingency = pd.crosstab(df[col], df[target_col])
327
+ chi2, _, _, _ = chi2_contingency(contingency)
328
+ n = len(df)
329
+ min_dim = min(contingency.shape) - 1
330
+ if min_dim == 0:
331
+ return 0.0
332
+ return float(np.sqrt(chi2 / (n * min_dim)))
333
+ except Exception:
334
+ return 0.0
335
+
336
def _generate_model_recommendations(
    self,
    multicollinear_pairs: List[Dict],
    strong_predictors: List[Dict],
    categorical_associations: List[Dict],
) -> List[RelationshipRecommendation]:
    """Build model-selection recommendations from earlier analysis results.

    Up to three recommendations are produced, in this order: one about
    multicollinearity (only when ``multicollinear_pairs`` is non-empty), one
    about predictor strength (always; its wording depends on whether there
    are strong predictors and on their mean absolute ``"effect_size"``),
    and one about categorical features whose ``"cramers_v"`` is at least 0.2.

    Args:
        multicollinear_pairs: Highly correlated feature pairs; only the
            number of entries is used.
        strong_predictors: Dicts carrying ``"feature"`` and ``"effect_size"``.
        categorical_associations: Dicts carrying ``"feature"`` and,
            optionally, ``"cramers_v"`` (treated as 0 when absent).

    Returns:
        The list of ``RelationshipRecommendation`` objects.
    """
    result = []
    model_selection = RecommendationCategory.MODEL_SELECTION

    # 1) Multicollinearity
    if multicollinear_pairs:
        pair_count = len(multicollinear_pairs)
        result.append(RelationshipRecommendation(
            category=model_selection,
            title="Consider tree-based models for multicollinearity",
            description=f"Found {pair_count} highly correlated feature pairs",
            action="Tree-based models (Random Forest, XGBoost) are robust to multicollinearity. For linear models, remove redundant features first.",
            priority="medium",
            affected_features=[],
            evidence={"n_multicollinear_pairs": pair_count},
        ))

    # 2) Predictor strength
    if not strong_predictors:
        result.append(RelationshipRecommendation(
            category=model_selection,
            title="Explore ensemble methods",
            description="No strong linear predictors identified",
            action="Use tree-based ensembles to discover non-linear patterns and feature interactions.",
            priority="medium",
            affected_features=[],
            evidence={},
        ))
    else:
        magnitudes = [abs(item["effect_size"]) for item in strong_predictors]
        avg_effect = np.mean(magnitudes)
        if avg_effect >= 0.5:
            predictor_names = [item["feature"] for item in strong_predictors]
            result.append(RelationshipRecommendation(
                category=model_selection,
                title="Linear models may perform well",
                description=f"Strong linear relationships detected (avg effect size: {avg_effect:.2f})",
                action="Start with Logistic Regression as baseline. Clear feature-target relationships suggest interpretable models may work well.",
                priority="medium",
                affected_features=predictor_names,
                evidence={"avg_effect_size": avg_effect},
            ))
        else:
            result.append(RelationshipRecommendation(
                category=model_selection,
                title="Non-linear models may improve performance",
                description="Moderate effect sizes suggest potential non-linear relationships",
                action="Try ensemble methods (Random Forest, Gradient Boosting) to capture non-linear patterns and interactions.",
                priority="medium",
                affected_features=[],
                evidence={"avg_effect_size": avg_effect},
            ))

    # 3) Categorical strength
    strong_cats = [
        assoc for assoc in categorical_associations
        if assoc.get("cramers_v", 0) >= 0.2
    ]
    if strong_cats:
        cat_names = [assoc["feature"] for assoc in strong_cats]
        joined_names = ", ".join(cat_names)
        result.append(RelationshipRecommendation(
            category=model_selection,
            title="Categorical features are predictive",
            description=f"Strong categorical associations: {joined_names}",
            action="Use target encoding for tree-based models or one-hot encoding for linear models. Consider CatBoost for native categorical handling.",
            priority="medium",
            affected_features=cat_names,
            evidence={"strong_categorical": strong_cats},
        ))

    return result
405
+
406
+ def _generate_engineering_recommendations(
407
+ self,
408
+ df: pd.DataFrame,
409
+ numeric_cols: List[str],
410
+ target_col: Optional[str],
411
+ ) -> List[RelationshipRecommendation]:
412
+ recommendations = []
413
+
414
+ if not numeric_cols or len(numeric_cols) < 2:
415
+ return recommendations
416
+
417
+ # Suggest ratio features for correlated pairs
418
+ cols_in_df = [c for c in numeric_cols if c in df.columns and c != target_col]
419
+ if len(cols_in_df) >= 2:
420
+ # Check for potential ratio/interaction features
421
+ corr_matrix = df[cols_in_df].corr()
422
+ moderate_pairs = []
423
+
424
+ for i, col1 in enumerate(cols_in_df):
425
+ for col2 in cols_in_df[i + 1:]:
426
+ corr = corr_matrix.loc[col1, col2]
427
+ if 0.3 <= abs(corr) < 0.7:
428
+ moderate_pairs.append((col1, col2, corr))
429
+
430
+ if moderate_pairs:
431
+ pair_strs = [f"{p[0]}/{p[1]}" for p in moderate_pairs[:3]]
432
+ recommendations.append(RelationshipRecommendation(
433
+ category=RecommendationCategory.FEATURE_ENGINEERING,
434
+ title="Consider ratio features",
435
+ description=f"Moderately correlated pairs may benefit from ratio features: {', '.join(pair_strs)}",
436
+ action="Create ratio features (e.g., feature_a / feature_b) to capture relative relationships.",
437
+ priority="low",
438
+ affected_features=[p[0] for p in moderate_pairs[:3]] + [p[1] for p in moderate_pairs[:3]],
439
+ evidence={"moderate_pairs": moderate_pairs[:3]},
440
+ ))
441
+
442
+ # General interaction recommendation
443
+ if len(cols_in_df) >= 2:
444
+ recommendations.append(RelationshipRecommendation(
445
+ category=RecommendationCategory.FEATURE_ENGINEERING,
446
+ title="Test feature interactions",
447
+ description="Interaction terms may capture non-linear relationships",
448
+ action="Use PolynomialFeatures(interaction_only=True) or tree-based models which automatically discover interactions.",
449
+ priority="low",
450
+ affected_features=cols_in_df[:4],
451
+ evidence={},
452
+ ))
453
+
454
+ return recommendations