churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,692 @@
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+ from typing import Any, Dict, List, Optional, Tuple
4
+
5
+ import numpy as np
6
+ from scipy import stats
7
+
8
+ from customer_retention.core.compat import DataFrame, pd
9
+ from customer_retention.core.utils import compute_effect_size
10
+
11
+
12
class FeatureType(str, Enum):
    """Kinds of temporal features this module can recommend.

    Inherits from ``str`` so members compare and serialize as plain strings.
    """
    VELOCITY = "velocity"          # first difference over a window (rate of change)
    ACCELERATION = "acceleration"  # second difference (change in velocity)
    MOMENTUM = "momentum"          # short-window mean relative to long-window mean
    LAG = "lag"                    # presumably a lagged value of a series — producer not visible here
    ROLLING = "rolling"            # presumably a rolling-window aggregate — producer not visible here
    RATIO = "ratio"                # presumably a ratio between series — producer not visible here
19
+
20
+
21
@dataclass
class VelocityResult:
    """Summary of the rate of change of one column over a fixed window."""
    column: str           # source column the velocity was computed from
    window_days: int      # differencing window, in days
    mean_velocity: float  # average per-day change; 0.0 when undefined (all-NaN)
    std_velocity: float   # std dev of the per-day change; 0.0 when undefined
    trend_direction: str  # "increasing" | "decreasing" | "stable"
28
+
29
+
30
@dataclass
class MomentumResult:
    """Short- vs long-window activity ratio for one column."""
    column: str
    short_window: int     # recent window, in days
    long_window: int      # baseline window, in days
    mean_momentum: float  # mean of per-entity (short mean / long mean) ratios
    std_momentum: float
    interpretation: str   # producer is cut off in this view — semantics not documented here
38
+
39
+
40
@dataclass
class LagCorrelationResult:
    """Autocorrelation of a column across a range of lags."""
    column: str
    correlations: List[float]  # one correlation per lag offset
    best_lag: int              # lag with the strongest correlation
    best_correlation: float
    has_weekly_pattern: bool   # presumably flags lag-7 periodicity — TODO confirm against producer
47
+
48
+
49
@dataclass
class PredictivePowerResult:
    """Univariate predictive-power metrics for a column against the target."""
    column: str
    information_value: float  # IV; bands in TemporalFeatureAnalyzer.IV_THRESHOLDS
    iv_interpretation: str
    ks_statistic: float       # Kolmogorov-Smirnov statistic; bands in KS_THRESHOLDS
    ks_pvalue: float
    ks_interpretation: str
57
+
58
+
59
@dataclass
class CohortComparison:
    """Aggregate temporal signals for a single cohort."""
    velocity: float
    momentum: float
    mean_value: float
64
+
65
+
66
@dataclass
class FeatureRecommendation:
    """A suggested engineered feature derived from temporal analysis."""
    feature_name: str
    feature_type: FeatureType
    formula: str       # human-readable construction recipe
    rationale: str
    priority: int      # presumably lower number = higher priority — TODO confirm producer convention
    source_column: str
74
+
75
+
76
@dataclass
class CohortVelocityResult:
    """Velocity/acceleration series for retained vs churned cohorts at one window."""
    column: str
    window_days: int
    retained_velocity: List[float]  # period-over-period deltas of means, rows with target == 1
    churned_velocity: List[float]   # same, rows with target == 0
    overall_velocity: List[float]   # same, all rows
    retained_accel: List[float]     # deltas of the corresponding velocity series
    churned_accel: List[float]
    overall_accel: List[float]
    velocity_effect_size: float     # effect size retained vs churned (via compute_effect_size)
    velocity_effect_interp: str
    accel_effect_size: float
    accel_effect_interp: str
    period_label: str               # e.g. "Weekly", "Monthly"; see _WINDOW_MAPPING
91
+
92
+
93
@dataclass
class VelocityRecommendation:
    """Actionable recommendation emitted by generate_velocity_recommendations."""
    source_column: str
    action: str             # "add_velocity_feature" or "add_acceleration_feature"
    description: str
    params: Dict[str, Any]  # {"window_days": ..., "period": ...}
    effect_size: float
    priority: int           # 1 when |effect| >= 0.8, else 2 (lower = higher priority)
101
+
102
+
103
@dataclass
class CohortMomentumResult:
    """Momentum comparison between retained and churned cohorts for one column."""
    column: str
    short_window: int   # recent window, in days
    long_window: int    # baseline window, in days
    retained_momentum: float
    churned_momentum: float
    overall_momentum: float
    effect_size: float
    effect_interp: str
    window_label: str
114
+
115
+
116
class TemporalFeatureAnalyzer:
    """Analyzes time-stamped, per-entity data for velocity, acceleration and
    momentum signals and turns strong cohort differences into feature
    recommendations.
    """
    # Information-value bands; "suspicious" presumably flags leakage-strength
    # separation — TODO confirm against the consumer of these thresholds.
    IV_THRESHOLDS = {"weak": 0.02, "medium": 0.1, "strong": 0.3, "suspicious": 0.5}
    # Kolmogorov-Smirnov statistic bands.
    KS_THRESHOLDS = {"weak": 0.2, "medium": 0.4}
119
+
120
    def __init__(self, time_column: str, entity_column: str):
        """Store the timestamp and entity-id column names used by all analyses."""
        self.time_column = time_column
        self.entity_column = entity_column
123
+
124
+ def calculate_velocity(
125
+ self, df: DataFrame, value_columns: List[str], window_days: int = 7
126
+ ) -> Dict[str, VelocityResult]:
127
+ df = self._prepare_dataframe(df)
128
+ return {col: self._velocity_for_column(df, col, window_days)
129
+ for col in value_columns if col in df.columns}
130
+
131
+ def _daily_diff_series(self, df: DataFrame, col: str, window_days: int):
132
+ daily = df.groupby(df[self.time_column].dt.date)[col].mean()
133
+ return daily.diff(window_days)
134
+
135
+ def _velocity_for_column(self, df: DataFrame, col: str, window_days: int) -> VelocityResult:
136
+ velocity = self._daily_diff_series(df, col, window_days) / window_days
137
+ mean_vel = velocity.mean()
138
+ return VelocityResult(
139
+ column=col,
140
+ window_days=window_days,
141
+ mean_velocity=float(mean_vel) if not np.isnan(mean_vel) else 0.0,
142
+ std_velocity=float(velocity.std()) if not np.isnan(velocity.std()) else 0.0,
143
+ trend_direction=self._classify_trend(mean_vel),
144
+ )
145
+
146
+ def _classify_trend(self, mean_velocity: float) -> str:
147
+ if mean_velocity > 0.01:
148
+ return "increasing"
149
+ return "decreasing" if mean_velocity < -0.01 else "stable"
150
+
151
+ def calculate_acceleration(self, df: DataFrame, value_columns: List[str], window_days: int = 7) -> Dict[str, float]:
152
+ df = self._prepare_dataframe(df)
153
+ return {col: self._acceleration_for_column(df, col, window_days)
154
+ for col in value_columns if col in df.columns}
155
+
156
+ def _acceleration_for_column(self, df: DataFrame, col: str, window_days: int) -> float:
157
+ acceleration = self._daily_diff_series(df, col, window_days).diff(window_days)
158
+ return float(acceleration.mean()) if not np.isnan(acceleration.mean()) else 0.0
159
+
160
+ def compute_cohort_velocity_signals(
161
+ self, df: DataFrame, value_columns: List[str], target_column: str,
162
+ windows: Optional[List[int]] = None
163
+ ) -> Dict[str, List[CohortVelocityResult]]:
164
+ if target_column not in df.columns:
165
+ raise ValueError(f"target_column '{target_column}' not found in DataFrame")
166
+ windows = windows or [7, 14, 30, 90, 180, 365]
167
+ df = self._prepare_dataframe(df)
168
+ retained_df = df[df[target_column] == 1]
169
+ churned_df = df[df[target_column] == 0]
170
+ results = {}
171
+ for col in value_columns:
172
+ if col not in df.columns:
173
+ continue
174
+ col_results = []
175
+ for window in windows:
176
+ result = self._cohort_velocity_for_window(
177
+ retained_df, churned_df, col, window, df
178
+ )
179
+ col_results.append(result)
180
+ results[col] = col_results
181
+ return results
182
+
183
+ def _cohort_velocity_for_window(
184
+ self, retained_df: DataFrame, churned_df: DataFrame, col: str, window: int,
185
+ overall_df: DataFrame
186
+ ) -> CohortVelocityResult:
187
+ ret_vel, ret_accel = self._velocity_accel_series(retained_df, col, window)
188
+ churn_vel, churn_accel = self._velocity_accel_series(churned_df, col, window)
189
+ overall_vel, overall_accel = self._velocity_accel_series(overall_df, col, window)
190
+ vel_d, vel_interp = compute_effect_size(ret_vel, churn_vel)
191
+ accel_d, accel_interp = compute_effect_size(ret_accel, churn_accel)
192
+ period_label = self._window_to_period_label(window)
193
+ return CohortVelocityResult(
194
+ column=col, window_days=window,
195
+ retained_velocity=ret_vel, churned_velocity=churn_vel, overall_velocity=overall_vel,
196
+ retained_accel=ret_accel, churned_accel=churn_accel, overall_accel=overall_accel,
197
+ velocity_effect_size=vel_d, velocity_effect_interp=vel_interp,
198
+ accel_effect_size=accel_d, accel_effect_interp=accel_interp,
199
+ period_label=period_label
200
+ )
201
+
202
    # Ordered (max_days, pandas period code, human label) triples; scanned in
    # order by _get_window_info, so entries must stay sorted ascending by
    # threshold. Windows beyond 180 days fall through to ("Y", "Yearly").
    _WINDOW_MAPPING = [
        (7, "W", "Weekly"),
        (14, "2W", "Bi-weekly"),
        (30, "M", "Monthly"),
        (90, "Q", "Quarterly"),
        (180, "2Q", "Semi-annual"),
    ]
209
+
210
+ def _get_window_info(self, window_days: int) -> tuple:
211
+ for threshold, period_code, label in self._WINDOW_MAPPING:
212
+ if window_days <= threshold:
213
+ return period_code, label
214
+ return "Y", "Yearly"
215
+
216
+ def _window_to_period_label(self, window_days: int) -> str:
217
+ return self._get_window_info(window_days)[1]
218
+
219
+ def _window_to_period(self, window_days: int) -> str:
220
+ return self._get_window_info(window_days)[0]
221
+
222
+ def _velocity_accel_series(self, df: DataFrame, col: str, window: int) -> Tuple[List[float], List[float]]:
223
+ if df.empty or col not in df.columns:
224
+ return [], []
225
+ period_code = self._window_to_period(window)
226
+ period_col = df[self.time_column].dt.to_period(period_code).dt.start_time
227
+ period_means = df.groupby(period_col)[col].mean()
228
+ velocity = period_means.diff().dropna()
229
+ accel = velocity.diff().dropna()
230
+ return velocity.tolist(), accel.tolist()
231
+
232
+ def generate_velocity_recommendations(
233
+ self, results: Dict[str, List[CohortVelocityResult]]
234
+ ) -> List[VelocityRecommendation]:
235
+ recommendations = []
236
+ for col, col_results in results.items():
237
+ best = self._find_best_velocity_window(col_results)
238
+ if best and abs(best.velocity_effect_size) >= 0.5:
239
+ recommendations.append(VelocityRecommendation(
240
+ source_column=col, action="add_velocity_feature",
241
+ description=f"Add {best.period_label} velocity for {col} (d={best.velocity_effect_size:.2f})",
242
+ params={"window_days": best.window_days, "period": best.period_label},
243
+ effect_size=best.velocity_effect_size, priority=1 if abs(best.velocity_effect_size) >= 0.8 else 2
244
+ ))
245
+ if best and abs(best.accel_effect_size) >= 0.5:
246
+ recommendations.append(VelocityRecommendation(
247
+ source_column=col, action="add_acceleration_feature",
248
+ description=f"Add {best.period_label} acceleration for {col} (d={best.accel_effect_size:.2f})",
249
+ params={"window_days": best.window_days, "period": best.period_label},
250
+ effect_size=best.accel_effect_size, priority=2
251
+ ))
252
+ return sorted(recommendations, key=lambda r: (-abs(r.effect_size), r.priority))
253
+
254
+ def _find_best_velocity_window(
255
+ self, results: List[CohortVelocityResult]
256
+ ) -> Optional[CohortVelocityResult]:
257
+ if not results:
258
+ return None
259
+ return max(results, key=lambda r: abs(r.velocity_effect_size))
260
+
261
+ def generate_velocity_interpretation(
262
+ self, results: Dict[str, List[CohortVelocityResult]]
263
+ ) -> List[str]:
264
+ notes = []
265
+ for col, col_results in results.items():
266
+ best = self._find_best_velocity_window(col_results)
267
+ if not best:
268
+ continue
269
+ d = best.velocity_effect_size
270
+ if abs(d) >= 0.8:
271
+ direction = "increasing" if d > 0 else "decreasing"
272
+ notes.append(f"• {col}: Strong signal at {best.period_label} - retained customers show "
273
+ f"{direction} velocity vs churned (d={d:.2f})")
274
+ elif abs(d) >= 0.5:
275
+ notes.append(f"• {col}: Moderate signal at {best.period_label} (d={d:.2f}) - "
276
+ f"consider as secondary predictor")
277
+ elif abs(d) >= 0.2:
278
+ notes.append(f"• {col}: Weak signal at {best.period_label} (d={d:.2f}) - "
279
+ f"may contribute in feature combinations")
280
+ else:
281
+ notes.append(f"• {col}: No significant velocity difference between cohorts")
282
+ return notes
283
+
284
+ def calculate_momentum(
285
+ self, df: DataFrame, value_columns: List[str], short_window: int = 7, long_window: int = 30
286
+ ) -> Dict[str, MomentumResult]:
287
+ df = self._prepare_dataframe(df)
288
+ reference_date = df[self.time_column].max()
289
+ return {col: self._momentum_for_column(df, col, short_window, long_window, reference_date)
290
+ for col in value_columns if col in df.columns}
291
+
292
+ def _momentum_for_column(
293
+ self, df: DataFrame, col: str, short_window: int, long_window: int, reference_date
294
+ ) -> MomentumResult:
295
+ entity_momentum = []
296
+ for entity_id in df[self.entity_column].unique():
297
+ entity_data = df[df[self.entity_column] == entity_id].copy()
298
+ entity_data["days_ago"] = (reference_date - entity_data[self.time_column]).dt.days
299
+ short_mean = entity_data[entity_data["days_ago"] <= short_window][col].mean()
300
+ long_mean = entity_data[entity_data["days_ago"] <= long_window][col].mean()
301
+ if long_mean > 0 and not np.isnan(short_mean):
302
+ entity_momentum.append(short_mean / long_mean)
303
+
304
+ mean_mom = np.mean(entity_momentum) if entity_momentum else 1.0
305
+ std_mom = np.std(entity_momentum) if entity_momentum else 0.0
306
+ return MomentumResult(
307
+ column=col, short_window=short_window, long_window=long_window,
308
+ mean_momentum=float(mean_mom), std_momentum=float(std_mom),
309
+ interpretation=self._classify_momentum(mean_mom),
310
+ )
311
+
312
+ def _classify_momentum(self, mean_momentum: float) -> str:
313
+ if mean_momentum > 1.1:
314
+ return "accelerating"
315
+ return "decelerating" if mean_momentum < 0.9 else "stable"
316
+
317
+ def compute_cohort_momentum_signals(
318
+ self, df: DataFrame, value_columns: List[str], target_column: str,
319
+ window_pairs: Optional[List[Tuple[int, int]]] = None
320
+ ) -> Dict[str, List[CohortMomentumResult]]:
321
+ if target_column not in df.columns:
322
+ raise ValueError(f"target_column '{target_column}' not found in DataFrame")
323
+ window_pairs = window_pairs or [(7, 30), (30, 90), (7, 90)]
324
+ df = self._prepare_dataframe(df)
325
+ retained_df = df[df[target_column] == 1]
326
+ churned_df = df[df[target_column] == 0]
327
+ results = {}
328
+ for col in value_columns:
329
+ if col not in df.columns:
330
+ continue
331
+ col_results = []
332
+ for short_w, long_w in window_pairs:
333
+ result = self._cohort_momentum_for_pair(
334
+ retained_df, churned_df, df, col, short_w, long_w
335
+ )
336
+ col_results.append(result)
337
+ results[col] = col_results
338
+ return results
339
+
340
+ def _cohort_momentum_for_pair(
341
+ self, retained_df: DataFrame, churned_df: DataFrame, overall_df: DataFrame,
342
+ col: str, short_w: int, long_w: int
343
+ ) -> CohortMomentumResult:
344
+ ret_values = self._vectorized_entity_momentum(retained_df, col, short_w, long_w)
345
+ churn_values = self._vectorized_entity_momentum(churned_df, col, short_w, long_w)
346
+ overall_values = self._vectorized_entity_momentum(overall_df, col, short_w, long_w)
347
+ ret_mom = float(np.mean(ret_values)) if ret_values else 1.0
348
+ churn_mom = float(np.mean(churn_values)) if churn_values else 1.0
349
+ overall_mom = float(np.mean(overall_values)) if overall_values else 1.0
350
+ d, interp = compute_effect_size(ret_values, churn_values)
351
+ return CohortMomentumResult(
352
+ column=col, short_window=short_w, long_window=long_w,
353
+ retained_momentum=ret_mom, churned_momentum=churn_mom, overall_momentum=overall_mom,
354
+ effect_size=d, effect_interp=interp, window_label=f"{short_w}d/{long_w}d"
355
+ )
356
+
357
+ def _vectorized_entity_momentum(
358
+ self, df: DataFrame, col: str, short_w: int, long_w: int
359
+ ) -> List[float]:
360
+ if df.empty or col not in df.columns:
361
+ return []
362
+ reference_date = df[self.time_column].max()
363
+ df_calc = df[[self.entity_column, self.time_column, col]].copy()
364
+ df_calc["_days_ago"] = (reference_date - df_calc[self.time_column]).dt.days
365
+ short_means = df_calc[df_calc["_days_ago"] <= short_w].groupby(self.entity_column)[col].mean()
366
+ long_means = df_calc[df_calc["_days_ago"] <= long_w].groupby(self.entity_column)[col].mean()
367
+ valid = (long_means > 0) & short_means.notna() & long_means.notna()
368
+ momentum = (short_means[valid] / long_means[valid]).dropna()
369
+ return momentum.tolist()
370
+
371
+ def generate_momentum_interpretation(
372
+ self, results: Dict[str, List[CohortMomentumResult]]
373
+ ) -> List[str]:
374
+ notes = []
375
+ for col, col_results in results.items():
376
+ best = max(col_results, key=lambda r: abs(r.effect_size)) if col_results else None
377
+ if not best:
378
+ continue
379
+ d = best.effect_size
380
+ ret_trend = "accelerating" if best.retained_momentum > 1.05 else "decelerating" if best.retained_momentum < 0.95 else "stable"
381
+ churn_trend = "accelerating" if best.churned_momentum > 1.05 else "decelerating" if best.churned_momentum < 0.95 else "stable"
382
+ if abs(d) >= 0.5:
383
+ notes.append(f"• {col}: Strong signal at {best.window_label} - "
384
+ f"retained {ret_trend} ({best.retained_momentum:.2f}), "
385
+ f"churned {churn_trend} ({best.churned_momentum:.2f}), d={d:.2f}")
386
+ elif abs(d) >= 0.2:
387
+ notes.append(f"• {col}: Moderate signal at {best.window_label} (d={d:.2f}) - "
388
+ f"retained={best.retained_momentum:.2f}, churned={best.churned_momentum:.2f}")
389
+ else:
390
+ notes.append(f"• {col}: No significant momentum difference between cohorts")
391
+ return notes
392
+
393
+ def generate_momentum_recommendations(
394
+ self, results: Dict[str, List[CohortMomentumResult]]
395
+ ) -> List[VelocityRecommendation]:
396
+ recommendations = []
397
+ for col, col_results in results.items():
398
+ best = max(col_results, key=lambda r: abs(r.effect_size)) if col_results else None
399
+ if best and abs(best.effect_size) >= 0.5:
400
+ recommendations.append(VelocityRecommendation(
401
+ source_column=col, action="add_momentum_feature",
402
+ description=f"Add {best.window_label} momentum for {col} (d={best.effect_size:.2f})",
403
+ params={"short_window": best.short_window, "long_window": best.long_window},
404
+ effect_size=best.effect_size, priority=1 if abs(best.effect_size) >= 0.8 else 2
405
+ ))
406
+ return sorted(recommendations, key=lambda r: (-abs(r.effect_size), r.priority))
407
+
408
+ def calculate_lag_correlations(
409
+ self, df: DataFrame, value_columns: List[str], max_lag: int = 14
410
+ ) -> Dict[str, LagCorrelationResult]:
411
+ df = self._prepare_dataframe(df)
412
+ return {col: self._lag_correlation_for_column(df, col, max_lag)
413
+ for col in value_columns if col in df.columns}
414
+
415
+ def _lag_correlation_for_column(self, df: DataFrame, col: str, max_lag: int) -> LagCorrelationResult:
416
+ daily = df.groupby(df[self.time_column].dt.date)[col].mean()
417
+ correlations = [
418
+ float(daily.autocorr(lag=lag)) if len(daily) > lag and not np.isnan(daily.autocorr(lag=lag)) else 0.0
419
+ for lag in range(1, max_lag + 1)
420
+ ]
421
+ best_idx = int(np.argmax(np.abs(correlations)))
422
+ return LagCorrelationResult(
423
+ column=col, correlations=correlations, best_lag=best_idx + 1,
424
+ best_correlation=correlations[best_idx] if correlations else 0.0,
425
+ has_weekly_pattern=abs(correlations[6] if len(correlations) >= 7 else 0) > 0.2,
426
+ )
427
+
428
+ def generate_lag_recommendations(self, results: Dict[str, LagCorrelationResult]) -> List[VelocityRecommendation]:
429
+ recommendations = []
430
+ for col, result in results.items():
431
+ if result.best_correlation >= 0.3:
432
+ recommendations.append(VelocityRecommendation(
433
+ source_column=col, action="add_lag_feature",
434
+ description=f"Add lag-{result.best_lag}d feature for {col} (r={result.best_correlation:.2f})",
435
+ params={"lag_days": result.best_lag, "correlation": result.best_correlation},
436
+ effect_size=result.best_correlation, priority=1 if result.best_correlation >= 0.5 else 2
437
+ ))
438
+ if result.has_weekly_pattern and result.best_lag != 7:
439
+ recommendations.append(VelocityRecommendation(
440
+ source_column=col, action="add_weekly_lag",
441
+ description=f"Add lag-7d feature for {col} (weekly pattern detected)",
442
+ params={"lag_days": 7, "weekly_pattern": True},
443
+ effect_size=abs(result.correlations[6]) if len(result.correlations) >= 7 else 0.2,
444
+ priority=2
445
+ ))
446
+ return sorted(recommendations, key=lambda r: (-r.effect_size, r.priority))
447
+
448
+ def generate_lag_interpretation(self, results: Dict[str, LagCorrelationResult]) -> List[str]:
449
+ notes = []
450
+ strong_lags = [(col, r) for col, r in results.items() if r.best_correlation >= 0.5]
451
+ moderate_lags = [(col, r) for col, r in results.items() if 0.3 <= r.best_correlation < 0.5]
452
+ weekly_patterns = [(col, r) for col, r in results.items() if r.has_weekly_pattern]
453
+ weak_lags = [(col, r) for col, r in results.items() if r.best_correlation < 0.3]
454
+
455
+ if strong_lags:
456
+ cols = ", ".join(col for col, _ in strong_lags)
457
+ notes.append(f"Strong autocorrelation (r >= 0.5): {cols}")
458
+ notes.append(" → These variables have high predictability from past values")
459
+ notes.append(" → Lag features will be highly informative")
460
+
461
+ if moderate_lags:
462
+ cols = ", ".join(col for col, _ in moderate_lags)
463
+ notes.append(f"Moderate autocorrelation (0.3 <= r < 0.5): {cols}")
464
+ notes.append(" → Past values provide useful but not dominant signal")
465
+
466
+ if weekly_patterns:
467
+ cols = ", ".join(col for col, _ in weekly_patterns)
468
+ notes.append(f"Weekly patterns detected: {cols}")
469
+ notes.append(" → Consider day_of_week features and lag-7d features")
470
+
471
+ if weak_lags and len(weak_lags) == len(results):
472
+ notes.append("All variables show weak autocorrelation (r < 0.3)")
473
+ notes.append(" → Lag features may not be highly predictive")
474
+ notes.append(" → Consider aggregated/rolling features instead")
475
+
476
+ return notes
477
+
478
+ def _validate_target_constant_per_entity(self, df: DataFrame, target_column: str) -> None:
479
+ import warnings
480
+ varying_entities = (df.groupby(self.entity_column)[target_column].nunique() > 1).sum()
481
+ if varying_entities > 0:
482
+ warnings.warn(
483
+ f"Target '{target_column}' varies within {varying_entities} entities. "
484
+ f"Using first value per entity. Target should be constant for retention modeling.",
485
+ UserWarning, stacklevel=3,
486
+ )
487
+
488
+ def calculate_predictive_power(self, df: DataFrame, value_columns: List[str], target_column: str) -> Dict[str, PredictivePowerResult]:
489
+ if self.time_column in df.columns:
490
+ df = self._prepare_dataframe(df)
491
+ self._validate_target_constant_per_entity(df, target_column)
492
+ entity_data = self._aggregate_to_entity_level(df, value_columns, target_column)
493
+ else:
494
+ entity_data = df # Already entity-level
495
+ return {col: self._predictive_power_for_column(entity_data, col, target_column)
496
+ for col in value_columns if col in entity_data.columns}
497
+
498
+ def _aggregate_to_entity_level(self, df: DataFrame, value_columns: List[str], target_column: str) -> DataFrame:
499
+ entity_features = df.groupby(self.entity_column)[value_columns].mean()
500
+ entity_target = df.groupby(self.entity_column)[target_column].first()
501
+ return entity_features.join(entity_target)
502
+
503
+ def _predictive_power_for_column(self, entity_data: DataFrame, col: str, target_column: str) -> PredictivePowerResult:
504
+ feature, target = entity_data[col], entity_data[target_column]
505
+ iv = self._calculate_iv(feature, target)
506
+ ks_stat, ks_pval = self._calculate_ks(feature, target)
507
+ return PredictivePowerResult(
508
+ column=col, information_value=iv, iv_interpretation=self._interpret_iv(iv),
509
+ ks_statistic=ks_stat, ks_pvalue=ks_pval, ks_interpretation=self._interpret_ks(ks_stat),
510
+ )
511
+
512
+ def compare_cohorts(
513
+ self, df: DataFrame, value_columns: List[str], target_column: str
514
+ ) -> Dict[str, Dict[str, CohortComparison]]:
515
+ """Compare metrics between retained and churned cohorts."""
516
+ df = self._prepare_dataframe(df)
517
+ self._validate_event_level_target_usage(df, target_column)
518
+ self._validate_target_constant_per_entity(df, target_column)
519
+
520
+ value_columns = [c for c in value_columns if c != target_column]
521
+ df = self._add_entity_target_column(df, target_column)
522
+
523
+ return {col: self._compare_cohorts_for_column(df, col)
524
+ for col in value_columns if col in df.columns}
525
+
526
+ def _add_entity_target_column(self, df: DataFrame, target_column: str) -> DataFrame:
527
+ entity_target = df.groupby(self.entity_column)[target_column].first()
528
+ return df.merge(entity_target.reset_index().rename(columns={target_column: "_target"}), on=self.entity_column)
529
+
530
+ def _compare_cohorts_for_column(self, df: DataFrame, col: str) -> Dict[str, CohortComparison]:
531
+ retained_df, churned_df = df[df["_target"] == 1], df[df["_target"] == 0]
532
+ return {
533
+ "retained": self._cohort_comparison(retained_df, col),
534
+ "churned": self._cohort_comparison(churned_df, col),
535
+ }
536
+
537
+ def _cohort_comparison(self, cohort_df: DataFrame, col: str) -> CohortComparison:
538
+ vel = self.calculate_velocity(cohort_df, [col])
539
+ mom = self.calculate_momentum(cohort_df, [col])
540
+ return CohortComparison(
541
+ velocity=vel[col].mean_velocity if col in vel else 0,
542
+ momentum=mom[col].mean_momentum if col in mom else 1,
543
+ mean_value=float(cohort_df[col].mean()),
544
+ )
545
+
546
+ def get_feature_recommendations(
547
+ self, df: DataFrame, value_columns: List[str], target_column: Optional[str] = None
548
+ ) -> List[FeatureRecommendation]:
549
+ recommendations: List[FeatureRecommendation] = []
550
+ next_priority = 1
551
+
552
+ if target_column:
553
+ next_priority = self._add_predictive_power_recommendations(df, value_columns, target_column, recommendations, next_priority)
554
+ next_priority = self._add_velocity_recommendations(df, value_columns, recommendations, next_priority)
555
+ next_priority = self._add_momentum_recommendations(df, value_columns, recommendations, next_priority)
556
+ self._add_lag_recommendations(df, value_columns, recommendations, next_priority)
557
+ return recommendations
558
+
559
+ def _add_predictive_power_recommendations(
560
+ self, df: DataFrame, value_columns: List[str], target_column: str,
561
+ recommendations: List[FeatureRecommendation], next_priority: int
562
+ ) -> int:
563
+ power_results = self.calculate_predictive_power(df, value_columns, target_column)
564
+ for col, result in sorted(power_results.items(), key=lambda x: x[1].information_value, reverse=True):
565
+ if result.information_value > self.IV_THRESHOLDS["weak"]:
566
+ recommendations.append(FeatureRecommendation(
567
+ feature_name=f"{col}_mean", feature_type=FeatureType.ROLLING,
568
+ formula=f"df.groupby(entity)['{col}'].transform('mean')",
569
+ rationale=f"IV={result.information_value:.3f} ({result.iv_interpretation})",
570
+ priority=next_priority, source_column=col,
571
+ ))
572
+ next_priority += 1
573
+ return next_priority
574
+
575
+ def _add_velocity_recommendations(
576
+ self, df: DataFrame, value_columns: List[str],
577
+ recommendations: List[FeatureRecommendation], next_priority: int
578
+ ) -> int:
579
+ for col, result in self.calculate_velocity(df, value_columns).items():
580
+ if result.trend_direction != "stable":
581
+ recommendations.append(FeatureRecommendation(
582
+ feature_name=f"{col}_velocity_7d", feature_type=FeatureType.VELOCITY,
583
+ formula="(current - lag_7d) / lag_7d",
584
+ rationale=f"Detected {result.trend_direction} trend",
585
+ priority=next_priority, source_column=col,
586
+ ))
587
+ next_priority += 1
588
+ return next_priority
589
+
590
+ def _add_momentum_recommendations(
591
+ self, df: DataFrame, value_columns: List[str],
592
+ recommendations: List[FeatureRecommendation], next_priority: int
593
+ ) -> int:
594
+ for col, result in self.calculate_momentum(df, value_columns).items():
595
+ if result.interpretation != "stable":
596
+ recommendations.append(FeatureRecommendation(
597
+ feature_name=f"{col}_momentum_{result.short_window}_{result.long_window}",
598
+ feature_type=FeatureType.MOMENTUM,
599
+ formula=f"mean_{result.short_window}d / mean_{result.long_window}d",
600
+ rationale=f"Momentum indicates {result.interpretation} behavior",
601
+ priority=next_priority, source_column=col,
602
+ ))
603
+ next_priority += 1
604
+ return next_priority
605
+
606
+ def _add_lag_recommendations(
607
+ self, df: DataFrame, value_columns: List[str],
608
+ recommendations: List[FeatureRecommendation], next_priority: int
609
+ ) -> int:
610
+ for col, result in self.calculate_lag_correlations(df, value_columns).items():
611
+ if result.best_correlation > 0.3:
612
+ recommendations.append(FeatureRecommendation(
613
+ feature_name=f"{col}_lag_{result.best_lag}d", feature_type=FeatureType.LAG,
614
+ formula=f"df['{col}'].shift({result.best_lag})",
615
+ rationale=f"Strong autocorrelation (r={result.best_correlation:.2f}) at lag {result.best_lag}",
616
+ priority=next_priority, source_column=col,
617
+ ))
618
+ next_priority += 1
619
+ if result.has_weekly_pattern:
620
+ recommendations.append(FeatureRecommendation(
621
+ feature_name=f"{col}_weekly_pattern", feature_type=FeatureType.LAG,
622
+ formula=f"df['{col}'].shift(7)", rationale="Weekly seasonality detected",
623
+ priority=next_priority, source_column=col,
624
+ ))
625
+ next_priority += 1
626
+ return next_priority
627
+
628
+ def _prepare_dataframe(self, df: DataFrame) -> DataFrame:
629
+ df = df.copy()
630
+ df[self.time_column] = pd.to_datetime(df[self.time_column])
631
+ return df
632
+
633
+ def _validate_event_level_target_usage(self, df: DataFrame, target_column: Optional[str]) -> None:
634
+ if target_column is None:
635
+ return
636
+ n_entities, n_rows = df[self.entity_column].nunique(), len(df)
637
+ if n_entities < n_rows:
638
+ raise ValueError(
639
+ f"Target comparisons not allowed on event-level data. "
640
+ f"Found {n_rows:,} rows but only {n_entities:,} entities. "
641
+ f"Aggregate to entity level first using TimeWindowAggregator."
642
+ )
643
+
644
+ def _calculate_iv(self, feature: pd.Series, target: pd.Series, bins: int = 10) -> float:
645
+ df_iv = pd.DataFrame({"feature": feature, "target": target}).dropna()
646
+ if len(df_iv) < bins * 2:
647
+ return 0.0
648
+ try:
649
+ df_iv["bin"] = pd.qcut(df_iv["feature"], q=bins, duplicates="drop")
650
+ except ValueError:
651
+ return 0.0
652
+
653
+ grouped = df_iv.groupby("bin", observed=False)["target"].agg(["sum", "count"])
654
+ grouped["non_events"] = grouped["count"] - grouped["sum"]
655
+ grouped["events"] = grouped["sum"]
656
+ total_events, total_non_events = grouped["events"].sum(), grouped["non_events"].sum()
657
+ if total_events == 0 or total_non_events == 0:
658
+ return 0.0
659
+
660
+ grouped["pct_events"] = grouped["events"] / total_events
661
+ grouped["pct_non_events"] = grouped["non_events"] / total_non_events
662
+ grouped["pct_events"] = grouped["pct_events"].replace(0, 0.0001)
663
+ grouped["pct_non_events"] = grouped["pct_non_events"].replace(0, 0.0001)
664
+ grouped["woe"] = np.log(grouped["pct_events"] / grouped["pct_non_events"])
665
+ grouped["iv"] = (grouped["pct_events"] - grouped["pct_non_events"]) * grouped["woe"]
666
+ return float(grouped["iv"].sum())
667
+
668
+ def _calculate_ks(self, feature: pd.Series, target: pd.Series) -> Tuple[float, float]:
669
+ df_ks = pd.DataFrame({"feature": feature, "target": target}).dropna()
670
+ group0, group1 = df_ks[df_ks["target"] == 0]["feature"], df_ks[df_ks["target"] == 1]["feature"]
671
+ if len(group0) == 0 or len(group1) == 0:
672
+ return 0.0, 1.0
673
+ ks_stat, p_val = stats.ks_2samp(group0, group1)
674
+ return float(ks_stat), float(p_val)
675
+
676
+ def _interpret_iv(self, iv: float) -> str:
677
+ if iv > self.IV_THRESHOLDS["suspicious"]:
678
+ return "suspicious"
679
+ if iv > self.IV_THRESHOLDS["strong"]:
680
+ return "strong"
681
+ if iv > self.IV_THRESHOLDS["medium"]:
682
+ return "medium"
683
+ if iv > self.IV_THRESHOLDS["weak"]:
684
+ return "weak"
685
+ return "very_weak"
686
+
687
+ def _interpret_ks(self, ks: float) -> str:
688
+ if ks > self.KS_THRESHOLDS["medium"]:
689
+ return "strong"
690
+ if ks > self.KS_THRESHOLDS["weak"]:
691
+ return "medium"
692
+ return "weak"