churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,636 @@
1
+ from dataclasses import dataclass, field
2
+ from enum import Enum
3
+ from typing import Dict, List, Optional, Tuple
4
+
5
+ import numpy as np
6
+ from scipy import stats
7
+
8
+ from customer_retention.core.compat import DataFrame, pd
9
+ from customer_retention.core.utils import compute_effect_size
10
+
11
+
12
class TrendDirection(str, Enum):
    """Direction of a detected temporal trend.

    Inherits from str so values serialize naturally (e.g. in reports
    and f-strings such as ``trend.direction.value``).
    """
    INCREASING = "increasing"
    DECREASING = "decreasing"
    STABLE = "stable"
    UNKNOWN = "unknown"
17
+
18
+
19
@dataclass
class TrendResult:
    """Outcome of a linear-trend test over a time series."""

    direction: TrendDirection          # increasing / decreasing / stable / unknown
    strength: float                    # trend fit strength; reported downstream as R²
    slope: Optional[float] = None      # fitted slope, when a line was fit
    p_value: Optional[float] = None    # significance of the slope estimate, if tested
    confidence: str = "low"            # qualitative confidence label

    @property
    def is_significant(self) -> bool:
        """True when a p-value exists and clears the 0.05 threshold."""
        if self.p_value is None:
            return False
        return self.p_value < 0.05

    @property
    def has_direction(self) -> bool:
        """True when the trend is directional (increasing or decreasing)."""
        return self.direction in (TrendDirection.INCREASING, TrendDirection.DECREASING)
34
+
35
+
36
@dataclass
class TrendRecommendation:
    """A suggested modeling action derived from trend analysis
    (produced by generate_trend_recommendations)."""
    action: str    # machine-readable action id, e.g. "add_trend_features"
    priority: str  # "high" / "medium" / "low"
    reason: str    # human-readable justification
    features: List[str] = field(default_factory=list)  # feature names the action would add
42
+
43
+
44
@dataclass
class SeasonalityPeriod:
    """A detected seasonal cycle in the series."""
    period: int     # cycle length; unit not fixed here — presumably observations/days, confirm at caller
    strength: float # magnitude of the seasonal signal
    period_name: Optional[str] = None  # optional human-readable label for the cycle
49
+
50
+
51
@dataclass
class RecencyResult:
    """Summary statistics of per-entity recency (days since last event)."""
    avg_recency_days: float
    median_recency_days: float
    min_recency_days: float
    max_recency_days: float
    target_correlation: Optional[float] = None   # correlation of recency with the target, if computed
    recency_distribution: Optional[dict] = None  # optional distribution summary of recency values
59
+
60
+
61
@dataclass
class GroupStats:
    """Descriptive statistics for one group of numeric values
    (built by compute_group_stats)."""
    mean: float
    median: float
    std: float   # population standard deviation (np.std default, ddof=0)
    q25: float   # 25th percentile
    q75: float   # 75th percentile
    count: int   # number of observations in the group
69
+
70
+
71
@dataclass
class RecencyBucketStats:
    """Target statistics for one recency bucket (see compute_recency_buckets)."""
    bucket_label: str              # e.g. "0-7d"; format produced by _generate_bucket_labels
    bucket_range: Tuple[int, int]  # day range; an open-ended upper bound is encoded as 9999
    entity_count: int              # entities whose last event falls in this bucket
    target_rate: float             # mean target value within the bucket
77
+
78
+
79
@dataclass
class RecencyInsight:
    """A single notable finding from recency analysis."""
    finding: str         # human-readable description of the finding
    metric_value: float  # value of the supporting metric
    metric_name: str     # name of the supporting metric
84
+
85
+
86
@dataclass
class AnomalyDiagnostics:
    """Diagnostics for an unexpected recency/target pattern.

    NOTE(review): in _diagnose_anomaly_pattern the "retained" group is
    target == 1 and "churned" is target == 0 — confirm this matches the
    project's target encoding.
    """
    target_1_is_minority: bool  # True when target==1 covers under 50% of entities
    target_1_pct: float         # share of entities with target==1, in percent
    retained_median_tenure: Optional[float] = None  # median tenure (days) of the target==1 group
    churned_median_tenure: Optional[float] = None   # median tenure (days) of the target==0 group
    tenure_explains_pattern: bool = False           # set when tenure differences account for the anomaly
93
+
94
+
95
@dataclass
class RecencyComparisonResult:
    """Comparison of recency distributions between retained and churned groups."""
    retained_stats: GroupStats
    churned_stats: GroupStats
    cohens_d: float             # effect size of the recency difference between groups
    effect_interpretation: str  # textual interpretation of cohens_d
    churned_higher: bool        # True when the churned group shows higher recency
    recommendations: List[Dict]
    bucket_stats: List[RecencyBucketStats] = field(default_factory=list)
    key_findings: List[RecencyInsight] = field(default_factory=list)
    inflection_bucket: Optional[str] = None  # label of the sharpest target-rate drop (detect_inflection_bucket)
    distribution_pattern: str = "unknown"    # one of the labels from classify_distribution_pattern
    anomaly_diagnostics: Optional[AnomalyDiagnostics] = None
108
+
109
+
110
@dataclass
class CohortDistribution:
    """How entities' first-event (onboarding) years are distributed
    (built by analyze_cohort_distribution)."""
    year_counts: Dict[int, int]  # entity count per onboarding year
    total_entities: int
    dominant_year: int           # year with the most onboardings
    dominant_pct: float          # share of entities in the dominant year, in percent
    num_years: int               # number of distinct onboarding years
117
+
118
+
119
@dataclass
class CohortRecommendation:
    """A suggested modeling action derived from cohort analysis
    (produced by generate_cohort_recommendations)."""
    action: str    # machine-readable action id, e.g. "add_cohort_features"
    priority: str  # "high" / "medium" / "low"
    reason: str    # human-readable justification
    features: List[str] = field(default_factory=list)  # feature names the action would add
    insight: Optional[str] = None  # optional extra interpretation of the pattern
126
+
127
+
128
@dataclass
class TemporalPatternAnalysis:
    """Aggregate container for the temporal pattern analyses in this module."""
    trend: Optional[TrendResult] = None
    seasonality: List[SeasonalityPeriod] = field(default_factory=list)
    cohort_analysis: Optional[DataFrame] = None  # per-cohort summary table, if computed
    recency_analysis: Optional[RecencyResult] = None
134
+
135
+
136
def compute_group_stats(values: np.ndarray) -> GroupStats:
    """Summarize a numeric sample into a GroupStats record.

    Computes mean, median, population standard deviation (np.std default),
    the 25th/75th percentiles, and the sample size.
    """
    q25, q75 = np.percentile(values, [25, 75])
    return GroupStats(
        mean=float(np.mean(values)),
        median=float(np.median(values)),
        std=float(np.std(values)),
        q25=float(q25),
        q75=float(q75),
        count=len(values),
    )
145
+
146
+
147
def generate_trend_recommendations(trend: TrendResult, mean_value: float = 1.0) -> List[TrendRecommendation]:
    """Translate a trend test result into actionable feature recommendations.

    Strong, significant trends (R² > 0.3) yield trend features, a detrending
    suggestion, and a time-based split warning; moderate trends (R² > 0.1)
    yield a single medium-priority suggestion; stable series yield an
    explicit "skip" recommendation.
    """
    recs: List[TrendRecommendation] = []

    # Express the fitted slope as a percentage of the series mean per day.
    if trend.slope and mean_value:
        daily_pct = trend.slope / mean_value * 100
    else:
        daily_pct = 0

    directional = trend.has_direction and trend.is_significant
    if directional and trend.strength > 0.3:
        recs.extend([
            TrendRecommendation(
                action="add_trend_features", priority="high",
                features=["recent_vs_overall_ratio", "entity_trend_slope"],
                reason=f"Strong {trend.direction.value} trend (R²={trend.strength:.2f}, {daily_pct:+.2f}%/day)",
            ),
            TrendRecommendation(
                action="consider_detrending", priority="medium", features=[],
                reason="Strong trend may dominate signal - consider detrending aggregated features",
            ),
            TrendRecommendation(
                action="time_based_split", priority="high", features=[],
                reason="Strong trend detected - use time-based train/test split to avoid leakage",
            ),
        ])
    elif directional and trend.strength > 0.1:
        recs.append(TrendRecommendation(
            action="add_trend_features", priority="medium",
            features=["recent_vs_overall_ratio"],
            reason=f"Moderate {trend.direction.value} trend (R²={trend.strength:.2f})",
        ))
    elif trend.direction == TrendDirection.STABLE:
        recs.append(TrendRecommendation(
            action="skip_trend_features", priority="low", features=[],
            reason=f"No significant trend (R²={trend.strength:.2f}) - trend features unlikely to help",
        ))
    return recs
177
+
178
+
179
def analyze_cohort_distribution(first_events: DataFrame, time_column: str) -> CohortDistribution:
    """Profile when entities first appeared, by calendar year.

    `first_events` is expected to hold one row per entity with its first
    event timestamp in `time_column` — presumably pre-aggregated by the
    caller; confirm upstream.
    """
    onboarding_years = first_events[time_column].dt.year
    counts_by_year = onboarding_years.value_counts().sort_index().to_dict()
    n_entities = len(first_events)

    # Most common onboarding year; 0 is the sentinel for an empty frame.
    if len(onboarding_years) > 0:
        top_year = onboarding_years.mode().iloc[0]
    else:
        top_year = 0
    top_share = (counts_by_year.get(top_year, 0) / n_entities * 100) if n_entities > 0 else 0

    return CohortDistribution(
        year_counts=counts_by_year,
        total_entities=n_entities,
        dominant_year=int(top_year),
        dominant_pct=top_share,
        num_years=len(counts_by_year),
    )
189
+
190
+
191
def generate_cohort_recommendations(
    dist: CohortDistribution, retention_variation: Optional[float] = None
) -> List[CohortRecommendation]:
    """Translate a cohort distribution into feature recommendations.

    Exactly one primary recommendation is chosen (skip / add / consider
    cohort features, depending on how skewed the onboarding years are);
    a follow-up is appended when retention varies materially across cohorts.
    """
    # More than this share of entities in a single year means too little variation.
    skew_cutoff = 80

    if dist.dominant_pct > skew_cutoff:
        primary = CohortRecommendation(
            action="skip_cohort_features", priority="low",
            reason=f"{dist.dominant_pct:.0f}% onboarded in {dist.dominant_year} - insufficient variation",
            insight="Established customer base, not a growing acquisition funnel",
        )
    elif dist.num_years >= 3 and dist.dominant_pct < 60:
        primary = CohortRecommendation(
            action="add_cohort_features", priority="medium",
            features=["cohort_year", "cohort_quarter"],
            reason=f"Good variation across {dist.num_years} years - cohort features may be valuable",
        )
    else:
        primary = CohortRecommendation(
            action="consider_cohort_features", priority="low",
            features=["cohort_year"],
            reason="Moderate variation - test if cohort features improve model",
        )

    recs = [primary]
    if retention_variation is not None and retention_variation > 0.1:
        recs.append(CohortRecommendation(
            action="investigate_cohort_retention", priority="medium",
            reason=f"Retention varies {retention_variation*100:.0f}pp across cohorts - investigate drivers",
        ))
    return recs
222
+
223
+
224
# Recency bucketing defaults: edges are days since the last event; the
# final open-ended edge captures anything older than 180 days.
DEFAULT_BUCKET_EDGES = [0, 7, 30, 90, 180, float("inf")]
# Labels matching DEFAULT_BUCKET_EDGES (same format as _generate_bucket_labels).
BUCKET_LABELS = ["0-7d", "8-30d", "31-90d", "91-180d", ">180d"]
# Minimum drop in target rate between adjacent buckets to call an inflection.
INFLECTION_MIN_DROP = 0.10
# Adjacent-bucket rate increases smaller than this are treated as noise.
MONOTONIC_TOLERANCE = 0.05
228
+
229
+
230
def compute_recency_buckets(
    df: DataFrame, entity_column: str, time_column: str, target_column: str,
    reference_date: pd.Timestamp, bucket_edges: Optional[List[float]] = None
) -> List[RecencyBucketStats]:
    """Bucket entities by days since their last event and compute the
    target rate within each bucket.

    The target is taken as each entity's first observed value of
    `target_column`; empty buckets are omitted from the result.
    """
    edges = bucket_edges or DEFAULT_BUCKET_EDGES
    labels = _generate_bucket_labels(edges)

    # One row per entity: last event timestamp, recency in days, and target.
    per_entity = df.groupby(entity_column)[time_column].max().reset_index()
    per_entity["recency_days"] = (reference_date - per_entity[time_column]).dt.days
    targets = df.groupby(entity_column)[target_column].first().reset_index()
    per_entity = per_entity.merge(targets, on=entity_column)
    per_entity["bucket"] = pd.cut(
        per_entity["recency_days"], bins=edges, labels=labels, include_lowest=True
    )

    results: List[RecencyBucketStats] = []
    for idx, label in enumerate(labels):
        members = per_entity[per_entity["bucket"] == label]
        if len(members) == 0:
            continue
        upper = edges[idx + 1]
        results.append(RecencyBucketStats(
            bucket_label=label,
            # The open-ended top bucket encodes its upper bound as 9999.
            bucket_range=(int(edges[idx]), 9999 if upper == float("inf") else int(upper)),
            entity_count=len(members),
            target_rate=float(members[target_column].mean()),
        ))
    return results
253
+
254
+
255
def _generate_bucket_labels(edges: List[float]) -> List[str]:
    """Render human-readable day-range labels for consecutive bucket edges.

    The first bucket is "0-Xd", interior buckets are "(lo+1)-hid" so ranges
    read as non-overlapping, and an infinite upper edge becomes ">lod".
    """
    labels: List[str] = []
    for lo, hi in zip(edges, edges[1:]):
        lo = int(lo)
        if hi == float("inf"):
            label = f">{lo}d"
        elif lo == 0:
            label = f"0-{int(hi)}d"
        else:
            label = f"{lo + 1}-{int(hi)}d"
        labels.append(label)
    return labels
266
+
267
+
268
def detect_inflection_bucket(buckets: List[RecencyBucketStats]) -> Optional[str]:
    """Return the label of the bucket whose target rate falls most sharply
    below its predecessor, or None when no drop reaches INFLECTION_MIN_DROP.
    """
    if len(buckets) < 2:
        return None
    best_drop = 0.0
    best_label: Optional[str] = None
    for prev, curr in zip(buckets, buckets[1:]):
        drop = prev.target_rate - curr.target_rate
        if drop > best_drop:
            best_drop = drop
            best_label = curr.bucket_label
    if best_drop < INFLECTION_MIN_DROP:
        return None
    return best_label
277
+
278
+
279
def classify_distribution_pattern(buckets: List[RecencyBucketStats]) -> str:
    """Classify how the target rate evolves across recency buckets.

    Returns one of: "insufficient_data", "flat_no_pattern", "threshold_step",
    "monotonic_decline", or "variable".
    """
    if len(buckets) < 2:
        return "insufficient_data"
    rates = [bucket.target_rate for bucket in buckets]
    overall_decline = rates[0] - rates[-1]
    if abs(overall_decline) < MONOTONIC_TOLERANCE:
        return "flat_no_pattern"
    step_drops = [earlier - later for earlier, later in zip(rates, rates[1:])]
    largest_step = max(step_drops) if step_drops else 0
    mean_step = overall_decline / (len(rates) - 1) if len(rates) > 1 else 0
    # One dominant drop (>= 2x the average and above the inflection floor)
    # reads as a step/threshold shape rather than a gradual slope.
    if largest_step > mean_step * 2 and largest_step >= INFLECTION_MIN_DROP:
        return "threshold_step"
    if all(drop >= -MONOTONIC_TOLERANCE for drop in step_drops):
        return "monotonic_decline"
    return "variable"
294
+
295
+
296
def _diagnose_anomaly_pattern(
    df: DataFrame, entity_column: str, time_column: str, target_column: str
) -> AnomalyDiagnostics:
    """Diagnose why churned entities look MORE recently active than retained.

    Checks the two common explanations:
    1. Label semantics: if target=1 is the minority class it probably encodes
       churn rather than retention, inverting the expected pattern.
    2. Tenure: retained entities observed for much longer naturally have
       later "last event" dates.

    Returns an AnomalyDiagnostics summarizing both checks.
    """
    entity_target = df.groupby(entity_column)[target_column].first()
    target_1_pct = float(entity_target.mean() * 100)
    target_1_is_minority = target_1_pct < 50
    # Tenure = span in days between an entity's first and last observed event.
    entity_first = df.groupby(entity_column)[time_column].min()
    entity_last = df.groupby(entity_column)[time_column].max()
    tenure = (entity_last - entity_first).dt.days
    tenure_by_target = pd.DataFrame({"target": entity_target, "tenure": tenure})
    retained_tenure = tenure_by_target[tenure_by_target["target"] == 1]["tenure"]
    churned_tenure = tenure_by_target[tenure_by_target["target"] == 0]["tenure"]
    retained_median_tenure = float(retained_tenure.median()) if len(retained_tenure) > 0 else None
    churned_median_tenure = float(churned_tenure.median()) if len(churned_tenure) > 0 else None
    # BUG FIX: compare against None explicitly. A legitimate median tenure of
    # 0.0 days (e.g. every churned entity has a single event) is falsy and
    # previously skipped this check, silently reporting tenure_explains=False.
    tenure_explains = False
    if retained_median_tenure is not None and churned_median_tenure is not None:
        tenure_explains = retained_median_tenure > churned_median_tenure * 1.5
    return AnomalyDiagnostics(
        target_1_is_minority=target_1_is_minority,
        target_1_pct=target_1_pct,
        retained_median_tenure=retained_median_tenure,
        churned_median_tenure=churned_median_tenure,
        tenure_explains_pattern=tenure_explains
    )
320
+
321
+
322
def generate_recency_insights(result: "RecencyComparisonResult") -> List[RecencyInsight]:
    """Translate a recency comparison result into ordered, readable insights."""
    median_gap = result.churned_stats.median - result.retained_stats.median
    gap_direction = "longer" if median_gap > 0 else "shorter"
    # Headline insight: the median recency gap between the two classes.
    insights = [RecencyInsight(
        finding=f"Churned entities last active {abs(median_gap):.0f} days {gap_direction} than retained (median: {result.churned_stats.median:.0f}d vs {result.retained_stats.median:.0f}d)",
        metric_value=median_gap,
        metric_name="median_gap_days"
    )]
    diag = result.anomaly_diagnostics
    if diag and not result.churned_higher:
        # Inverted pattern: explain it using the anomaly diagnostics.
        parts = ["⚠️ Unusual pattern: churned have MORE recent activity."]
        if diag.target_1_is_minority:
            parts.append(f"Target=1 is minority ({diag.target_1_pct:.0f}%) - likely means CHURN not retention.")
        else:
            parts.append(f"Target=1 is majority ({diag.target_1_pct:.0f}%) - confirms retention label.")
        if diag.tenure_explains_pattern:
            parts.append(f"Tenure gap explains pattern: retained={diag.retained_median_tenure:.0f}d vs churned={diag.churned_median_tenure:.0f}d median tenure.")
        insights.append(RecencyInsight(finding=" ".join(parts), metric_value=0.0, metric_name="pattern_anomaly"))
    insights.append(RecencyInsight(
        finding=_effect_size_description(result.cohens_d, result.effect_interpretation),
        metric_value=abs(result.cohens_d),
        metric_name="effect_size"
    ))
    if result.inflection_bucket and result.churned_higher:
        insights.append(RecencyInsight(
            finding=f"Sharpest target rate drop occurs at {result.inflection_bucket} boundary",
            metric_value=0.0, metric_name="inflection_point"
        ))
    return insights
349
+
350
+
351
+ def _effect_size_description(cohens_d: float, interpretation: str) -> str:
352
+ abs_d = abs(cohens_d)
353
+ if abs_d >= 0.8:
354
+ return f"Recency strongly discriminates target ({interpretation}, d={cohens_d:+.2f}) - high predictive value"
355
+ if abs_d >= 0.5:
356
+ return f"Recency moderately discriminates target ({interpretation}, d={cohens_d:+.2f}) - useful predictor"
357
+ if abs_d >= 0.2:
358
+ return f"Recency weakly discriminates target ({interpretation}, d={cohens_d:+.2f}) - may help in combination"
359
+ return f"Recency has minimal discriminative power ({interpretation}, d={cohens_d:+.2f})"
360
+
361
+
362
+ def _generate_enhanced_recommendations(
363
+ churned_higher: bool, cohens_d: float, inflection_bucket: Optional[str],
364
+ distribution_pattern: str, bucket_stats: List[RecencyBucketStats],
365
+ anomaly_diagnostics: Optional[AnomalyDiagnostics] = None
366
+ ) -> List[Dict]:
367
+ recommendations = []
368
+ if not churned_higher:
369
+ diag = anomaly_diagnostics
370
+ if diag and diag.target_1_is_minority:
371
+ recommendations.append({
372
+ "action": "invert_target_interpretation", "priority": "high",
373
+ "reason": f"Target=1 is minority ({diag.target_1_pct:.0f}%) - interpret as CHURN; recency pattern is classic churn behavior",
374
+ "features": ["days_since_last_event", "log_recency"]
375
+ })
376
+ elif diag and diag.tenure_explains_pattern:
377
+ recommendations.append({
378
+ "action": "use_tenure_adjusted_recency", "priority": "high",
379
+ "reason": f"Retained have {diag.retained_median_tenure:.0f}d vs churned {diag.churned_median_tenure:.0f}d median tenure - use recency relative to tenure",
380
+ "features": ["recency_vs_tenure_ratio", "normalized_recency"]
381
+ })
382
+ else:
383
+ recommendations.append({
384
+ "action": "investigate_further", "priority": "high",
385
+ "reason": "Pattern unexpected and not explained by target definition or tenure - review data collection",
386
+ "features": []
387
+ })
388
+ if diag and not diag.target_1_is_minority and not diag.tenure_explains_pattern:
389
+ recommendations.append({
390
+ "action": "check_pre_churn_activity", "priority": "medium",
391
+ "reason": "Churned may show activity spike before leaving (support tickets, complaints)",
392
+ "features": ["activity_trend_last_30d", "support_interaction_count"]
393
+ })
394
+ return recommendations[:3]
395
+ abs_d = abs(cohens_d)
396
+ if abs_d >= 0.5:
397
+ recommendations.append({
398
+ "action": "add_recency_features", "priority": "high",
399
+ "reason": f"Strong effect size (d={cohens_d:+.2f}) - recency is a key predictor",
400
+ "features": ["days_since_last_event", "log_recency"]
401
+ })
402
+ if inflection_bucket and distribution_pattern == "threshold_step":
403
+ threshold_days = _extract_threshold_from_bucket(inflection_bucket)
404
+ recommendations.append({
405
+ "action": "create_activity_threshold_flag", "priority": "high",
406
+ "reason": f"Clear threshold at {inflection_bucket}: create binary is_active_{threshold_days}d flag",
407
+ "features": [f"is_active_{threshold_days}d"]
408
+ })
409
+ elif distribution_pattern == "monotonic_decline":
410
+ recommendations.append({
411
+ "action": "use_continuous_recency", "priority": "medium",
412
+ "reason": "Monotonic decline pattern: continuous recency features outperform binary flags",
413
+ "features": ["days_since_last_event", "log_recency", "recency_percentile"]
414
+ })
415
+ if len(recommendations) < 2 and bucket_stats:
416
+ recommendations.append({
417
+ "action": "add_recency_buckets", "priority": "medium",
418
+ "reason": "Create recency bucket features for interpretable segments",
419
+ "features": ["recency_bucket"]
420
+ })
421
+ return recommendations[:3]
422
+
423
+
424
+ def _extract_threshold_from_bucket(bucket_label: str) -> int:
425
+ import re
426
+ match = re.search(r"(\d+)", bucket_label)
427
+ return int(match.group(1)) if match else 30
428
+
429
+
430
def compare_recency_by_target(
    df: DataFrame, entity_column: str, time_column: str, target_column: str,
    reference_date: Optional[pd.Timestamp] = None, cap_percentile: float = 0.99
) -> Optional[RecencyComparisonResult]:
    """Compare per-entity recency (days since last event) between target classes.

    Args:
        df: Event-level frame, one row per event.
        entity_column: Column identifying the entity (e.g. a user id).
        time_column: Datetime column of event timestamps.
        target_column: Binary target; 1 is treated as retained, 0 as churned.
        reference_date: Anchor for recency; defaults to the max timestamp in df.
        cap_percentile: Recency values above this quantile are dropped before
            computing the effect size (outlier control).

    Returns:
        A RecencyComparisonResult with group stats, effect size, bucket
        breakdown, recommendations and key findings — or None when the target
        column is missing or either class has fewer than 2 entities.
    """
    if target_column not in df.columns:
        return None
    ref_date = reference_date or df[time_column].max()
    # Collapse events to one row per entity: last-seen date and target label.
    entity_last = df.groupby(entity_column)[time_column].max().reset_index()
    entity_last["recency_days"] = (ref_date - entity_last[time_column]).dt.days
    entity_target = df.groupby(entity_column)[target_column].first().reset_index()
    entity_recency = entity_last.merge(entity_target, on=entity_column)
    # Cap extreme recency so a few long-dormant entities don't dominate d.
    cap = entity_recency["recency_days"].quantile(cap_percentile)
    entity_capped = entity_recency[entity_recency["recency_days"] <= cap]
    retained = entity_capped[entity_capped[target_column] == 1]["recency_days"].values
    churned = entity_capped[entity_capped[target_column] == 0]["recency_days"].values
    if len(retained) < 2 or len(churned) < 2:
        return None
    cohens_d, effect_interp = compute_effect_size(retained, churned)
    # Expected churn signature: churned entities have LARGER recency (less recent).
    churned_higher = bool(np.median(churned) > np.median(retained))
    # Bucket stats use the uncapped frame so every entity lands in a bucket.
    bucket_stats = compute_recency_buckets(df, entity_column, time_column, target_column, ref_date)
    inflection_bucket = detect_inflection_bucket(bucket_stats)
    distribution_pattern = classify_distribution_pattern(bucket_stats)
    # Diagnostics are only computed when the pattern is inverted.
    anomaly_diag = _diagnose_anomaly_pattern(df, entity_column, time_column, target_column) if not churned_higher else None
    recommendations = _generate_enhanced_recommendations(
        churned_higher, cohens_d, inflection_bucket, distribution_pattern, bucket_stats, anomaly_diag
    )
    result = RecencyComparisonResult(
        retained_stats=compute_group_stats(retained),
        churned_stats=compute_group_stats(churned),
        cohens_d=cohens_d, effect_interpretation=effect_interp,
        churned_higher=churned_higher, recommendations=recommendations,
        bucket_stats=bucket_stats, inflection_bucket=inflection_bucket,
        distribution_pattern=distribution_pattern, anomaly_diagnostics=anomaly_diag
    )
    # Insights are derived from the assembled result so they stay consistent.
    result.key_findings = generate_recency_insights(result)
    return result
466
+
467
+
468
class TemporalPatternAnalyzer:
    """Detects trends, seasonality, cohort structure and recency patterns
    in an event-level DataFrame keyed by a single datetime column."""

    # Minimum |slope / mean| per day for a trend to count as non-stable.
    TREND_THRESHOLD = 0.001
    # p-value and R^2 cutoffs for "high" trend confidence...
    CONFIDENCE_HIGH_P = 0.01
    CONFIDENCE_HIGH_R2 = 0.5
    # ...and for "medium" confidence; anything weaker is reported as "low".
    CONFIDENCE_MED_P = 0.05
    CONFIDENCE_MED_R2 = 0.3
474
+
475
    def __init__(self, time_column: str):
        """Store the name of the datetime column all analyses operate on."""
        self.time_column = time_column
477
+
478
+ def analyze(self, df: DataFrame, value_column: str, entity_column: Optional[str] = None, target_column: Optional[str] = None) -> TemporalPatternAnalysis:
479
+ if len(df) < 2:
480
+ return TemporalPatternAnalysis()
481
+
482
+ trend = self.detect_trend(df, value_column)
483
+ seasonality = self.detect_seasonality(df, value_column)
484
+
485
+ return TemporalPatternAnalysis(
486
+ trend=trend,
487
+ seasonality=seasonality,
488
+ )
489
+
490
    @staticmethod
    def _unknown_trend() -> TrendResult:
        # Fallback result when there is too little data to fit a trend.
        return TrendResult(direction=TrendDirection.UNKNOWN, strength=0.0, confidence="low")
493
+
494
+ def detect_trend(self, df: DataFrame, value_column: str) -> TrendResult:
495
+ if len(df) < 3:
496
+ return self._unknown_trend()
497
+
498
+ df_clean = df[[self.time_column, value_column]].dropna()
499
+ if len(df_clean) < 3:
500
+ return self._unknown_trend()
501
+
502
+ time_col = pd.to_datetime(df_clean[self.time_column])
503
+ x = (time_col - time_col.min()).dt.total_seconds() / 86400
504
+ y = df_clean[value_column].values
505
+
506
+ slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
507
+ r_squared = r_value ** 2
508
+
509
+ mean_y = np.mean(y)
510
+ normalized_slope = slope / mean_y if mean_y != 0 else 0
511
+
512
+ if abs(normalized_slope) < self.TREND_THRESHOLD:
513
+ direction = TrendDirection.STABLE
514
+ elif slope > 0:
515
+ direction = TrendDirection.INCREASING
516
+ else:
517
+ direction = TrendDirection.DECREASING
518
+
519
+ if p_value < self.CONFIDENCE_HIGH_P and r_squared > self.CONFIDENCE_HIGH_R2:
520
+ confidence = "high"
521
+ elif p_value < self.CONFIDENCE_MED_P and r_squared > self.CONFIDENCE_MED_R2:
522
+ confidence = "medium"
523
+ else:
524
+ confidence = "low"
525
+
526
+ return TrendResult(
527
+ direction=direction,
528
+ strength=r_squared,
529
+ slope=slope,
530
+ p_value=p_value,
531
+ confidence=confidence
532
+ )
533
+
534
+ def detect_seasonality(self, df: DataFrame, value_column: str, max_periods: int = 3, additional_lags: Optional[List[int]] = None) -> List[SeasonalityPeriod]:
535
+ if len(df) < 14:
536
+ return []
537
+
538
+ df_clean = df[[self.time_column, value_column]].dropna()
539
+ if len(df_clean) < 14:
540
+ return []
541
+
542
+ df_sorted = df_clean.sort_values(self.time_column)
543
+ values = df_sorted[value_column].values
544
+
545
+ results = []
546
+ period_names = {7: "weekly", 14: "bi-weekly", 21: "tri-weekly", 30: "monthly", 90: "quarterly", 180: "semi-annual", 365: "yearly"}
547
+
548
+ base_lags = [7, 14, 21, 30]
549
+ all_lags = list(set(base_lags + (additional_lags or [])))
550
+
551
+ for lag in all_lags:
552
+ if lag >= len(values) // 2:
553
+ continue
554
+
555
+ acf = self._autocorrelation(values, lag)
556
+
557
+ if acf > 0.3:
558
+ period_name = period_names.get(lag, f"{lag}-day")
559
+ results.append(SeasonalityPeriod(
560
+ period=lag,
561
+ strength=acf,
562
+ period_name=period_name
563
+ ))
564
+
565
+ results.sort(key=lambda x: x.strength, reverse=True)
566
+ return results[:max_periods]
567
+
568
+ def _autocorrelation(self, series: np.ndarray, lag: int) -> float:
569
+ n = len(series)
570
+ if lag >= n:
571
+ return 0.0
572
+
573
+ mean = np.mean(series)
574
+ var = np.var(series)
575
+
576
+ if var == 0:
577
+ return 0.0
578
+
579
+ cov = np.mean((series[:-lag] - mean) * (series[lag:] - mean))
580
+ return cov / var
581
+
582
+ def analyze_cohorts(self, df: DataFrame, entity_column: str, cohort_column: str, target_column: Optional[str] = None, period: str = "M") -> DataFrame:
583
+ if len(df) == 0:
584
+ return pd.DataFrame()
585
+
586
+ df_copy = df.copy()
587
+ entity_first_event = df_copy.groupby(entity_column)[cohort_column].min()
588
+ df_copy["_cohort"] = df_copy[entity_column].map(entity_first_event)
589
+ df_copy["_cohort"] = pd.to_datetime(df_copy["_cohort"]).dt.to_period(period)
590
+
591
+ entity_cohorts = df_copy.groupby(entity_column)["_cohort"].first().reset_index()
592
+ entity_cohorts.columns = [entity_column, "_cohort"]
593
+
594
+ cohort_stats = entity_cohorts.groupby("_cohort").agg({entity_column: "count"}).reset_index()
595
+ cohort_stats.columns = ["cohort", "entity_count"]
596
+
597
+ cohort_dates = df_copy.groupby("_cohort")[self.time_column].agg(["min", "max"]).reset_index()
598
+ cohort_dates.columns = ["cohort", "first_event", "last_event"]
599
+ cohort_stats = cohort_stats.merge(cohort_dates, on="cohort", how="left")
600
+
601
+ if target_column and target_column in df.columns:
602
+ entity_target = df_copy.groupby(entity_column)[target_column].max()
603
+ entity_cohorts["_target"] = entity_cohorts[entity_column].map(entity_target)
604
+ target_stats = entity_cohorts.groupby("_cohort")["_target"].mean().reset_index()
605
+ target_stats.columns = ["cohort", "retention_rate"]
606
+ cohort_stats = cohort_stats.merge(target_stats, on="cohort", how="left")
607
+
608
+ return cohort_stats.sort_values("cohort")
609
+
610
+ def analyze_recency(self, df: DataFrame, entity_column: str, target_column: Optional[str] = None, reference_date: Optional[pd.Timestamp] = None) -> RecencyResult:
611
+ if len(df) == 0:
612
+ return RecencyResult(avg_recency_days=0, median_recency_days=0, min_recency_days=0, max_recency_days=0)
613
+
614
+ ref_date = reference_date or pd.Timestamp.now()
615
+ pd.to_datetime(df[self.time_column])
616
+
617
+ entity_last = df.groupby(entity_column)[self.time_column].max()
618
+ entity_last = pd.to_datetime(entity_last)
619
+ recency_days = (ref_date - entity_last).dt.days
620
+
621
+ target_correlation = None
622
+ if target_column and target_column in df.columns:
623
+ entity_target = df.groupby(entity_column)[target_column].first()
624
+ combined = pd.DataFrame({"recency": recency_days, "target": entity_target}).dropna()
625
+
626
+ if len(combined) > 2:
627
+ corr, _ = stats.pearsonr(combined["recency"], combined["target"])
628
+ target_correlation = corr
629
+
630
+ return RecencyResult(
631
+ avg_recency_days=float(recency_days.mean()),
632
+ median_recency_days=float(recency_days.median()),
633
+ min_recency_days=float(recency_days.min()),
634
+ max_recency_days=float(recency_days.max()),
635
+ target_correlation=target_correlation,
636
+ )