churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
customer_retention/stages/profiling/type_detector.py
@@ -0,0 +1,382 @@
+from typing import Optional
+
+from customer_retention.core.compat import DataFrame, is_datetime64_any_dtype, is_numeric_dtype, is_string_dtype, pd
+from customer_retention.core.config.column_config import ColumnType, DatasetGranularity
+
+from .profile_result import GranularityResult, TypeConfidence, TypeInference
+
+
+class TypeDetector:
+    IDENTIFIER_PATTERNS = ["id", "key", "code", "uuid", "guid"]
+    TARGET_PATTERNS_PRIMARY = ["churned", "retained", "churn", "retention", "attrition"]
+    TARGET_PATTERNS_SECONDARY = [
+        "unsubscribe", "unsubscribed", "terminate", "terminated", "cancel", "cancelled",
+        "close", "closed", "discontinue", "discontinued", "exit", "exited", "leave", "left",
+    ]
+    TARGET_PATTERNS_GENERIC = ["target", "label", "outcome", "class", "flag"]
+    CYCLICAL_DAY_PATTERNS = ["mon", "tue", "wed", "thu", "fri", "sat", "sun", "monday", "tuesday", "wednesday"]
+    CYCLICAL_MONTH_PATTERNS = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]
+
+    def __init__(self):
+        self.evidence = []
+
+    def detect_type(self, series: pd.Series, column_name: str) -> TypeInference:
+        self.evidence = []
+
+        if self.is_identifier(series, column_name):
+            return TypeInference(
+                inferred_type=ColumnType.IDENTIFIER,
+                confidence=TypeConfidence.HIGH,
+                evidence=self.evidence.copy()
+            )
+
+        if self.is_target(series, column_name):
+            return TypeInference(
+                inferred_type=ColumnType.TARGET,
+                confidence=TypeConfidence.HIGH,
+                evidence=self.evidence.copy()
+            )
+
+        if self.is_binary(series):
+            return TypeInference(
+                inferred_type=ColumnType.BINARY,
+                confidence=TypeConfidence.HIGH,
+                evidence=self.evidence.copy()
+            )
+
+        if self.is_datetime(series):
+            return TypeInference(
+                inferred_type=ColumnType.DATETIME,
+                confidence=TypeConfidence.HIGH,
+                evidence=self.evidence.copy()
+            )
+
+        if is_numeric_dtype(series):
+            return self.detect_numeric_type(series)
+
+        if is_string_dtype(series) or series.dtype == object:
+            return self.detect_categorical_type(series)
+
+        return TypeInference(
+            inferred_type=ColumnType.UNKNOWN,
+            confidence=TypeConfidence.LOW,
+            evidence=["Could not determine type"]
+        )
+
+    def is_identifier(self, series: pd.Series, column_name: str) -> bool:
+        column_lower = column_name.lower()
+        if any(pattern in column_lower for pattern in self.IDENTIFIER_PATTERNS):
+            self.evidence.append("Column name contains identifier pattern")
+            return True
+
+        if len(series) == 0:
+            return False
+
+        if is_datetime64_any_dtype(series):
+            return False
+
+        if is_numeric_dtype(series):
+            return False
+
+        distinct_count = series.nunique()
+        distinct_ratio = distinct_count / len(series)
+
+        if distinct_ratio == 1.0 and distinct_count <= 100:
+            if series.dtype == object:
+                sample = series.dropna().head(100)
+                if len(sample) > 0:
+                    parseable_count = 0
+                    for value in sample:
+                        try:
+                            pd.to_datetime(value, format='mixed')
+                            parseable_count += 1
+                        except (ValueError, TypeError):
+                            pass
+
+                    if parseable_count / len(sample) > 0.8:
+                        return False
+
+            self.evidence.append("All values are unique (100%)")
+            return True
+
+        return False
+
+    def is_target(self, series: pd.Series, column_name: str) -> bool:
+        column_lower = column_name.lower()
+        distinct_count = series.nunique()
+        if distinct_count > 10:
+            return False
+
+        for pattern in self.TARGET_PATTERNS_PRIMARY:
+            if pattern in column_lower:
+                self.evidence.append(f"Column name contains primary target pattern '{pattern}' with {distinct_count} classes")
+                return True
+
+        for pattern in self.TARGET_PATTERNS_SECONDARY:
+            if pattern in column_lower:
+                self.evidence.append(f"Column name contains secondary target pattern '{pattern}' with {distinct_count} classes")
+                return True
+
+        for pattern in self.TARGET_PATTERNS_GENERIC:
+            if pattern in column_lower:
+                self.evidence.append(f"Column name contains generic target pattern '{pattern}' with {distinct_count} classes")
+                return True
+
+        return False
+
+    def is_binary(self, series: pd.Series) -> bool:
+        distinct_count = series.nunique()
+        if distinct_count != 2:
+            return False
+
+        unique_values = set(series.dropna().unique())
+
+        binary_sets = [
+            {0, 1}, {0.0, 1.0},
+            {True, False},
+            {"0", "1"},
+            {"yes", "no"}, {"Yes", "No"}, {"YES", "NO"},
+            {"true", "false"}, {"True", "False"}, {"TRUE", "FALSE"},
+            {"y", "n"}, {"Y", "N"}
+        ]
+
+        for binary_set in binary_sets:
+            if unique_values == binary_set or unique_values.issubset(binary_set):
+                self.evidence.append(f"Exactly 2 unique values: {unique_values}")
+                return True
+
+        self.evidence.append(f"Exactly 2 unique values (non-standard): {unique_values}")
+        return True
+
+    def is_datetime(self, series: pd.Series) -> bool:
+        if is_datetime64_any_dtype(series):
+            self.evidence.append("Column is datetime dtype")
+            return True
+
+        if series.dtype == object:
+            sample = series.dropna().head(100)
+            if len(sample) == 0:
+                return False
+
+            parseable_count = 0
+            for value in sample:
+                try:
+                    pd.to_datetime(value, format='mixed')
+                    parseable_count += 1
+                except (ValueError, TypeError):
+                    pass
+
+            if parseable_count / len(sample) > 0.8:
+                self.evidence.append(f"{parseable_count}/{len(sample)} values parseable as datetime")
+                return True
+
+        return False
+
+    def detect_numeric_type(self, series: pd.Series) -> TypeInference:
+        distinct_count = series.nunique()
+
+        if distinct_count <= 20:
+            self.evidence.append(f"Numeric with {distinct_count} unique values (≤20)")
+            return TypeInference(
+                inferred_type=ColumnType.NUMERIC_DISCRETE,
+                confidence=TypeConfidence.MEDIUM,
+                evidence=self.evidence.copy(),
+                alternatives=[ColumnType.NUMERIC_CONTINUOUS]
+            )
+
+        self.evidence.append(f"Numeric with {distinct_count} unique values (>20)")
+        return TypeInference(
+            inferred_type=ColumnType.NUMERIC_CONTINUOUS,
+            confidence=TypeConfidence.HIGH,
+            evidence=self.evidence.copy()
+        )
+
+    def detect_categorical_type(self, series: pd.Series) -> TypeInference:
+        if len(series) == 0 or series.dropna().empty:
+            return TypeInference(
+                inferred_type=ColumnType.UNKNOWN,
+                confidence=TypeConfidence.LOW,
+                evidence=["Empty or all-null series"]
+            )
+
+        distinct_count = series.nunique()
+
+        if self.is_cyclical_pattern(series):
+            return TypeInference(
+                inferred_type=ColumnType.CATEGORICAL_CYCLICAL,
+                confidence=TypeConfidence.MEDIUM,
+                evidence=self.evidence.copy()
+            )
+
+        if distinct_count <= 10:
+            self.evidence.append(f"String with {distinct_count} unique values (≤10)")
+            return TypeInference(
+                inferred_type=ColumnType.CATEGORICAL_NOMINAL,
+                confidence=TypeConfidence.HIGH,
+                evidence=self.evidence.copy()
+            )
+
+        if distinct_count <= 100:
+            self.evidence.append(f"String with {distinct_count} unique values (≤100)")
+            return TypeInference(
+                inferred_type=ColumnType.CATEGORICAL_NOMINAL,
+                confidence=TypeConfidence.MEDIUM,
+                evidence=self.evidence.copy()
+            )
+
+        self.evidence.append(f"String with {distinct_count} unique values (>100)")
+        return TypeInference(
+            inferred_type=ColumnType.TEXT,
+            confidence=TypeConfidence.MEDIUM,
+            evidence=self.evidence.copy(),
+            alternatives=[ColumnType.CATEGORICAL_NOMINAL]
+        )
+
+    def is_cyclical_pattern(self, series: pd.Series) -> bool:
+        sample_values = [str(v).lower() for v in series.dropna().unique()[:20]]
+
+        if len(sample_values) == 0:
+            return False
+
+        day_matches = sum(1 for v in sample_values if any(day in v for day in self.CYCLICAL_DAY_PATTERNS))
+        if day_matches >= min(3, len(sample_values)):
+            self.evidence.append("Contains day name patterns (cyclical)")
+            return True
+
+        month_matches = sum(1 for v in sample_values if any(month in v for month in self.CYCLICAL_MONTH_PATTERNS))
+        if month_matches >= min(3, len(sample_values)):
+            self.evidence.append("Contains month name patterns (cyclical)")
+            return True
+
+        return False
+
+    def detect_granularity(self, df: DataFrame) -> GranularityResult:
+        """Detect whether dataset is entity-level or event-level (time series)."""
+        evidence = []
+
+        if df is None or len(df) == 0 or len(df.columns) == 0:
+            return GranularityResult(
+                granularity=DatasetGranularity.UNKNOWN,
+                evidence=["Empty or invalid DataFrame"]
+            )
+
+        entity_column = self._detect_entity_column(df)
+        time_column = self._detect_time_column(df)
+
+        if entity_column is None:
+            evidence.append("No clear entity/ID column detected")
+            return GranularityResult(
+                granularity=DatasetGranularity.UNKNOWN,
+                evidence=evidence
+            )
+
+        unique_entities = df[entity_column].nunique()
+        total_rows = len(df)
+        avg_events = total_rows / unique_entities if unique_entities > 0 else 0
+
+        if unique_entities == total_rows:
+            evidence.append(f"Each {entity_column} appears exactly once")
+            return GranularityResult(
+                granularity=DatasetGranularity.ENTITY_LEVEL,
+                entity_column=entity_column,
+                time_column=time_column,
+                unique_entities=unique_entities,
+                total_rows=total_rows,
+                avg_events_per_entity=1.0,
+                evidence=evidence
+            )
+
+        if avg_events > 1.5 and time_column is not None:
+            evidence.append(f"Multiple rows per {entity_column} (avg {avg_events:.1f})")
+            evidence.append(f"Temporal column detected: {time_column}")
+            return GranularityResult(
+                granularity=DatasetGranularity.EVENT_LEVEL,
+                entity_column=entity_column,
+                time_column=time_column,
+                unique_entities=unique_entities,
+                total_rows=total_rows,
+                avg_events_per_entity=round(avg_events, 2),
+                evidence=evidence
+            )
+
+        if avg_events > 1.5:
+            evidence.append(f"Multiple rows per {entity_column} but no datetime column")
+            return GranularityResult(
+                granularity=DatasetGranularity.EVENT_LEVEL,
+                entity_column=entity_column,
+                time_column=None,
+                unique_entities=unique_entities,
+                total_rows=total_rows,
+                avg_events_per_entity=round(avg_events, 2),
+                evidence=evidence
+            )
+
+        evidence.append("Could not determine granularity with confidence")
+        return GranularityResult(
+            granularity=DatasetGranularity.UNKNOWN,
+            entity_column=entity_column,
+            time_column=time_column,
+            evidence=evidence
+        )
+
+    def _detect_entity_column(self, df: DataFrame) -> Optional[str]:
+        """Find the most likely entity/ID column."""
+        candidates = []
+
+        for col in df.columns:
+            col_lower = col.lower()
+
+            if any(pattern in col_lower for pattern in self.IDENTIFIER_PATTERNS):
+                unique_ratio = df[col].nunique() / len(df)
+                if 0.01 < unique_ratio < 1.0:
+                    candidates.append((col, unique_ratio, "name_match"))
+                elif unique_ratio == 1.0:
+                    candidates.append((col, unique_ratio, "unique_id"))
+
+        if not candidates:
+            for col in df.columns:
+                if df[col].dtype == object or str(df[col].dtype).startswith("str"):
+                    unique_ratio = df[col].nunique() / len(df)
+                    if 0.01 < unique_ratio < 0.5:
+                        candidates.append((col, unique_ratio, "string_repeating"))
+
+        if not candidates:
+            return None
+
+        for col, ratio, match_type in candidates:
+            if match_type == "name_match" and ratio < 1.0:
+                return col
+
+        for col, ratio, match_type in candidates:
+            if match_type == "unique_id":
+                return col
+
+        return candidates[0][0]
+
+    def _detect_time_column(self, df: DataFrame) -> Optional[str]:
+        """Find the most likely datetime/timestamp column."""
+        for col in df.columns:
+            if is_datetime64_any_dtype(df[col]):
+                return col
+
+        datetime_patterns = ["date", "time", "timestamp", "created", "updated", "sent", "event"]
+        for col in df.columns:
+            col_lower = col.lower()
+            if any(pattern in col_lower for pattern in datetime_patterns):
+                if df[col].dtype == object:
+                    sample = df[col].dropna().head(20)
+                    if len(sample) > 0:
+                        parseable = 0
+                        for val in sample:
+                            try:
+                                pd.to_datetime(val, format='mixed')
+                                parseable += 1
+                            except (ValueError, TypeError):
+                                pass
+                        if parseable / len(sample) > 0.8:
+                            return col
+
+        return None
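For orientation, here is a minimal, hypothetical driver for the detector above. The toy frame and its column names are invented; the import path follows the package layout in the file list, and the attributes read off the results (inferred_type, confidence, evidence, granularity, entity_column, time_column) are the ones defined in this diff.

import pandas as pd

from customer_retention.stages.profiling.type_detector import TypeDetector

# Toy event-level frame: a repeating customer id, an object-dtype timestamp,
# and a churn flag -- chosen to exercise different detection branches.
df = pd.DataFrame({
    "customer_id": ["a1", "a2", "a1", "a3", "a2"],
    "event_time": ["2024-01-02", "2024-01-05", "2024-02-01", "2024-02-03", "2024-03-01"],
    "churned": [0, 1, 0, 0, 1],
})

detector = TypeDetector()
for col in df.columns:
    inference = detector.detect_type(df[col], col)
    print(col, inference.inferred_type, inference.confidence, inference.evidence)

# Granularity detection inspects the whole frame rather than one column.
result = detector.detect_granularity(df)
print(result.granularity, result.entity_column, result.time_column, result.evidence)

With these values, customer_id should resolve as IDENTIFIER (name pattern), churned as TARGET (primary pattern, 2 classes), event_time as DATETIME (object dtype but parseable), and the frame as event-level: five rows over three entities with a detected temporal column.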
customer_retention/stages/profiling/window_recommendation.py
@@ -0,0 +1,288 @@
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple
+
+import pandas as pd
+
+from .temporal_pattern_analyzer import SeasonalityPeriod
+from .time_series_profiler import ActivitySegmentResult, LifecycleQuadrantResult
+
+WINDOW_DAYS_MAP: Dict[str, Optional[float]] = {
+    "24h": 1.0, "7d": 7.0, "14d": 14.0, "30d": 30.0,
+    "90d": 90.0, "180d": 180.0, "365d": 365.0, "all_time": None,
+}
+
+SEASONALITY_WINDOW_MAP: Dict[int, str] = {
+    1: "24h", 7: "7d", 14: "14d", 30: "30d", 90: "90d", 365: "365d",
+}
+
+TIMING_TOLERANCE = 0.5
+
+
+@dataclass
+class TemporalHeterogeneityResult:
+    eta_squared_intensity: float
+    eta_squared_event_count: float
+    heterogeneity_level: str
+    segmentation_advisory: str
+    advisory_rationale: List[str]
+    coverage_table: pd.DataFrame
+
+
+@dataclass
+class WindowUnionResult:
+    windows: List[str]
+    explanation: pd.DataFrame
+    heterogeneity: TemporalHeterogeneityResult
+    coverage_threshold: float
+    feature_count_estimate: int
+
+
+class WindowRecommendationCollector:
+    ALL_CANDIDATE_WINDOWS = ["24h", "7d", "14d", "30d", "90d", "180d", "365d", "all_time"]
+
+    def __init__(self, coverage_threshold: float = 0.10, always_include: Optional[List[str]] = None):
+        self._coverage_threshold = coverage_threshold
+        self._always_include = always_include if always_include is not None else ["all_time"]
+        self._segment_lifecycles: Optional[pd.DataFrame] = None
+        self._quadrant_lifecycles: Optional[pd.DataFrame] = None
+        self._seasonality_periods: List[SeasonalityPeriod] = []
+        self._inter_event_median: Optional[float] = None
+        self._inter_event_mean: Optional[float] = None
+
+    def add_segment_context(self, result: ActivitySegmentResult) -> None:
+        self._segment_lifecycles = result.lifecycles
+
+    def add_quadrant_context(self, result: LifecycleQuadrantResult) -> None:
+        self._quadrant_lifecycles = result.lifecycles
+
+    def add_seasonality_context(self, periods: List[SeasonalityPeriod]) -> None:
+        self._seasonality_periods = periods
+
+    def add_inter_event_context(self, median_days: float, mean_days: float) -> None:
+        self._inter_event_median = median_days
+        self._inter_event_mean = mean_days
+
+    def compute_union(
+        self, lifecycles: pd.DataFrame, time_span_days: int,
+        min_coverage_ratio: float = 2.0,
+        value_columns: int = 0, agg_funcs: int = 4,
+    ) -> WindowUnionResult:
+        rows = self._compute_coverage_rows(lifecycles, time_span_days, min_coverage_ratio)
+        self._annotate_context(rows, lifecycles)
+        selected = [r["window"] for r in rows if r["included"]]
+        explanation = pd.DataFrame(rows)
+        heterogeneity = self._compute_heterogeneity(lifecycles, selected)
+        feature_count = value_columns * agg_funcs * len(selected) + len(selected) if value_columns > 0 else len(selected)
+        return WindowUnionResult(
+            windows=selected, explanation=explanation,
+            heterogeneity=heterogeneity, coverage_threshold=self._coverage_threshold,
+            feature_count_estimate=feature_count,
+        )
+
+    def _compute_coverage_rows(
+        self, lifecycles: pd.DataFrame, time_span_days: int, min_coverage_ratio: float,
+    ) -> List[Dict]:
+        duration = lifecycles["duration_days"].astype(float)
+        event_count = lifecycles["event_count"].astype(float)
+        n = len(lifecycles)
+        rows = []
+        for window in self.ALL_CANDIDATE_WINDOWS:
+            window_days = WINDOW_DAYS_MAP[window]
+            if window_days is None:
+                rows.append(self._all_time_row(n))
+                continue
+            has_span = duration >= window_days
+            expected_events = event_count * (window_days / duration.clip(lower=1))
+            has_density = expected_events >= 2
+            beneficial = has_span & has_density
+            coverage_pct = beneficial.mean()
+            meaningful_pct = has_density[has_span].mean() if has_span.any() else 0.0
+            beneficial_count = int(beneficial.sum())
+            hard_excluded = time_span_days < window_days * min_coverage_ratio
+            included, exclusion_reason = self._determine_inclusion(
+                window, coverage_pct, hard_excluded, min_coverage_ratio,
+            )
+            rows.append({
+                "window": window, "window_days": window_days,
+                "coverage_pct": round(coverage_pct, 4),
+                "meaningful_pct": round(meaningful_pct, 4),
+                "beneficial_entities": beneficial_count,
+                "primary_segments": [], "included": included,
+                "exclusion_reason": exclusion_reason, "note": "",
+            })
+        return rows
+
+    def _all_time_row(self, n: int) -> Dict:
+        return {
+            "window": "all_time", "window_days": None,
+            "coverage_pct": 1.0, "meaningful_pct": 1.0,
+            "beneficial_entities": n, "primary_segments": [],
+            "included": True, "exclusion_reason": "", "note": "",
+        }
+
+    def _determine_inclusion(
+        self, window: str, coverage_pct: float, hard_excluded: bool, min_coverage_ratio: float,
+    ) -> Tuple[bool, str]:
+        if window in self._always_include:
+            return True, ""
+        if hard_excluded:
+            return False, f"Excluded: span < {WINDOW_DAYS_MAP[window] * min_coverage_ratio:.0f}d required"
+        if coverage_pct >= self._coverage_threshold:
+            return True, ""
+        return False, f"Coverage {coverage_pct:.1%} < threshold {self._coverage_threshold:.1%}"
+
+    def _annotate_context(self, rows: List[Dict], lifecycles: pd.DataFrame) -> None:
+        self._annotate_segments(rows, lifecycles)
+        self._annotate_seasonality(rows)
+        self._annotate_timing(rows)
+
+    def _annotate_segments(self, rows: List[Dict], lifecycles: pd.DataFrame) -> None:
+        context_lc = self._segment_lifecycles if self._segment_lifecycles is not None else self._quadrant_lifecycles
+        if context_lc is None:
+            return
+        group_col = None
+        if self._segment_lifecycles is not None and "activity_segment" in context_lc.columns:
+            group_col = "activity_segment"
+        if group_col is None and self._quadrant_lifecycles is not None and "lifecycle_quadrant" in self._quadrant_lifecycles.columns:
+            group_col = "lifecycle_quadrant"
+            context_lc = self._quadrant_lifecycles
+        if group_col is None:
+            return
+        duration = context_lc["duration_days"].astype(float)
+        event_count = context_lc["event_count"].astype(float)
+        groups = context_lc[group_col]
+        for row in rows:
+            window_days = row["window_days"]
+            if window_days is None:
+                row["primary_segments"] = sorted(groups.unique().tolist())
+                continue
+            has_span = duration >= window_days
+            expected_events = event_count * (window_days / duration.clip(lower=1))
+            beneficial = has_span & (expected_events >= 2)
+            if not beneficial.any():
+                continue
+            group_coverage = groups[beneficial].value_counts(normalize=True)
+            top = group_coverage[group_coverage >= 0.15].index.tolist()
+            row["primary_segments"] = sorted(top[:3])
+
+    def _annotate_seasonality(self, rows: List[Dict]) -> None:
+        if not self._seasonality_periods:
+            return
+        detected_windows = set()
+        for sp in self._seasonality_periods:
+            if sp.period in SEASONALITY_WINDOW_MAP:
+                detected_windows.add(SEASONALITY_WINDOW_MAP[sp.period])
+        for row in rows:
+            if row["window"] in detected_windows:
+                period_name = next(
+                    (sp.period_name or f"{sp.period}d" for sp in self._seasonality_periods
+                     if SEASONALITY_WINDOW_MAP.get(sp.period) == row["window"]), ""
+                )
+                row["note"] = f"Seasonality detected ({period_name})"
+
+    def _annotate_timing(self, rows: List[Dict]) -> None:
+        if self._inter_event_median is None:
+            return
+        for row in rows:
+            window_days = row["window_days"]
+            if window_days is None:
+                continue
+            ratio = self._inter_event_median / window_days if window_days > 0 else 0
+            if TIMING_TOLERANCE <= ratio <= (1.0 / TIMING_TOLERANCE):
+                existing = row["note"]
+                timing_note = "Timing-aligned (median inter-event)"
+                row["note"] = f"{existing}; {timing_note}" if existing else timing_note
+
+    def _compute_heterogeneity(self, lifecycles: pd.DataFrame, selected_windows: List[str]) -> TemporalHeterogeneityResult:
+        eta_intensity, eta_event = self._compute_eta_squared(lifecycles)
+        level = self._classify_heterogeneity(max(eta_intensity, eta_event))
+        cold_start_frac = self._cold_start_fraction(lifecycles)
+        advisory, rationale = self._build_advisory(level, cold_start_frac, selected_windows, lifecycles)
+        coverage_table = self._build_coverage_table(lifecycles, selected_windows)
+        return TemporalHeterogeneityResult(
+            eta_squared_intensity=eta_intensity,
+            eta_squared_event_count=eta_event,
+            heterogeneity_level=level,
+            segmentation_advisory=advisory,
+            advisory_rationale=rationale,
+            coverage_table=coverage_table,
+        )
+
+    def _compute_eta_squared(self, lifecycles: pd.DataFrame) -> Tuple[float, float]:
+        group_col = "lifecycle_quadrant" if "lifecycle_quadrant" in lifecycles.columns else None
+        if group_col is None:
+            return 0.0, 0.0
+        groups = lifecycles[group_col]
+        if groups.nunique() < 2:
+            return 0.0, 0.0
+        eta_intensity = self._eta_squared_for_variable(lifecycles, "intensity", groups)
+        eta_event = self._eta_squared_for_variable(lifecycles, "event_count", groups)
+        return eta_intensity, eta_event
+
+    def _eta_squared_for_variable(self, df: pd.DataFrame, var: str, groups: pd.Series) -> float:
+        if var not in df.columns:
+            return 0.0
+        values = df[var].astype(float)
+        grand_mean = values.mean()
+        ss_total = ((values - grand_mean) ** 2).sum()
+        if ss_total == 0:
+            return 0.0
+        ss_between = 0.0
+        for _, group_vals in values.groupby(groups):
+            n_k = len(group_vals)
+            mean_k = group_vals.mean()
+            ss_between += n_k * (mean_k - grand_mean) ** 2
+        return float(ss_between / ss_total)
+
+    def _classify_heterogeneity(self, eta_max: float) -> str:
+        if eta_max < 0.06:
+            return "low"
+        if eta_max < 0.14:
+            return "moderate"
+        return "high"
+
+    def _cold_start_fraction(self, lifecycles: pd.DataFrame) -> float:
+        cold_labels = {"One-shot", "One-time"}
+        cold_count = 0
+        for col in ("lifecycle_quadrant", "activity_segment"):
+            if col in lifecycles.columns:
+                cold_count = max(cold_count, lifecycles[col].isin(cold_labels).sum())
+        return cold_count / len(lifecycles) if len(lifecycles) > 0 else 0.0
+
+    def _build_advisory(
+        self, level: str, cold_start_frac: float, selected_windows: List[str], lifecycles: pd.DataFrame,
+    ) -> Tuple[str, List[str]]:
+        rationale: List[str] = []
+        if level == "low":
+            rationale.append("Low temporal diversity across quadrants")
+            rationale.append("Union strategy loses minimal signal")
+            return "single_model", rationale
+        if level == "high" and cold_start_frac > 0.30:
+            rationale.append("High temporal diversity across quadrants")
+            rationale.append(f"Large cold-start population ({cold_start_frac:.0%} One-time/One-shot)")
+            rationale.append("Consider separate handling for entities with vs without history")
+            return "consider_separate_models", rationale
+        rationale.append(f"{level.capitalize()} temporal diversity across quadrants")
+        rationale.append("Union windows still pragmatic for feature engineering")
+        rationale.append("Model may benefit from knowing entity's engagement pattern")
+        return "consider_segment_feature", rationale
+
+    def _build_coverage_table(self, lifecycles: pd.DataFrame, selected_windows: List[str]) -> pd.DataFrame:
+        duration = lifecycles["duration_days"].astype(float)
+        event_count = lifecycles["event_count"].astype(float)
+        rows = []
+        for window in selected_windows:
+            window_days = WINDOW_DAYS_MAP.get(window)
+            if window_days is None:
+                rows.append({"window": "all_time", "coverage_pct": 1.0, "meaningful_pct": 1.0, "zero_risk_pct": 0.0})
+                continue
+            has_span = duration >= window_days
+            coverage = has_span.mean()
+            expected_events = event_count * (window_days / duration.clip(lower=1))
+            meaningful = (has_span & (expected_events >= 2)).mean()
+            zero_risk = 1.0 - meaningful
+            rows.append({
+                "window": window,
+                "coverage_pct": round(float(coverage), 4),
+                "meaningful_pct": round(float(meaningful), 4),
+                "zero_risk_pct": round(float(zero_risk), 4),
+            })
+        return pd.DataFrame(rows)
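Two notes on the collector above, then a hypothetical usage sketch. The heterogeneity check is a one-way eta squared (η² = SS_between / SS_total) of intensity and event_count across lifecycle quadrants, and the 0.06 / 0.14 cutoffs in _classify_heterogeneity match Cohen's conventional benchmarks for medium and large effects. The frame below is invented: duration_days and event_count are the columns compute_union requires, while lifecycle_quadrant and intensity additionally feed the η² computation.

import pandas as pd

from customer_retention.stages.profiling.window_recommendation import WindowRecommendationCollector

# Invented per-entity lifecycle summary. duration_days/event_count drive window
# coverage; lifecycle_quadrant and intensity feed the eta-squared heterogeneity check.
lifecycles = pd.DataFrame({
    "duration_days": [400.0, 90.0, 5.0, 250.0, 30.0],
    "event_count": [120.0, 18.0, 2.0, 60.0, 4.0],
    "lifecycle_quadrant": ["Power", "Casual", "One-shot", "Power", "Casual"],
    "intensity": [0.30, 0.20, 0.40, 0.24, 0.13],
})

collector = WindowRecommendationCollector(coverage_threshold=0.10)
result = collector.compute_union(lifecycles, time_span_days=420, value_columns=6)

print(result.windows)                 # selected windows, e.g. ['7d', ..., 'all_time']
print(result.feature_count_estimate)  # value_columns * agg_funcs * n_windows + n_windows
print(result.explanation[["window", "coverage_pct", "included", "exclusion_reason"]])
print(result.heterogeneity.heterogeneity_level, result.heterogeneity.segmentation_advisory)

With these invented numbers, 365d should be hard-excluded because the 420-day span is under min_coverage_ratio × 365 days, while all_time is always included by construction (and sits in the always_include default).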