churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,703 @@
1
+ """TemporalFeatureEngineer - temporal feature engineering with lagged windows.
2
+
3
+ Generates features across 7 groups:
4
+ 1. Lagged Windows - Sequential non-overlapping time windows
5
+ 2. Velocity - Rate of change between windows
6
+ 3. Acceleration - Change in velocity (momentum)
7
+ 4. Lifecycle - Beginning/Middle/End of customer history
8
+ 5. Recency - Days since last/first event, tenure
9
+ 6. Regularity - Frequency and consistency patterns
10
+ 7. Cohort Comparison - Customer vs cohort averages
11
+
12
+ Key Concepts:
13
+ Per-Customer Alignment: Each customer's features are computed relative to
14
+ their own reference point (e.g., churn date, last activity), making
15
+ historical churners comparable to current active customers.
16
+
17
+ Lagged Windows: Sequential non-overlapping windows (Lag0=most recent,
18
+ Lag1=previous period, etc.) enable velocity/acceleration computation.
19
+ """
20
+
21
+ from dataclasses import dataclass, field
22
+ from datetime import datetime
23
+ from enum import Enum
24
+ from typing import Any, Dict, List, Optional
25
+
26
+ import numpy as np
27
+
28
+ from customer_retention.core.compat import pd
29
+
30
+
31
class ReferenceMode(Enum):
    """Strategy for choosing the reference point used in temporal alignment."""

    # Every customer is aligned to its own reference date.
    PER_CUSTOMER = "per_customer"

    # One shared reference date is applied to all customers.
    GLOBAL_DATE = "global_date"
35
+
36
+
37
class FeatureGroup(Enum):
    """The seven families of temporal features this module can produce."""

    # Sequential, non-overlapping time windows.
    LAGGED_WINDOWS = "lagged_windows"

    # Rate of change between windows.
    VELOCITY = "velocity"

    # Change in velocity (momentum).
    ACCELERATION = "acceleration"

    # Beginning / middle / end of the customer history.
    LIFECYCLE = "lifecycle"

    # Days since last/first event, tenure.
    RECENCY = "recency"

    # Frequency and consistency patterns.
    REGULARITY = "regularity"

    # Customer versus cohort averages.
    COHORT_COMPARISON = "cohort_comparison"
46
+
47
+
48
@dataclass
class TemporalAggregationConfig:
    """Settings that select and parameterize the temporal feature groups.

    Field order is part of the constructor signature and must not change.
    Each mutable default is produced by a factory, so instances never share
    list objects.
    """

    # --- Reference point alignment -------------------------------------
    reference_mode: ReferenceMode = ReferenceMode.PER_CUSTOMER
    # Presumably only consulted when reference_mode is GLOBAL_DATE -- the
    # consuming code is not visible here; confirm in _get_reference_dates.
    global_reference_date: Optional[datetime] = None

    # --- Group 1: lagged windows ----------------------------------------
    lag_window_days: int = 30
    num_lags: int = 4
    lag_aggregations: List[str] = field(
        default_factory=lambda: list(("sum", "mean", "count", "max"))
    )

    # --- Groups 2-3: velocity / acceleration ----------------------------
    compute_velocity: bool = True
    compute_acceleration: bool = True

    # --- Group 4: lifecycle windows -------------------------------------
    compute_lifecycle: bool = True
    lifecycle_splits: List[float] = field(
        default_factory=lambda: list((0.33, 0.33, 0.34))
    )
    # NOTE(review): looks like a minimum-history threshold for lifecycle
    # features; its use is outside this view -- confirm.
    min_history_days: int = 60

    # --- Group 5: recency / tenure --------------------------------------
    compute_recency: bool = True

    # --- Group 6: frequency / regularity --------------------------------
    compute_regularity: bool = True

    # --- Group 7: cohort comparison -------------------------------------
    compute_cohort: bool = True
78
+
79
+
80
@dataclass
class FeatureGroupResult:
    """Summary of one feature group: what it produced and why it exists."""

    # Which of the feature families this record describes.
    group: FeatureGroup
    # Names of the feature columns belonging to the group.
    features: List[str]
    # Human-readable justification shown in the feature catalog.
    rationale: str
    # False when the group was switched off in the configuration.
    enabled: bool = True
87
+
88
+
89
@dataclass
class TemporalFeatureResult:
    """Container for the output of a temporal feature computation.

    Holds the engineered feature table together with per-group metadata,
    the configuration that produced it, and the column names that were used.
    """

    features_df: pd.DataFrame
    feature_groups: List[FeatureGroupResult]
    config: TemporalAggregationConfig
    entity_col: str
    value_cols: List[str]

    def get_catalog(self, max_features_per_group: int = 10) -> str:
        """Build a formatted, human-readable feature catalog.

        Disabled groups are omitted. For each enabled group the catalog shows
        the group name, feature count, rationale, and a preview of its
        feature names.

        Args:
            max_features_per_group: Maximum number of feature names listed
                per group before the remainder is summarized as
                "... and N more". Defaults to 10 (the historical behavior);
                negative values are treated as 0.

        Returns:
            The catalog as a single newline-joined string.
        """
        limit = max(max_features_per_group, 0)
        rule = "=" * 80
        lines = [rule, "TEMPORAL FEATURE CATALOG", rule]

        for group_result in self.feature_groups:
            if not group_result.enabled:
                continue

            features = group_result.features
            lines.append("")
            lines.append(
                f"GROUP: {group_result.group.value.upper()} ({len(features)} features)"
            )
            lines.append(f"Rationale: {group_result.rationale}")
            lines.append("-" * 60)

            lines.extend(f" - {feat}" for feat in features[:limit])
            hidden = len(features) - limit
            if hidden > 0:
                lines.append(f" ... and {hidden} more")

        lines.append("")
        lines.append(rule)
        return "\n".join(lines)

    def to_dict(self) -> Dict[str, Any]:
        """Return a compact summary of the result.

        Returns:
            A dict with the number of feature columns (the entity column is
            not counted), the number of rows in the feature table, and one
            entry per feature group with its name, feature count, and
            enabled flag.
        """
        # Count by name rather than subtracting 1: the old arithmetic was
        # off by one (and could go negative) whenever the entity column was
        # not present as a regular column, e.g. when it is the index.
        n_features = sum(
            1 for column in self.features_df.columns if column != self.entity_col
        )
        return {
            "n_features": n_features,
            "n_entities": len(self.features_df),
            "feature_groups": [
                {"group": g.group.value, "n_features": len(g.features), "enabled": g.enabled}
                for g in self.feature_groups
            ],
        }
132
+
133
+
134
class TemporalFeatureEngineer:
    """Build temporal features from event data, aligned per customer.

    Feature groups and their column naming patterns:
        1. Lagged Windows - lag{N}_{metric}_{agg}
        2. Velocity - {metric}_velocity, {metric}_velocity_pct
        3. Acceleration - {metric}_acceleration, {metric}_momentum
        4. Lifecycle - {metric}_beginning, {metric}_middle, {metric}_end, {metric}_trend_ratio
        5. Recency - days_since_last_event, days_since_first_event, active_span_days
        6. Regularity - event_frequency, inter_event_gap_mean, regularity_score
        7. Cohort - {metric}_vs_cohort_mean, {metric}_vs_cohort_pct
    """

    # One explanatory sentence per feature group; surfaced through
    # FeatureGroupResult.rationale for both enabled and disabled groups.
    RATIONALES = {
        FeatureGroup.LAGGED_WINDOWS:
            "Capture behavior at sequential time horizons to enable trend detection",
        FeatureGroup.VELOCITY:
            "Rate of change is the #1 churn predictor - declining engagement signals risk",
        FeatureGroup.ACCELERATION:
            "Is the decline accelerating or stabilizing? Indicates intervention urgency",
        FeatureGroup.LIFECYCLE:
            "Customer lifecycle patterns reveal engagement trajectory over full history",
        FeatureGroup.RECENCY:
            "How recently active and tenure are fundamental churn signals",
        FeatureGroup.REGULARITY:
            "Consistent patterns indicate habit formation; irregular patterns suggest weak retention",
        FeatureGroup.COHORT_COMPARISON:
            "Compare customer to peers - is their behavior normal or anomalous?",
    }

    def __init__(self, config: Optional[TemporalAggregationConfig] = None):
        """Store the given configuration, or a default one when it is falsy."""
        self.config = config if config else TemporalAggregationConfig()
159
+
160
def compute(
    self,
    events_df: pd.DataFrame,
    entity_col: str,
    time_col: str,
    value_cols: List[str],
    reference_dates: Optional[pd.DataFrame] = None,
    reference_col: Optional[str] = None,
) -> TemporalFeatureResult:
    """Compute every configured temporal feature group for all entities.

    The input frame is copied before its time column is converted with
    ``pd.to_datetime``, so the caller's frame is not modified here.
    Lagged windows are always computed; every other group is controlled by
    its ``compute_*`` flag on ``self.config``. Acceleration additionally
    requires velocity to be enabled. A disabled group is still reported in
    the result, with an empty feature list and ``enabled=False``.

    Args:
        events_df: Event-level data with timestamps.
        entity_col: Column identifying entities (e.g., customer_id).
        time_col: Column with event timestamps.
        value_cols: Columns to aggregate (e.g., amount, quantity).
        reference_dates: DataFrame with entity and reference date columns.
        reference_col: Column name for reference date in reference_dates.

    Returns:
        TemporalFeatureResult with the combined features DataFrame and
        per-group metadata.
    """
    events = events_df.copy()
    events[time_col] = pd.to_datetime(events[time_col])

    # Reference date per entity, resolved once and shared by the groups.
    anchors = self._get_reference_dates(
        events, entity_col, time_col, reference_dates, reference_col
    )

    frames = []     # feature frames, in group order
    summaries = []  # one FeatureGroupResult per group, enabled or not

    def add_group(group, enabled, build, **drop_options):
        """Run ``build`` when enabled, else record a disabled placeholder."""
        if not enabled:
            summaries.append(FeatureGroupResult(
                group=group, features=[],
                rationale=self.RATIONALES[group], enabled=False
            ))
            return
        frame, summary = build()
        # The entity column is kept only on the first (lag) frame.
        frames.append(frame.drop(columns=[entity_col], **drop_options))
        summaries.append(summary)

    # Group 1: Lagged Windows (unconditional; later groups build on it).
    lag_frame, lag_summary = self._compute_lagged_windows(
        events, entity_col, time_col, value_cols, anchors
    )
    frames.append(lag_frame)
    summaries.append(lag_summary)

    # Group 2: Velocity
    add_group(
        FeatureGroup.VELOCITY,
        self.config.compute_velocity,
        lambda: self._compute_velocity(lag_frame, value_cols),
    )

    # Group 3: Acceleration. It is fed the velocity frame as stored in
    # ``frames`` (entity column already dropped), falling back to the lag
    # frame if no second frame exists.
    add_group(
        FeatureGroup.ACCELERATION,
        self.config.compute_acceleration and self.config.compute_velocity,
        lambda: self._compute_acceleration(
            frames[1] if len(frames) > 1 else lag_frame,
            lag_frame, value_cols, entity_col
        ),
        errors='ignore',
    )

    # Group 4: Lifecycle
    add_group(
        FeatureGroup.LIFECYCLE,
        self.config.compute_lifecycle,
        lambda: self._compute_lifecycle(
            events, entity_col, time_col, value_cols, anchors
        ),
    )

    # Group 5: Recency
    add_group(
        FeatureGroup.RECENCY,
        self.config.compute_recency,
        lambda: self._compute_recency(events, entity_col, time_col, anchors),
    )

    # Group 6: Regularity
    add_group(
        FeatureGroup.REGULARITY,
        self.config.compute_regularity,
        lambda: self._compute_regularity(events, entity_col, time_col, anchors),
    )

    # Group 7: Cohort Comparison
    add_group(
        FeatureGroup.COHORT_COMPARISON,
        self.config.compute_cohort,
        lambda: self._compute_cohort_comparison(lag_frame, value_cols, entity_col),
    )

    # Combine everything into one frame. Frames still carrying the entity
    # column are joined on it; the rest are attached positionally.
    # NOTE(review): the positional concat assumes every group returns rows
    # in the same entity order as the lag frame -- confirm in the helpers.
    combined = frames[0]
    for extra in frames[1:]:
        if entity_col in extra.columns:
            combined = combined.merge(extra, on=entity_col, how="left")
            continue
        combined = pd.concat(
            [combined.reset_index(drop=True), extra.reset_index(drop=True)],
            axis=1,
        )

    return TemporalFeatureResult(
        features_df=combined,
        feature_groups=summaries,
        config=self.config,
        entity_col=entity_col,
        value_cols=value_cols,
    )
296
+
297
def _get_reference_dates(
    self,
    events_df: pd.DataFrame,
    entity_col: str,
    time_col: str,
    reference_dates: Optional[pd.DataFrame],
    reference_col: Optional[str],
) -> pd.DataFrame:
    """Build the per-entity reference-date table.

    Resolution order:
      1. GLOBAL_DATE mode: every entity seen in ``events_df`` gets the
         configured global date (or the current time when none is set).
      2. Caller-supplied ``reference_dates`` together with ``reference_col``:
         that column is renamed and parsed with ``pd.to_datetime``.
      3. Otherwise: each entity's latest ``time_col`` value.

    Returns:
        DataFrame with columns ``[entity_col, "reference_date"]``.
    """
    output_columns = [entity_col, "reference_date"]
    known_entities = events_df[entity_col].unique()

    if self.config.reference_mode == ReferenceMode.GLOBAL_DATE:
        shared_date = self.config.global_reference_date
        if not shared_date:
            shared_date = datetime.now()
        return pd.DataFrame({
            entity_col: known_entities,
            "reference_date": shared_date,
        })

    if reference_dates is None or reference_col is None:
        # Fallback: anchor every entity on its own most recent event.
        last_seen = events_df.groupby(entity_col)[time_col].max().reset_index()
        last_seen.columns = output_columns
        return last_seen

    supplied = reference_dates[[entity_col, reference_col]].copy()
    supplied.columns = output_columns
    supplied["reference_date"] = pd.to_datetime(supplied["reference_date"])
    return supplied
325
+
326
def _compute_lagged_windows(
    self,
    events_df: pd.DataFrame,
    entity_col: str,
    time_col: str,
    value_cols: List[str],
    ref_dates: pd.DataFrame,
) -> tuple:
    """Aggregate each value column over consecutive look-back windows (Group 1).

    Window ``k`` covers events whose whole-day offset before the entity's
    reference date lies in ``[k * lag_window_days, (k + 1) * lag_window_days)``.
    Events dated after the reference date have a negative offset and
    therefore fall into no window.

    Returns:
        ``(features, group_result)`` where ``features`` has one row per row of
        ``ref_dates`` and one ``lag{k}_{col}_{agg}`` column per combination.
        Count features are 0-filled integers; every other aggregation stays
        NaN for entities with no events in the window.
    """
    span = self.config.lag_window_days

    # Attach the reference date to every event and measure how far back it is.
    anchored = events_df.merge(ref_dates, on=entity_col)
    anchored["days_before_ref"] = (
        anchored["reference_date"] - anchored[time_col]
    ).dt.days
    age = anchored["days_before_ref"]

    features = ref_dates[[entity_col]].copy()
    produced = []

    for lag_index in range(self.config.num_lags):
        lower = lag_index * span
        upper = (lag_index + 1) * span
        in_window = anchored[(age >= lower) & (age < upper)]
        per_entity_groups = in_window.groupby(entity_col)

        for value_col in value_cols:
            for how in self.config.lag_aggregations:
                label = f"lag{lag_index}_{value_col}_{how}"
                produced.append(label)

                is_count = how == "count"
                if is_count:
                    aggregated = per_entity_groups[value_col].count()
                else:
                    aggregated = per_entity_groups[value_col].agg(how)
                aggregated = aggregated.reset_index()
                aggregated.columns = [entity_col, label]

                features = features.merge(aggregated, on=entity_col, how="left")
                if is_count:
                    # No events in the window means a count of zero, not "unknown".
                    features[label] = features[label].fillna(0).astype(int)

    summary = FeatureGroupResult(
        group=FeatureGroup.LAGGED_WINDOWS,
        features=produced,
        rationale=self.RATIONALES[FeatureGroup.LAGGED_WINDOWS],
    )
    return features, summary
381
+
382
def _compute_velocity(
    self,
    lag_features: pd.DataFrame,
    value_cols: List[str],
) -> tuple:
    """Derive rate-of-change features from the two newest lag sums (Group 2).

    For each value column that has both ``lag0_{col}_sum`` and
    ``lag1_{col}_sum`` in ``lag_features``:

    * ``{col}_velocity``     = (lag0 - lag1) / lag_window_days
    * ``{col}_velocity_pct`` = (lag0 - lag1) / lag1, NaN where lag1 is 0

    The first column of ``lag_features`` is taken as the entity key.

    Returns:
        ``(features, group_result)``.
    """
    id_col = lag_features.columns[0]
    velocities = lag_features[[id_col]].copy()
    produced = []
    span = self.config.lag_window_days

    for value_col in value_cols:
        recent_key = f"lag0_{value_col}_sum"
        prior_key = f"lag1_{value_col}_sum"
        if recent_key not in lag_features.columns or prior_key not in lag_features.columns:
            continue

        prior = lag_features[prior_key]
        delta = lag_features[recent_key] - prior

        rate_name = f"{value_col}_velocity"
        velocities[rate_name] = delta / span
        produced.append(rate_name)

        pct_name = f"{value_col}_velocity_pct"
        velocities[pct_name] = np.where(prior != 0, delta / prior, np.nan)
        produced.append(pct_name)

    summary = FeatureGroupResult(
        group=FeatureGroup.VELOCITY,
        features=produced,
        rationale=self.RATIONALES[FeatureGroup.VELOCITY],
    )
    return velocities, summary
421
+
422
def _compute_acceleration(
    self,
    velocity_features: pd.DataFrame,
    lag_features: pd.DataFrame,
    value_cols: List[str],
    entity_col: str,
) -> tuple:
    """Compute acceleration and momentum features (Group 3).

    For each value column:

    * ``{col}_acceleration`` = velocity(lag0, lag1) - velocity(lag1, lag2),
      emitted only when the lag0, lag1 and lag2 sum columns all exist.
    * ``{col}_momentum`` = lag0 sum * ``{col}_velocity``, emitted only when
      the lag0 sum and the velocity column exist.

    Args:
        velocity_features: Frame holding the ``{col}_velocity`` columns; it is
            combined with ``lag_features`` by index alignment.
        lag_features: Frame holding ``entity_col`` and the ``lag{k}_{col}_sum``
            columns.
        value_cols: Value columns to derive features for.
        entity_col: Name of the entity key column.

    Returns:
        ``(features, group_result)``.
    """
    result = lag_features[[entity_col]].copy()
    feature_names = []
    window_days = self.config.lag_window_days
    lag_columns = lag_features.columns

    for col in value_cols:
        velocity_col = f"{col}_velocity"
        lag0_col = f"lag0_{col}_sum"
        lag1_col = f"lag1_{col}_sum"
        lag2_col = f"lag2_{col}_sum"
        has_lag0 = lag0_col in lag_columns

        # Acceleration = change in velocity. All three lag sums are read, so
        # all three must be present (lag0 was previously indexed unguarded).
        if has_lag0 and lag1_col in lag_columns and lag2_col in lag_columns:
            velocity_01 = (lag_features[lag0_col] - lag_features[lag1_col]) / window_days
            velocity_12 = (lag_features[lag1_col] - lag_features[lag2_col]) / window_days
            accel_name = f"{col}_acceleration"
            result[accel_name] = velocity_01 - velocity_12
            feature_names.append(accel_name)

        # Momentum = Lag0 x Velocity
        if has_lag0 and velocity_col in velocity_features.columns:
            momentum_name = f"{col}_momentum"
            result[momentum_name] = lag_features[lag0_col] * velocity_features[velocity_col]
            feature_names.append(momentum_name)

    group_result = FeatureGroupResult(
        group=FeatureGroup.ACCELERATION,
        features=feature_names,
        rationale=self.RATIONALES[FeatureGroup.ACCELERATION],
    )

    return result, group_result
461
+
462
def _compute_lifecycle(
    self,
    events_df: pd.DataFrame,
    entity_col: str,
    time_col: str,
    value_cols: List[str],
    ref_dates: pd.DataFrame,
) -> tuple:
    """Compute lifecycle features (Group 4): Beginning/Middle/End.

    Each entity's own history (first event to last event) is cut into three
    consecutive phases using the first two fractions of
    ``config.lifecycle_splits``; the value columns are summed per phase.

    Emitted per value column:

    * ``{col}_beginning``, ``{col}_middle``, ``{col}_end``: phase sums.
    * ``{col}_trend_ratio``: end / beginning, only when beginning > 0.

    Entities whose history is shorter than ``config.min_history_days`` (in
    whole days), or that have no events, keep NaN in every column.

    Args:
        events_df: Event-level data.
        entity_col: Name of the entity key column.
        time_col: Name of the event timestamp column.
        value_cols: Value columns to summarise.
        ref_dates: Only its ``entity_col`` is used, to define the output rows.

    Returns:
        ``(features, group_result)`` with one row per row of ``ref_dates``.
    """
    result = ref_dates[[entity_col]].copy()
    min_days = self.config.min_history_days
    splits = self.config.lifecycle_splits

    suffixes = ("beginning", "middle", "end", "trend_ratio")
    feature_names = [f"{col}_{suffix}" for col in value_cols for suffix in suffixes]

    # feature name -> {entity: value}; entities absent from a mapping stay NaN.
    values_by_feature = {name: {} for name in feature_names}

    # One grouped pass over the events instead of re-filtering the whole
    # frame (and the whole result) once per entity.
    relevant = events_df[events_df[entity_col].isin(result[entity_col])]
    for entity, entity_df in relevant.groupby(entity_col, sort=False):
        if entity_df.empty:
            continue

        event_times = entity_df[time_col]
        first_event = event_times.min()
        history_days = (event_times.max() - first_event).days

        # Skip if insufficient history
        if history_days < min_days:
            continue

        # Calculate split boundaries
        split1 = first_event + pd.Timedelta(days=history_days * splits[0])
        split2 = first_event + pd.Timedelta(days=history_days * (splits[0] + splits[1]))

        in_beginning = event_times < split1
        in_middle = (event_times >= split1) & (event_times < split2)
        in_end = event_times >= split2

        for col in value_cols:
            values = entity_df[col]
            beginning_val = values[in_beginning].sum()
            middle_val = values[in_middle].sum()
            end_val = values[in_end].sum()

            values_by_feature[f"{col}_beginning"][entity] = beginning_val
            values_by_feature[f"{col}_middle"][entity] = middle_val
            values_by_feature[f"{col}_end"][entity] = end_val

            if beginning_val > 0:
                values_by_feature[f"{col}_trend_ratio"][entity] = end_val / beginning_val

    for name in feature_names:
        result[name] = result[entity_col].map(values_by_feature[name]).astype(float)

    group_result = FeatureGroupResult(
        group=FeatureGroup.LIFECYCLE,
        features=feature_names,
        rationale=self.RATIONALES[FeatureGroup.LIFECYCLE],
    )

    return result, group_result
538
+
539
def _compute_recency(
    self,
    events_df: pd.DataFrame,
    entity_col: str,
    time_col: str,
    ref_dates: pd.DataFrame,
) -> tuple:
    """Compute recency and tenure features (Group 5).

    Emitted columns (whole days unless noted):

    * ``days_since_last_event``: reference date - last event.
    * ``days_since_first_event``: reference date - first event (tenure).
    * ``active_span_days``: last event - first event.
    * ``recency_ratio``: days_since_last / (active_span + days_since_last),
      clipped to [0, 1]; 0 = just active, 1 = dormant. It is 0 when the
      denominator is not positive.

    Args:
        events_df: Event-level data.
        entity_col: Name of the entity key column.
        time_col: Name of the event timestamp column.
        ref_dates: Frame with ``entity_col`` and ``reference_date`` columns;
            it defines the output rows.

    Returns:
        ``(features, group_result)`` with one row per row of ``ref_dates``.
    """
    # Get first and last event per entity
    event_stats = events_df.groupby(entity_col)[time_col].agg(["min", "max"]).reset_index()
    event_stats.columns = [entity_col, "first_event", "last_event"]

    # A single left merge keeps exactly one output row per ref_dates row.
    # Re-merging ref_dates onto itself would multiply rows for any entity
    # listed more than once.
    result = ref_dates[[entity_col, "reference_date"]].merge(
        event_stats, on=entity_col, how="left"
    )

    # Days since last event (from reference date)
    result["days_since_last_event"] = (
        result["reference_date"] - result["last_event"]
    ).dt.days

    # Days since first event (tenure)
    result["days_since_first_event"] = (
        result["reference_date"] - result["first_event"]
    ).dt.days

    # Active span (first to last event)
    result["active_span_days"] = (
        result["last_event"] - result["first_event"]
    ).dt.days

    # Guard on the actual denominator. Guarding on active_span_days alone
    # scored a single-event entity as 0 ("just active") no matter how long
    # ago that event happened.
    observed_days = result["active_span_days"] + result["days_since_last_event"]
    result["recency_ratio"] = np.where(
        observed_days > 0,
        result["days_since_last_event"] / observed_days,
        0
    )
    result["recency_ratio"] = result["recency_ratio"].clip(0, 1)

    # Clean up
    result = result.drop(columns=["first_event", "last_event", "reference_date"])

    feature_names = [
        "days_since_last_event", "days_since_first_event",
        "active_span_days", "recency_ratio"
    ]

    group_result = FeatureGroupResult(
        group=FeatureGroup.RECENCY,
        features=feature_names,
        rationale=self.RATIONALES[FeatureGroup.RECENCY],
    )

    return result, group_result
596
+
597
def _compute_regularity(
    self,
    events_df: pd.DataFrame,
    entity_col: str,
    time_col: str,
    ref_dates: pd.DataFrame,
) -> tuple:
    """Compute frequency and regularity features (Group 6).

    Emitted columns, filled only for entities with at least two events:

    * ``event_frequency``: events per 30 days over the first-to-last span
      (the raw event count when that span is under one day).
    * ``inter_event_gap_mean`` / ``_std`` / ``_max``: statistics of the
      whole-day gaps between consecutive events (std is 0 for a single gap).
    * ``regularity_score``: ``max(0, 1 - std / mean)``, or 1.0 when the mean
      gap is 0; higher = more regular.

    Args:
        events_df: Event-level data.
        entity_col: Name of the entity key column.
        time_col: Name of the event timestamp column.
        ref_dates: Only its ``entity_col`` is used, to define the output rows.

    Returns:
        ``(features, group_result)`` with one row per row of ``ref_dates``;
        entities with fewer than two events keep NaN.
    """
    result = ref_dates[[entity_col]].copy()

    feature_names = [
        "event_frequency", "inter_event_gap_mean", "inter_event_gap_std",
        "inter_event_gap_max", "regularity_score"
    ]
    # feature name -> {entity: value}; entities absent from a mapping stay NaN.
    values_by_feature = {name: {} for name in feature_names}

    # Sort once and walk the groups, instead of filtering and sorting the
    # whole frame again for every entity.
    relevant = events_df[events_df[entity_col].isin(result[entity_col])]
    ordered = relevant.sort_values(time_col)

    for entity, entity_events in ordered.groupby(entity_col, sort=False):
        if len(entity_events) < 2:
            continue

        event_times = entity_events[time_col]

        # Inter-event gaps
        gaps = event_times.diff().dt.days.dropna()
        if len(gaps) == 0:
            continue

        gap_mean = gaps.mean()
        gap_std = gaps.std() if len(gaps) > 1 else 0

        # Event frequency (events per 30 days)
        total_days = (event_times.max() - event_times.min()).days
        if total_days > 0:
            frequency = len(entity_events) / total_days * 30
        else:
            frequency = len(entity_events)

        # Regularity score: 1 - (std / mean), higher = more regular
        if gap_mean > 0:
            regularity = max(0, 1 - (gap_std / gap_mean))
        else:
            regularity = 1.0

        values_by_feature["event_frequency"][entity] = frequency
        values_by_feature["inter_event_gap_mean"][entity] = gap_mean
        values_by_feature["inter_event_gap_std"][entity] = gap_std
        values_by_feature["inter_event_gap_max"][entity] = gaps.max()
        values_by_feature["regularity_score"][entity] = regularity

    for name in feature_names:
        result[name] = result[entity_col].map(values_by_feature[name]).astype(float)

    group_result = FeatureGroupResult(
        group=FeatureGroup.REGULARITY,
        features=feature_names,
        rationale=self.RATIONALES[FeatureGroup.REGULARITY],
    )

    return result, group_result
659
+
660
def _compute_cohort_comparison(
    self,
    lag_features: pd.DataFrame,
    value_cols: List[str],
    entity_col: str,
) -> tuple:
    """Position each entity's newest-window sum against the cohort (Group 7).

    The cohort is every row of ``lag_features``. For each value column that
    has a ``lag0_{col}_sum`` column:

    * ``{col}_vs_cohort_mean``: value minus the cohort mean.
    * ``{col}_vs_cohort_pct``: value divided by the cohort mean (NaN when the
      mean is 0).
    * ``{col}_cohort_zscore``: standard deviations from the mean; emitted
      only when the cohort standard deviation is greater than 0.

    Returns:
        ``(features, group_result)``.
    """
    comparison = lag_features[[entity_col]].copy()
    produced = []

    for value_col in value_cols:
        source_key = f"lag0_{value_col}_sum"
        if source_key not in lag_features.columns:
            continue

        current = lag_features[source_key]
        center = current.mean()
        spread = current.std()

        # Absolute distance from the cohort average.
        diff_name = f"{value_col}_vs_cohort_mean"
        comparison[diff_name] = current - center
        produced.append(diff_name)

        # Relative size compared with the cohort average.
        share_name = f"{value_col}_vs_cohort_pct"
        comparison[share_name] = np.where(center != 0, current / center, np.nan)
        produced.append(share_name)

        # Standardised distance; left out when the cohort has no spread.
        if spread > 0:
            z_name = f"{value_col}_cohort_zscore"
            comparison[z_name] = (current - center) / spread
            produced.append(z_name)

    summary = FeatureGroupResult(
        group=FeatureGroup.COHORT_COMPARISON,
        features=produced,
        rationale=self.RATIONALES[FeatureGroup.COHORT_COMPARISON],
    )
    return comparison, summary