churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,551 @@
1
+ """
2
+ Feature engineering pipeline for customer retention analysis.
3
+
4
+ This module provides the FeatureEngineer class that orchestrates
5
+ all feature generation components.
6
+ """
7
+
8
+ from dataclasses import dataclass
9
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
10
+
11
+ from customer_retention.core.compat import DataFrame, Timestamp
12
+
13
+ if TYPE_CHECKING:
14
+ from customer_retention.integrations.feature_store.registry import FeatureRegistry
15
+ from customer_retention.stages.features.behavioral_features import BehavioralFeatureGenerator
16
+ from customer_retention.stages.features.feature_definitions import (
17
+ FeatureCatalog,
18
+ FeatureCategory,
19
+ FeatureDefinition,
20
+ LeakageRisk,
21
+ )
22
+ from customer_retention.stages.features.interaction_features import InteractionFeatureGenerator
23
+ from customer_retention.stages.features.temporal_features import (
24
+ ReferenceDateSource,
25
+ TemporalFeatureGenerator,
26
+ )
27
+ from customer_retention.stages.temporal.point_in_time_join import PointInTimeJoiner
28
+
29
+
30
@dataclass
class FeatureEngineerConfig:
    """
    Settings that drive the feature engineering pipeline.

    Every field has a default, so ``FeatureEngineerConfig()`` is a valid
    configuration on its own.

    Parameters
    ----------
    reference_date : Timestamp, optional
        Reference date for temporal calculations.
    generate_temporal : bool, default True
        Whether to generate temporal features.
    generate_behavioral : bool, default True
        Whether to generate behavioral features.
    generate_interaction : bool, default True
        Whether to generate interaction features.
    created_column : str, optional
        Column name for account creation date.
    first_order_column : str, optional
        Column name for first order date.
    last_order_column : str, optional
        Column name for last order date.
    tenure_months_column : str, optional
        Column name for tenure in months (if pre-computed).
    total_orders_column : str, optional
        Column name for total orders.
    emails_sent_column : str, optional
        Column name for emails sent.
    open_rate_column : str, optional
        Column name for email open rate.
    click_rate_column : str, optional
        Column name for email click rate.
    service_columns : List[str], optional
        List of binary service adoption columns.
    interaction_combinations : List[Tuple], optional
        List of feature combinations for interaction features.
    interaction_ratios : List[Tuple], optional
        List of ratio features for interaction features.
    populate_catalog : bool, default False
        Whether to populate feature catalog with definitions.
    preserve_original : bool, default True
        Whether to preserve original columns.
    id_column : str, optional
        Column name for customer ID (always preserved).
    enforce_point_in_time : bool, default True
        Whether to enforce point-in-time validation.
    feature_timestamp_column : str, optional
        Column name for feature observation timestamp.
    """

    # Reference point and per-generator switches.
    reference_date: Optional[Timestamp] = None
    generate_temporal: bool = True
    generate_behavioral: bool = True
    generate_interaction: bool = True

    # Date columns.
    created_column: Optional[str] = None
    first_order_column: Optional[str] = None
    last_order_column: Optional[str] = None

    # Tenure, order, email and service columns.
    tenure_months_column: Optional[str] = None
    total_orders_column: Optional[str] = None
    emails_sent_column: Optional[str] = None
    open_rate_column: Optional[str] = None
    click_rate_column: Optional[str] = None
    service_columns: Optional[List[str]] = None

    # Interaction feature specifications.
    interaction_combinations: Optional[List[Tuple[str, str, str, str]]] = None
    interaction_ratios: Optional[List[Tuple[str, str, str]]] = None

    # Catalog and output handling.
    populate_catalog: bool = False
    preserve_original: bool = True
    id_column: Optional[str] = None

    # Point-in-time validation.
    enforce_point_in_time: bool = True
    feature_timestamp_column: Optional[str] = None
98
+
99
+
100
@dataclass
class FeatureEngineerResult:
    """
    Output bundle of the feature engineering pipeline.

    Attributes
    ----------
    df : DataFrame
        DataFrame holding the pipeline output.
    generated_features : List[str]
        Names of the generated features.
    feature_categories : Dict[str, List[str]]
        Generated feature names keyed by category.
    config : FeatureEngineerConfig
        Configuration associated with this result.
    pit_validation : Dict[str, Any], optional
        Point-in-time validation report; ``None`` by default.
    """

    df: DataFrame
    generated_features: List[str]
    feature_categories: Dict[str, List[str]]
    config: FeatureEngineerConfig
    pit_validation: Optional[Dict[str, Any]] = None
108
+
109
+
110
+ class FeatureEngineer:
111
+ """
112
+ Feature engineering pipeline that orchestrates feature generation.
113
+
114
+ This class combines temporal, behavioral, and interaction feature
115
+ generators into a single pipeline.
116
+
117
+ Parameters
118
+ ----------
119
+ config : FeatureEngineerConfig
120
+ Pipeline configuration.
121
+
122
+ Attributes
123
+ ----------
124
+ catalog : FeatureCatalog
125
+ Catalog of generated feature definitions.
126
+ generated_features : List[str]
127
+ List of all generated feature names.
128
+ """
129
+
130
def __init__(self, config: FeatureEngineerConfig):
    """
    Store the configuration and set up empty pipeline state.

    Parameters
    ----------
    config : FeatureEngineerConfig
        Pipeline configuration.
    """
    self.config = config
    self.catalog = FeatureCatalog()
    self.generated_features: List[str] = []
    # One (initially empty) bucket of feature names per category.
    self._feature_categories: Dict[str, List[str]] = {
        category: [] for category in ("temporal", "behavioral", "interaction")
    }
    self._is_fitted = False

    # Generators are built last because they read the stored config.
    self._init_generators()
143
+
144
def _init_generators(self) -> None:
    """
    Build the feature generators selected by the configuration.

    Each ``_*_generator`` attribute is set to a generator instance when
    its switch is enabled, and to ``None`` otherwise. The temporal
    generator additionally requires a truthy ``reference_date``.
    """
    cfg = self.config

    # Temporal generator
    temporal = None
    if cfg.generate_temporal and cfg.reference_date:
        temporal = TemporalFeatureGenerator(
            reference_date=cfg.reference_date,
            reference_date_source=ReferenceDateSource.CONFIG,
            created_column=cfg.created_column,
            first_order_column=cfg.first_order_column,
            last_order_column=cfg.last_order_column,
        )
    self._temporal_generator = temporal

    # Behavioral generator
    behavioral = None
    if cfg.generate_behavioral:
        behavioral = BehavioralFeatureGenerator(
            tenure_months_column=cfg.tenure_months_column,
            total_orders_column=cfg.total_orders_column,
            emails_sent_column=cfg.emails_sent_column,
            open_rate_column=cfg.open_rate_column,
            click_rate_column=cfg.click_rate_column,
            service_columns=cfg.service_columns,
        )
    self._behavioral_generator = behavioral

    # Interaction generator
    interaction = None
    if cfg.generate_interaction:
        interaction = InteractionFeatureGenerator(
            combinations=cfg.interaction_combinations,
            ratios=cfg.interaction_ratios,
        )
    self._interaction_generator = interaction
179
+
180
def fit(self, df: DataFrame) -> "FeatureEngineer":
    """
    Fit every active feature generator on the input data.

    Parameters
    ----------
    df : DataFrame
        Input DataFrame.

    Returns
    -------
    self
        The fitted pipeline, to allow call chaining.
    """
    active_generators = (
        self._temporal_generator,
        self._behavioral_generator,
        self._interaction_generator,
    )
    for generator in active_generators:
        # Disabled generators are stored as None and skipped.
        if generator:
            generator.fit(df)

    self._is_fitted = True
    return self
202
+
203
def transform(self, df: DataFrame) -> FeatureEngineerResult:
    """
    Generate features for the input DataFrame.

    The input is copied first, then the temporal, behavioral and
    interaction generators are applied in that order (interaction
    features need the previously computed features).

    Parameters
    ----------
    df : DataFrame
        Input DataFrame.

    Returns
    -------
    FeatureEngineerResult
        Result containing DataFrame with features and metadata. The
        feature-name lists in the result are independent copies, so
        mutating them does not affect the pipeline or its generators.

    Raises
    ------
    ValueError
        If the pipeline has not been fitted.
    """
    if not self._is_fitted:
        raise ValueError("FeatureEngineer not fitted. Call fit() first.")

    result_df = df.copy()
    # Reset the bookkeeping so repeated calls do not accumulate names.
    self.generated_features = []
    self._feature_categories = {
        "temporal": [],
        "behavioral": [],
        "interaction": [],
    }
    pit_validation = None

    # Run point-in-time validation if enabled
    if self.config.enforce_point_in_time:
        pit_validation = self._validate_point_in_time(result_df)

    # Apply temporal features
    if self._temporal_generator:
        result_df = self._temporal_generator.transform(result_df)
        # Snapshot the names: storing the generator's own list would let
        # later mutations leak between generator, pipeline and result.
        temporal_features = list(self._temporal_generator.generated_features)
        self.generated_features.extend(temporal_features)
        self._feature_categories["temporal"] = temporal_features
        if self.config.populate_catalog:
            self._add_temporal_definitions(temporal_features)

    # Apply behavioral features
    if self._behavioral_generator:
        result_df = self._behavioral_generator.transform(result_df)
        behavioral_features = list(self._behavioral_generator.generated_features)
        self.generated_features.extend(behavioral_features)
        self._feature_categories["behavioral"] = behavioral_features
        if self.config.populate_catalog:
            self._add_behavioral_definitions(behavioral_features)

    # Apply interaction features (needs computed features)
    if self._interaction_generator:
        result_df = self._interaction_generator.transform(result_df)
        interaction_features = list(self._interaction_generator.generated_features)
        self.generated_features.extend(interaction_features)
        self._feature_categories["interaction"] = interaction_features

    return FeatureEngineerResult(
        df=result_df,
        generated_features=list(self.generated_features),
        # dict.copy() alone is shallow; copy each list as well.
        feature_categories={
            category: list(names)
            for category, names in self._feature_categories.items()
        },
        config=self.config,
        pit_validation=pit_validation,
    )
265
+
266
def fit_transform(self, df: DataFrame) -> FeatureEngineerResult:
    """
    Fit the engineer on ``df`` and then transform that same frame.

    Parameters
    ----------
    df : DataFrame
        Input DataFrame; it is handed to ``fit`` first and to
        ``transform`` second.

    Returns
    -------
    FeatureEngineerResult
        The value returned by ``transform(df)`` after fitting.
    """
    # Whatever fit() returns is ignored; only its effect on the instance
    # is relied upon before transforming.
    self.fit(df)
    outcome = self.transform(df)
    return outcome
282
+
283
+ def _validate_point_in_time(self, df: DataFrame) -> Dict[str, Any]:
284
+ """
285
+ Validate point-in-time correctness of the DataFrame.
286
+
287
+ Returns validation report with any issues found.
288
+ """
289
+ ts_col = self.config.feature_timestamp_column or "feature_timestamp"
290
+
291
+ if ts_col not in df.columns:
292
+ return {"validated": False, "reason": f"No {ts_col} column found"}
293
+
294
+ report = PointInTimeJoiner.validate_temporal_integrity(df)
295
+ datetime_cols = df.select_dtypes(include=["datetime64"]).columns.tolist()
296
+ future_issues = PointInTimeJoiner.validate_no_future_data(
297
+ df, ts_col, [c for c in datetime_cols if c != ts_col]
298
+ )
299
+
300
+ report["future_data_issues"] = future_issues
301
+ report["validated"] = True
302
+ return report
303
+
304
def _add_temporal_definitions(self, features: List[str]) -> None:
    """Register catalog entries for the generated temporal features.

    Parameters
    ----------
    features : List[str]
        Names of generated temporal features. A name with no definition
        below is skipped; every matching name is added to ``self.catalog``
        with ``overwrite=True``.
    """
    cfg = self.config
    # Resolve the configured source columns once, with their fallbacks.
    created_col = cfg.created_column or "created"
    last_order_col = cfg.last_order_column or "lastorder"
    first_order_col = cfg.first_order_column or "firstorder"

    known = {}

    def define(name, description, derivation, source_columns, business_meaning,
               leakage_risk=LeakageRisk.LOW):
        # All entries here share the temporal category and the float data type.
        known[name] = FeatureDefinition(
            name=name,
            description=description,
            category=FeatureCategory.TEMPORAL,
            derivation=derivation,
            source_columns=source_columns,
            data_type="float",
            business_meaning=business_meaning,
            leakage_risk=leakage_risk,
        )

    define(
        "tenure_days",
        "Customer lifetime in days since account creation",
        "reference_date - created_date",
        [created_col],
        "How long customer has been with us",
    )
    define(
        "account_age_months",
        "Customer tenure in months",
        "tenure_days / 30.44",
        ["tenure_days"],
        "Customer tenure normalized to months",
    )
    # The only entry in this table declared with MEDIUM leakage risk.
    define(
        "days_since_last_order",
        "Days between reference date and last order",
        "reference_date - last_order_date",
        [last_order_col],
        "Customer recency - higher values indicate dormant customers",
        leakage_risk=LeakageRisk.MEDIUM,
    )
    define(
        "days_to_first_order",
        "Days between account creation and first order",
        "first_order_date - created_date",
        [created_col, first_order_col],
        "Activation time - how quickly customer made first purchase",
    )
    define(
        "active_period_days",
        "Days between first and last order",
        "last_order_date - first_order_date",
        [first_order_col, last_order_col],
        "Active purchasing span",
    )

    for requested in features:
        if requested not in known:
            continue
        self.catalog.add(known[requested], overwrite=True)
368
+
369
def _add_behavioral_definitions(self, features: List[str]) -> None:
    """Register catalog entries for the generated behavioral features.

    Parameters
    ----------
    features : List[str]
        Names of generated behavioral features. A name with no definition
        below is skipped; every matching name is added to ``self.catalog``
        with ``overwrite=True``.
    """
    cfg = self.config
    # Email rate columns, falling back to the default column names.
    open_col = cfg.open_rate_column or "eopenrate"
    click_col = cfg.click_rate_column or "eclickrate"

    known = {}

    def define(name, description, category, derivation, source_columns, business_meaning):
        # Every behavioral entry is a float feature declared with LOW leakage risk.
        known[name] = FeatureDefinition(
            name=name,
            description=description,
            category=category,
            derivation=derivation,
            source_columns=source_columns,
            data_type="float",
            business_meaning=business_meaning,
            leakage_risk=LeakageRisk.LOW,
        )

    define(
        "email_engagement_score",
        "Combined email engagement metric",
        FeatureCategory.ENGAGEMENT,
        "(open_rate + click_rate) / 2",
        [open_col, click_col],
        "Overall email engagement level",
    )
    define(
        "click_to_open_rate",
        "Click rate relative to open rate",
        FeatureCategory.ENGAGEMENT,
        "click_rate / open_rate",
        [open_col, click_col],
        "Email quality - how engaging emails are to openers",
    )
    # service_columns is read separately for each entry so that the empty
    # fallback list is a distinct object per definition.
    define(
        "service_adoption_score",
        "Count of services adopted",
        FeatureCategory.ADOPTION,
        "sum(service_flags)",
        cfg.service_columns or [],
        "Customer investment in platform services",
    )
    define(
        "service_adoption_pct",
        "Percentage of available services adopted",
        FeatureCategory.ADOPTION,
        "services_used / total_services",
        cfg.service_columns or [],
        "Relative service adoption level",
    )

    for requested in features:
        if requested not in known:
            continue
        self.catalog.add(known[requested], overwrite=True)
423
+
424
def to_feature_registry(self) -> "FeatureRegistry":
    """Convert generated features to a FeatureRegistry for the feature store.

    Every catalog entry becomes a ``TemporalFeatureDefinition``; generated
    features that have no catalog entry are registered afterwards as
    passthrough features with placeholder metadata.

    The exported ``leakage_risk`` is the stricter of the risk implied by
    the entry's category and the risk declared on the catalog entry
    itself, so an entry explicitly declared with a higher risk than its
    category default is not downgraded on export.

    Returns
    -------
    FeatureRegistry
        Registry containing all generated features
    """
    from customer_retention.integrations.feature_store import (
        FeatureComputationType,
        FeatureRegistry,
        TemporalFeatureDefinition,
    )

    registry = FeatureRegistry()
    entity_key = self.config.id_column or "entity_id"
    timestamp_col = self.config.feature_timestamp_column or "feature_timestamp"

    # Default leakage risk per feature category.
    category_to_risk = {
        FeatureCategory.TEMPORAL: "low",
        FeatureCategory.BEHAVIORAL: "low",
        FeatureCategory.ENGAGEMENT: "low",
        FeatureCategory.ADOPTION: "low",
        FeatureCategory.DEMOGRAPHIC: "low",
        FeatureCategory.AGGREGATE: "low",
        FeatureCategory.RATIO: "low",
        FeatureCategory.TREND: "medium",
        FeatureCategory.INTERACTION: "low",
        FeatureCategory.MONETARY: "low",
    }
    # Ordering used to pick the more cautious of two risk labels.
    risk_rank = {"low": 0, "medium": 1, "high": 2}

    # Convert catalog entries to temporal feature definitions
    for name in self.catalog.list_names():
        old_def = self.catalog.get(name)
        if old_def is None:
            continue

        # Determine computation type: name-based hints win over category.
        lowered = name.lower()
        if "interaction" in lowered or "_x_" in name:
            comp_type = FeatureComputationType.INTERACTION
        elif "ratio" in lowered or "_per_" in name:
            comp_type = FeatureComputationType.RATIO
        elif old_def.category in {FeatureCategory.AGGREGATE, FeatureCategory.TREND}:
            comp_type = FeatureComputationType.AGGREGATION
        else:
            comp_type = FeatureComputationType.DERIVED

        # For DERIVED type, we need a formula - fall back to PASSTHROUGH if none
        if comp_type == FeatureComputationType.DERIVED and not old_def.derivation:
            comp_type = FeatureComputationType.PASSTHROUGH

        # Start from the category default, then honour a stricter risk
        # declared on the catalog entry. Anything that does not normalise
        # to low/medium/high ranks below every known label, so the
        # category default is kept unchanged in that case.
        leakage_risk = category_to_risk.get(old_def.category, "low")
        declared = getattr(old_def, "leakage_risk", None)
        declared_risk = str(getattr(declared, "value", declared)).lower()
        if risk_rank.get(declared_risk, -1) > risk_rank.get(leakage_risk, -1):
            leakage_risk = declared_risk

        registry.register(TemporalFeatureDefinition(
            name=old_def.name,
            description=old_def.description,
            entity_key=entity_key,
            timestamp_column=timestamp_col,
            source_columns=old_def.source_columns,
            computation_type=comp_type,
            derivation_formula=old_def.derivation if comp_type == FeatureComputationType.DERIVED else None,
            data_type=old_def.data_type,
            leakage_risk=leakage_risk,
            leakage_notes=f"Category: {old_def.category.value}",
        ))

    # Add any generated features not in catalog
    for feature_name in self.generated_features:
        if feature_name not in registry:
            registry.register(TemporalFeatureDefinition(
                name=feature_name,
                description=f"Generated feature: {feature_name}",
                entity_key=entity_key,
                timestamp_column=timestamp_col,
                source_columns=[],
                computation_type=FeatureComputationType.PASSTHROUGH,
                data_type="float64",
                leakage_risk="low",
            ))

    return registry
507
+
508
def publish_to_feature_store(
    self,
    df: DataFrame,
    table_name: str = "customer_features",
    backend: str = "feast",
    repo_path: str = "./feature_store/feature_repo",
) -> str:
    """Publish a feature frame through a FeatureStoreManager.

    A registry is built with ``to_feature_registry()``, a manager is
    created for the requested backend, and the frame is published under
    the configured entity key and timestamp column.

    Parameters
    ----------
    df : DataFrame
        DataFrame with features to publish
    table_name : str
        Name of the feature table
    backend : str
        Feature store backend ("feast" or "databricks")
    repo_path : str
        Path to feature store repo (for Feast)

    Returns
    -------
    str
        Name of the created feature table
    """
    from customer_retention.integrations.feature_store import FeatureStoreManager

    # The registry is built before the manager is created, in that order.
    feature_registry = self.to_feature_registry()
    store = FeatureStoreManager.create(backend=backend, repo_path=repo_path)

    cfg = self.config
    publish_kwargs = {
        "df": df,
        "registry": feature_registry,
        "table_name": table_name,
        # Fall back to the default names when the config leaves these unset.
        "entity_key": cfg.id_column or "entity_id",
        "timestamp_column": cfg.feature_timestamp_column or "feature_timestamp",
    }
    return store.publish_features(**publish_kwargs)