churnkit-0.75.0a1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,160 @@
+ """
+ Interaction feature generation for customer retention analysis.
+
+ This module provides feature combinations and ratio calculations
+ from existing features.
+ """
+
+ from dataclasses import dataclass, field
+ from typing import List, Optional, Tuple
+
+ import numpy as np
+
+ from customer_retention.core.compat import DataFrame, Series
+
+
+ @dataclass
+ class InteractionFeatureResult:
+     """Result of interaction feature generation."""
+     df: DataFrame
+     generated_features: List[str]
+     skipped_combinations: List[str] = field(default_factory=list)
+
+
+ class InteractionFeatureGenerator:
+     """
+     Generates interaction features from combinations of existing features.
+
+     Interaction features are derived by combining two existing features
+     using a mathematical operation (multiply, divide, add, subtract).
+
+     Parameters
+     ----------
+     combinations : List[Tuple[str, str, str, str]], optional
+         List of feature combinations to create.
+         Each tuple contains (col1, col2, output_name, operation).
+         Supported operations: "multiply", "divide", "add", "subtract"
+     ratios : List[Tuple[str, str, str]], optional
+         List of ratio features to create.
+         Each tuple contains (numerator, denominator, output_name).
+
+     Attributes
+     ----------
+     generated_features : List[str]
+         Names of features generated during the last transform.
+     """
+
+     def __init__(
+         self,
+         combinations: Optional[List[Tuple[str, str, str, str]]] = None,
+         ratios: Optional[List[Tuple[str, str, str]]] = None,
+     ):
+         self.combinations = combinations or []
+         self.ratios = ratios or []
+         self.generated_features: List[str] = []
+         self._is_fitted = False
+
+     def fit(self, df: DataFrame) -> "InteractionFeatureGenerator":
+         """
+         Fit the generator (currently only marks the generator as fitted).
+
+         Parameters
+         ----------
+         df : DataFrame
+             Input DataFrame.
+
+         Returns
+         -------
+         self
+         """
+         self._is_fitted = True
+         return self
+
+     def transform(self, df: DataFrame) -> DataFrame:
+         """
+         Generate interaction features for the input DataFrame.
+
+         Parameters
+         ----------
+         df : DataFrame
+             Input DataFrame.
+
+         Returns
+         -------
+         DataFrame
+             DataFrame with interaction features added.
+         """
+         if not self._is_fitted:
+             raise ValueError("Generator not fitted. Call fit() first.")
+
+         result = df.copy()
+         self.generated_features = []
+
+         # Process combinations
+         for combo in self.combinations:
+             col1, col2, output_name, operation = combo
+             if col1 in df.columns and col2 in df.columns:
+                 result[output_name] = self._apply_operation(
+                     df[col1], df[col2], operation
+                 )
+                 self.generated_features.append(output_name)
+
+         # Process ratios
+         for ratio in self.ratios:
+             numerator, denominator, output_name = ratio
+             if numerator in df.columns and denominator in df.columns:
+                 result[output_name] = self._safe_divide(
+                     df[numerator], df[denominator]
+                 )
+                 self.generated_features.append(output_name)
+
+         return result
+
+     def fit_transform(self, df: DataFrame) -> DataFrame:
+         """
+         Fit and transform in one step.
+
+         Parameters
+         ----------
+         df : DataFrame
+             Input DataFrame.
+
+         Returns
+         -------
+         DataFrame
+             DataFrame with interaction features added.
+         """
+         self.fit(df)
+         return self.transform(df)
+
+     def _apply_operation(
+         self,
+         col1: Series,
+         col2: Series,
+         operation: str
+     ) -> Series:
+         """Apply the specified operation to two columns."""
+         if operation == "multiply":
+             return col1 * col2
+         elif operation == "divide":
+             return self._safe_divide(col1, col2)
+         elif operation == "add":
+             return col1 + col2
+         elif operation == "subtract":
+             return col1 - col2
+         else:
+             raise ValueError(f"Unknown operation: {operation}")
+
+     def _safe_divide(
+         self,
+         numerator: Series,
+         denominator: Series
+     ) -> Series:
+         """
+         Safely divide two series, handling division by zero.
+
+         Returns NaN where the denominator is zero or null.
+         """
+         # Replace zeros with NaN to avoid inf
+         safe_denominator = denominator.replace(0, np.nan)
+         return numerator / safe_denominator
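The generator above follows a fit/transform contract but holds no state beyond the fitted flag, so it can be exercised directly. A minimal usage sketch, assuming the compat DataFrame and Series resolve to plain pandas objects; the column and output names below are illustrative, not taken from the package:

import pandas as pd

from customer_retention.stages.features.interaction_features import InteractionFeatureGenerator

# Illustrative customer-level frame; any numeric columns work.
df = pd.DataFrame({
    "monthly_spend": [20.0, 45.0, 0.0],
    "order_count": [2, 9, 0],
    "tenure_days": [30, 365, 10],
})

gen = InteractionFeatureGenerator(
    combinations=[("monthly_spend", "order_count", "spend_x_orders", "multiply")],
    ratios=[("monthly_spend", "tenure_days", "spend_per_tenure_day")],
)
out = gen.fit_transform(df)
print(gen.generated_features)  # ['spend_x_orders', 'spend_per_tenure_day']

Ratios route through _safe_divide, so rows with a zero or null denominator come back as NaN rather than inf, and combinations whose columns are missing are skipped silently rather than raising.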
@@ -0,0 +1,243 @@
+ """
+ Temporal feature generation for customer retention analysis.
+
+ This module provides temporal feature calculations such as tenure,
+ recency, activation time, and active period.
+ """
+
+ import warnings
+ from dataclasses import dataclass, field
+ from enum import Enum
+ from typing import List, Optional, Union
+
+ from customer_retention.core.compat import DataFrame, Series, Timedelta, Timestamp, pd
+
+
+ class ReferenceDateSource(Enum):
+     """Source for the reference date used in temporal calculations."""
+     CONFIG = "config"
+     MAX_DATE = "max_date"
+     COLUMN = "column"
+     FEATURE_TIMESTAMP = "feature_timestamp"
+
+
+ @dataclass
+ class TemporalFeatureResult:
+     """Result of temporal feature generation."""
+     df: DataFrame
+     reference_date: Union[Timestamp, Series]
+     generated_features: List[str]
+     warnings: List[str] = field(default_factory=list)
+
+
+ class TemporalFeatureGenerator:
+     """
+     Generates temporal features from datetime columns.
+
+     Temporal features are calculated relative to a reference date, which can
+     be specified explicitly, derived from the data, or taken per row from a column.
+
+     Parameters
+     ----------
+     reference_date : Timestamp, optional
+         Explicit reference date for calculations. Used when reference_date_source
+         is CONFIG.
+     reference_date_source : ReferenceDateSource, default CONFIG
+         How to determine the reference date:
+         - CONFIG: Use the explicit reference_date parameter
+         - MAX_DATE: Use the maximum date in date_column
+         - COLUMN: Use per-row dates from reference_date_column
+     reference_date_column : str, optional
+         Column name for per-row reference dates. Required when source is COLUMN.
+     date_column : str, optional
+         Column used to determine the max date when source is MAX_DATE.
+     created_column : str, default "created"
+         Column containing the customer account creation date.
+     first_order_column : str, optional
+         Column containing the date of the first order.
+     last_order_column : str, optional
+         Column containing the date of the last order.
+
+     Attributes
+     ----------
+     reference_date : Timestamp or Series
+         The reference date(s) used for calculations after fitting.
+     generated_features : List[str]
+         Names of features generated during the last transform.
+     """
+
+     def __init__(
+         self,
+         reference_date: Optional[Timestamp] = None,
+         reference_date_source: ReferenceDateSource = ReferenceDateSource.CONFIG,
+         reference_date_column: Optional[str] = None,
+         date_column: Optional[str] = None,
+         created_column: str = "created",
+         first_order_column: Optional[str] = None,
+         last_order_column: Optional[str] = None,
+     ):
+         self._reference_date_param = reference_date
+         self.reference_date_source = reference_date_source
+         self.reference_date_column = reference_date_column
+         self.date_column = date_column
+         self.created_column = created_column
+         self.first_order_column = first_order_column
+         self.last_order_column = last_order_column
+
+         self.reference_date: Optional[Union[Timestamp, Series]] = None
+         self.generated_features: List[str] = []
+         self._is_fitted = False
+
+     def fit(self, df: DataFrame) -> "TemporalFeatureGenerator":
+         """
+         Fit the generator by determining the reference date.
+
+         Parameters
+         ----------
+         df : DataFrame
+             Input DataFrame containing datetime columns.
+
+         Returns
+         -------
+         self
+         """
+         self._determine_reference_date(df)
+         self._is_fitted = True
+         return self
+
+     def transform(self, df: DataFrame) -> DataFrame:
+         """
+         Generate temporal features for the input DataFrame.
+
+         Parameters
+         ----------
+         df : DataFrame
+             Input DataFrame containing datetime columns.
+
+         Returns
+         -------
+         DataFrame
+             DataFrame with temporal features added.
+         """
+         if not self._is_fitted:
+             raise ValueError("Generator not fitted. Call fit() first.")
+
+         result = df.copy()
+         self.generated_features = []
+         warnings_list = []
+
+         # Get reference date(s) for this transform
+         if self.reference_date_source in [ReferenceDateSource.COLUMN, ReferenceDateSource.FEATURE_TIMESTAMP]:
+             ref_dates = pd.to_datetime(df[self.reference_date_column], format='mixed')
+         else:
+             ref_dates = self.reference_date
+
+         # Tenure features
+         if self.created_column and self.created_column in df.columns:
+             created = pd.to_datetime(df[self.created_column], format='mixed')
+             tenure_days = self._compute_days_diff(ref_dates, created)
+             result["tenure_days"] = tenure_days
+             self.generated_features.append("tenure_days")
+
+             # Check for negative values
+             if (tenure_days < 0).any():
+                 warnings.warn(
+                     "Negative tenure_days detected. Reference date may be before "
+                     "some created dates.",
+                     UserWarning
+                 )
+                 warnings_list.append("negative_tenure_days")
+
+             # Account age in months
+             result["account_age_months"] = tenure_days / 30.44
+             self.generated_features.append("account_age_months")
+
+         # Recency features
+         if self.last_order_column and self.last_order_column in df.columns:
+             last_order = pd.to_datetime(df[self.last_order_column], format='mixed')
+             days_since_last = self._compute_days_diff(ref_dates, last_order)
+             result["days_since_last_order"] = days_since_last
+             self.generated_features.append("days_since_last_order")
+
+         # Activation features
+         if (self.first_order_column and self.first_order_column in df.columns and
+                 self.created_column and self.created_column in df.columns):
+             created = pd.to_datetime(df[self.created_column], format='mixed')
+             first_order = pd.to_datetime(df[self.first_order_column], format='mixed')
+             days_to_first = self._compute_days_diff(first_order, created)
+             result["days_to_first_order"] = days_to_first
+             self.generated_features.append("days_to_first_order")
+
+         # Active period
+         if (self.first_order_column and self.first_order_column in df.columns and
+                 self.last_order_column and self.last_order_column in df.columns):
+             first_order = pd.to_datetime(df[self.first_order_column], format='mixed')
+             last_order = pd.to_datetime(df[self.last_order_column], format='mixed')
+             active_period = self._compute_days_diff(last_order, first_order)
+             result["active_period_days"] = active_period
+             self.generated_features.append("active_period_days")
+
+         return result
+
+     def fit_transform(self, df: DataFrame) -> DataFrame:
+         """
+         Fit and transform in one step.
+
+         Parameters
+         ----------
+         df : DataFrame
+             Input DataFrame containing datetime columns.
+
+         Returns
+         -------
+         DataFrame
+             DataFrame with temporal features added.
+         """
+         self.fit(df)
+         return self.transform(df)
+
+     def _determine_reference_date(self, df: DataFrame) -> None:
+         """Determine the reference date based on configuration."""
+         if self.reference_date_source == ReferenceDateSource.CONFIG:
+             if self._reference_date_param is None:
+                 raise ValueError(
+                     "reference_date must be provided when source is CONFIG"
+                 )
+             self.reference_date = self._reference_date_param
+
+         elif self.reference_date_source == ReferenceDateSource.MAX_DATE:
+             if self.date_column is None:
+                 raise ValueError(
+                     "date_column must be provided when source is MAX_DATE"
+                 )
+             self.reference_date = pd.to_datetime(df[self.date_column], format='mixed').max()
+
+         elif self.reference_date_source == ReferenceDateSource.COLUMN:
+             if self.reference_date_column is None:
+                 raise ValueError(
+                     "reference_date_column must be provided when source is COLUMN"
+                 )
+             self.reference_date = pd.to_datetime(df[self.reference_date_column], format='mixed')
+
+         elif self.reference_date_source == ReferenceDateSource.FEATURE_TIMESTAMP:
+             if "feature_timestamp" not in df.columns:
+                 raise ValueError(
+                     "feature_timestamp column required when source is FEATURE_TIMESTAMP"
+                 )
+             self.reference_date = pd.to_datetime(df["feature_timestamp"], format='mixed')
+             self.reference_date_column = "feature_timestamp"
+
+     def _compute_days_diff(
+         self,
+         later: Union[Timestamp, Series],
+         earlier: Union[Timestamp, Series]
+     ) -> Series:
+         """
+         Compute the difference in days between two dates.
+
+         Handles both scalar and Series inputs, preserving NaN values.
+         """
+         diff = later - earlier
+         if isinstance(diff, Timedelta):
+             return pd.Series([diff.days])
+         return diff.dt.days
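A sketch of TemporalFeatureGenerator in MAX_DATE mode, again assuming a pandas backend (the format='mixed' calls above also imply pandas 2.0 or newer); the column names are illustrative:

import pandas as pd

from customer_retention.stages.features.temporal_features import (
    ReferenceDateSource,
    TemporalFeatureGenerator,
)

df = pd.DataFrame({
    "created": ["2023-01-01", "2023-06-15"],
    "first_order": ["2023-01-10", "2023-07-01"],
    "last_order": ["2023-12-01", "2023-12-20"],
})

gen = TemporalFeatureGenerator(
    reference_date_source=ReferenceDateSource.MAX_DATE,
    date_column="last_order",
    created_column="created",
    first_order_column="first_order",
    last_order_column="last_order",
)
out = gen.fit_transform(df)
# Adds tenure_days, account_age_months, days_since_last_order,
# days_to_first_order and active_period_days, all relative to 2023-12-20.

With ReferenceDateSource.FEATURE_TIMESTAMP the same features are computed per row against a feature_timestamp column instead of a single global reference date.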
@@ -0,0 +1,9 @@
+ from .load_result import LoadResult
+ from .loaders import CSVLoader, DataLoader, DeltaLoader, LoaderFactory, ParquetLoader
+ from .source_registry import DataSourceRegistry
+
+ __all__ = [
+     "LoadResult",
+     "DataLoader", "CSVLoader", "ParquetLoader", "DeltaLoader", "LoaderFactory",
+     "DataSourceRegistry"
+ ]
@@ -0,0 +1,32 @@
+ from pydantic import BaseModel
+
+
+ class LoadResult(BaseModel):
+     success: bool
+     row_count: int
+     column_count: int
+     duration_seconds: float
+     source_name: str
+     warnings: list[str] = []
+     errors: list[str] = []
+     schema_info: dict[str, str] = {}
+
+     def has_warnings(self) -> bool:
+         return len(self.warnings) > 0
+
+     def has_errors(self) -> bool:
+         return len(self.errors) > 0
+
+     def add_warning(self, message: str) -> None:
+         self.warnings.append(message)
+
+     def add_error(self, message: str) -> None:
+         self.errors.append(message)
+
+     def get_summary(self) -> str:
+         status = "SUCCESS" if self.success else "FAILED"
+         return (
+             f"{status}: {self.source_name} - "
+             f"{self.row_count} rows, {self.column_count} columns "
+             f"({self.duration_seconds:.2f}s)"
+         )
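LoadResult is a plain Pydantic model, so it can also be constructed directly, for example in tests of downstream code (the values here are made up):

from customer_retention.stages.ingestion.load_result import LoadResult

result = LoadResult(
    success=True,
    row_count=10000,
    column_count=24,
    duration_seconds=1.7,
    source_name="customers_csv",
)
result.add_warning("3 columns have >20% missing values")
print(result.get_summary())   # SUCCESS: customers_csv - 10000 rows, 24 columns (1.70s)
print(result.has_warnings())  # True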
@@ -0,0 +1,195 @@
+ import time
+ from abc import ABC, abstractmethod
+ from typing import Any, Optional
+
+ from customer_retention.core.compat import DataFrame, pd
+ from customer_retention.core.config.source_config import DataSourceConfig, FileFormat, SourceType
+
+ from .load_result import LoadResult
+
+
+ class DataLoader(ABC):
+     @abstractmethod
+     def load(self, config: DataSourceConfig, spark_session: Optional[Any] = None,
+              sample_size: Optional[int] = None) -> tuple[DataFrame, LoadResult]:
+         pass
+
+     @abstractmethod
+     def validate_source(self, config: DataSourceConfig) -> list[str]:
+         pass
+
+     def create_load_result(self, config: DataSourceConfig, df: DataFrame,
+                            duration: float, success: bool = True) -> LoadResult:
+         return LoadResult(
+             success=success,
+             row_count=len(df),
+             column_count=len(df.columns),
+             duration_seconds=duration,
+             source_name=config.name,
+             schema_info={col: str(dtype) for col, dtype in df.dtypes.items()}
+         )
+
+     def apply_sample(self, df: DataFrame, sample_size: Optional[int]) -> DataFrame:
+         return df.head(sample_size) if sample_size else df
+
+
+ class CSVLoader(DataLoader):
+     def validate_source(self, config: DataSourceConfig) -> list[str]:
+         errors = []
+         if not config.path:
+             errors.append("CSV source requires path")
+         if config.file_format != FileFormat.CSV:
+             errors.append(f"CSVLoader requires CSV format, got {config.file_format}")
+         return errors
+
+     def load(self, config: DataSourceConfig, spark_session: Optional[Any] = None,
+              sample_size: Optional[int] = None) -> tuple[DataFrame, LoadResult]:
+         start_time = time.time()
+         result = LoadResult(success=False, row_count=0, column_count=0,
+                             duration_seconds=0, source_name=config.name)
+
+         try:
+             validation_errors = self.validate_source(config)
+             if validation_errors:
+                 result.errors.extend(validation_errors)
+                 result.duration_seconds = time.time() - start_time
+                 return pd.DataFrame(), result
+
+             read_kwargs = self.build_read_kwargs(config, sample_size)
+             df = pd.read_csv(config.path, **read_kwargs)
+
+             if sample_size and len(df) > sample_size:
+                 df = df.head(sample_size)
+
+             duration = time.time() - start_time
+             result = self.create_load_result(config, df, duration)
+             return df, result
+
+         except Exception as e:
+             result.add_error(f"Failed to load CSV: {str(e)}")
+             result.duration_seconds = time.time() - start_time
+             return pd.DataFrame(), result
+
+     def build_read_kwargs(self, config: DataSourceConfig, sample_size: Optional[int]) -> dict:
+         kwargs = {
+             "delimiter": config.delimiter,
+             "header": 0 if config.header else None,
+             "quotechar": config.quote_char,
+             "encoding": config.encoding
+         }
+         if sample_size:
+             kwargs["nrows"] = sample_size
+         return kwargs
+
+
+ class ParquetLoader(DataLoader):
+     def validate_source(self, config: DataSourceConfig) -> list[str]:
+         errors = []
+         if not config.path:
+             errors.append("Parquet source requires path")
+         if config.file_format != FileFormat.PARQUET:
+             errors.append(f"ParquetLoader requires PARQUET format, got {config.file_format}")
+         return errors
+
+     def load(self, config: DataSourceConfig, spark_session: Optional[Any] = None,
+              sample_size: Optional[int] = None) -> tuple[DataFrame, LoadResult]:
+         start_time = time.time()
+         result = LoadResult(success=False, row_count=0, column_count=0,
+                             duration_seconds=0, source_name=config.name)
+
+         try:
+             validation_errors = self.validate_source(config)
+             if validation_errors:
+                 result.errors.extend(validation_errors)
+                 result.duration_seconds = time.time() - start_time
+                 return pd.DataFrame(), result
+
+             df = pd.read_parquet(config.path)
+             df = self.apply_sample(df, sample_size)
+
+             duration = time.time() - start_time
+             result = self.create_load_result(config, df, duration)
+             return df, result
+
+         except Exception as e:
+             result.add_error(f"Failed to load Parquet: {str(e)}")
+             result.duration_seconds = time.time() - start_time
+             return pd.DataFrame(), result
+
+
+ class DeltaLoader(DataLoader):
+     def validate_source(self, config: DataSourceConfig) -> list[str]:
+         errors = []
+         if config.source_type == SourceType.BATCH_FILE and not config.path:
+             errors.append("Delta file source requires path")
+         if config.source_type == SourceType.BATCH_TABLE and not config.table:
+             errors.append("Delta table source requires table name")
+         if config.file_format != FileFormat.DELTA:
+             errors.append(f"DeltaLoader requires DELTA format, got {config.file_format}")
+         return errors
+
+     def load(self, config: DataSourceConfig, spark_session: Optional[Any] = None,
+              sample_size: Optional[int] = None) -> tuple[DataFrame, LoadResult]:
+         start_time = time.time()
+         result = LoadResult(success=False, row_count=0, column_count=0,
+                             duration_seconds=0, source_name=config.name)
+
+         try:
+             validation_errors = self.validate_source(config)
+             if validation_errors:
+                 result.errors.extend(validation_errors)
+                 result.duration_seconds = time.time() - start_time
+                 return pd.DataFrame(), result
+
+             if not spark_session:
+                 result.add_error("Delta loader requires Spark session")
+                 result.duration_seconds = time.time() - start_time
+                 return pd.DataFrame(), result
+
+             spark_df = self.load_spark_dataframe(config, spark_session, sample_size)
+             df = spark_df.toPandas()
+
+             duration = time.time() - start_time
+             result = self.create_load_result(config, df, duration)
+             return df, result
+
+         except Exception as e:
+             result.add_error(f"Failed to load Delta: {str(e)}")
+             result.duration_seconds = time.time() - start_time
+             return pd.DataFrame(), result
+
+     def load_spark_dataframe(self, config: DataSourceConfig, spark_session: Any,
+                              sample_size: Optional[int]) -> Any:
+         if config.source_type == SourceType.BATCH_FILE:
+             spark_df = spark_session.read.format("delta").load(config.path)
+         else:
+             table_name = config.get_full_table_name()
+             spark_df = spark_session.read.table(table_name)
+
+         if sample_size:
+             spark_df = spark_df.limit(sample_size)
+
+         return spark_df
+
+
+ class LoaderFactory:
+     _loaders = {
+         FileFormat.CSV: CSVLoader,
+         FileFormat.PARQUET: ParquetLoader,
+         FileFormat.DELTA: DeltaLoader
+     }
+
+     @classmethod
+     def get_loader(cls, config: DataSourceConfig) -> DataLoader:
+         if not config.file_format:
+             raise ValueError(f"file_format required to determine loader for {config.name}")
+
+         loader_class = cls._loaders.get(config.file_format)
+         if not loader_class:
+             raise ValueError(f"No loader available for format: {config.file_format}")
+
+         return loader_class()
+
+     @classmethod
+     def register_loader(cls, file_format: FileFormat, loader_class: type[DataLoader]) -> None:
+         cls._loaders[file_format] = loader_class
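An end-to-end sketch of the loader factory. The DataSourceConfig keyword arguments shown (name, path, file_format) are assumptions inferred from the attributes the loaders read; the actual schema lives in customer_retention/core/config/source_config.py, which is not part of this hunk, and CSVLoader additionally expects delimiter, header, quote_char and encoding to carry defaults there:

from customer_retention.core.config.source_config import DataSourceConfig, FileFormat
from customer_retention.stages.ingestion.loaders import LoaderFactory

# Hypothetical config values; field names are assumed, not taken from this diff.
config = DataSourceConfig(
    name="customers_csv",
    path="data/customers.csv",
    file_format=FileFormat.CSV,
)

loader = LoaderFactory.get_loader(config)           # resolves to CSVLoader
df, result = loader.load(config, sample_size=1000)  # load() reports failures via result, not exceptions
if result.has_errors():
    raise RuntimeError(result.get_summary())

Custom formats can be plugged in with LoaderFactory.register_loader(file_format, loader_class), which swaps the entry in the class-level _loaders mapping.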