churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,340 @@
1
+ """
2
+ Feature manifest and versioning for customer retention analysis.
3
+
4
+ This module provides classes for tracking feature sets, manifests,
5
+ and registry for version management.
6
+ """
7
+
8
+ import hashlib
9
+ import uuid
10
+ from dataclasses import dataclass, field
11
+ from datetime import datetime
12
+ from typing import Any, Dict, List, Optional
13
+
14
+ from customer_retention.core.compat import DataFrame, pd
15
+
16
+
17
@dataclass
class FeatureManifest:
    """
    Manifest tracking the composition and provenance of a feature set.

    Attributes
    ----------
    manifest_id : str
        Unique identifier for this manifest.
    created_at : datetime
        When the manifest was created.
    features_included : List[str]
        List of feature names included.
    row_count : int
        Number of rows in the dataset.
    column_count : int
        Number of feature columns.
    checksum : str
        Data integrity hash (hex MD5 digest of the feature values).
    created_by : str, optional
        Who created the manifest.
    feature_table : str, optional
        Source table name.
    feature_table_version : int, optional
        Delta version number if applicable.
    features_excluded : List[str]
        Excluded features.
    feature_transformations : Dict[str, str]
        String-to-string mapping; presumably feature name -> description
        of the transformation applied (confirm against callers).
    metadata : Dict[str, Any]
        Free-form additional information.
    """
    manifest_id: str
    created_at: datetime
    features_included: List[str]
    row_count: int
    column_count: int
    checksum: str
    created_by: Optional[str] = None
    feature_table: Optional[str] = None
    feature_table_version: Optional[int] = None
    features_excluded: List[str] = field(default_factory=list)
    feature_transformations: Dict[str, str] = field(default_factory=dict)
    metadata: Dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_dataframe(
        cls,
        df: DataFrame,
        feature_columns: List[str],
        entity_column: Optional[str] = None,
        created_by: Optional[str] = None,
        feature_table: Optional[str] = None,
    ) -> "FeatureManifest":
        """
        Create a manifest from a DataFrame.

        Parameters
        ----------
        df : DataFrame
            Source DataFrame.
        feature_columns : List[str]
            Feature column names. Any non-string iterable of names
            (list, tuple, pandas Index) is accepted.
        entity_column : str, optional
            Entity/ID column name. Accepted for interface compatibility;
            it is not used when building the manifest.
        created_by : str, optional
            Creator name.
        feature_table : str, optional
            Source table name.

        Returns
        -------
        FeatureManifest
            New manifest instance with a freshly generated ``manifest_id``.

        Raises
        ------
        TypeError
            If ``feature_columns`` is a single string rather than a
            collection of column names.
        """
        if isinstance(feature_columns, str):
            # list("abc") would silently become ["a", "b", "c"].
            raise TypeError(
                "feature_columns must be a collection of column names, "
                "not a single string."
            )

        # Materialize once: the same list is used for column selection and
        # stored on the manifest, so the caller's object is never aliased.
        columns = list(feature_columns)

        checksum = cls._compute_checksum(df[columns].values)

        return cls(
            manifest_id=str(uuid.uuid4()),
            created_at=datetime.now(),
            created_by=created_by,
            feature_table=feature_table,
            features_included=columns,
            row_count=len(df),
            column_count=len(columns),
            checksum=checksum,
        )

    @staticmethod
    def _compute_checksum(data) -> str:
        """
        Compute the MD5 checksum of ``data``.

        ``data`` is wrapped in a fresh DataFrame and hashed with
        ``index=False``, so only the cell values contribute to the digest;
        original column labels and the index do not.
        """
        # MD5 serves as an integrity fingerprint here, not a security control.
        row_hashes = pd.util.hash_pandas_object(pd.DataFrame(data), index=False)
        return hashlib.md5(row_hashes.values.tobytes()).hexdigest()

    def to_dict(self) -> Dict[str, Any]:
        """Convert manifest to dictionary (``created_at`` as ISO-8601 text)."""
        return {
            "manifest_id": self.manifest_id,
            "created_at": self.created_at.isoformat(),
            "created_by": self.created_by,
            "feature_table": self.feature_table,
            "feature_table_version": self.feature_table_version,
            "features_included": self.features_included,
            "features_excluded": self.features_excluded,
            "feature_transformations": self.feature_transformations,
            "row_count": self.row_count,
            "column_count": self.column_count,
            "checksum": self.checksum,
            "metadata": self.metadata,
        }
131
+
132
+
133
@dataclass
class FeatureSet:
    """
    A named collection of features pinned to a specific version.

    Attributes
    ----------
    name : str
        Feature set name.
    version : str
        Version identifier (semver format).
    description : str
        Purpose of this feature set.
    features_included : List[str]
        Selected features.
    features_excluded : List[str]
        Dropped features.
    exclusion_reasons : Dict[str, str]
        Why each feature was dropped.
    created_at : datetime
        Creation timestamp; defaults to ``datetime.now()`` at construction.
    created_by : str, optional
        Creator.
    parent_feature_set : str, optional
        Set this one was derived from, if any.
    feature_table : str, optional
        Source table name.
    feature_table_version : int, optional
        Version of the source table.
    transformations : Dict[str, str]
        String-to-string mapping of transformations.
    metadata : Dict[str, Any]
        Additional info.
    """
    name: str
    version: str
    description: str
    features_included: List[str]
    features_excluded: List[str] = field(default_factory=list)
    exclusion_reasons: Dict[str, str] = field(default_factory=dict)
    created_at: datetime = field(default_factory=datetime.now)
    created_by: Optional[str] = None
    parent_feature_set: Optional[str] = None
    feature_table: Optional[str] = None
    feature_table_version: Optional[int] = None
    transformations: Dict[str, str] = field(default_factory=dict)
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return the feature set as a plain dict, timestamp in ISO format."""
        created_iso = self.created_at.isoformat()
        payload: Dict[str, Any] = dict(
            name=self.name,
            version=self.version,
            description=self.description,
            features_included=self.features_included,
            features_excluded=self.features_excluded,
            exclusion_reasons=self.exclusion_reasons,
            created_at=created_iso,
            created_by=self.created_by,
            parent_feature_set=self.parent_feature_set,
            feature_table=self.feature_table,
            feature_table_version=self.feature_table_version,
            transformations=self.transformations,
            metadata=self.metadata,
        )
        return payload
192
+
193
+
194
class FeatureSetRegistry:
    """
    Registry for managing feature sets.

    Provides methods for registering, retrieving, and comparing
    feature sets. Entries are stored in memory, keyed first by feature
    set name and then by version string.
    """

    def __init__(self):
        # name -> {version -> feature set}
        self._registry: Dict[str, Dict[str, "FeatureSet"]] = {}

    @staticmethod
    def _version_key(version: str) -> tuple:
        """
        Build a sort key for a semver-like version string.

        Purely numeric dotted versions order exactly as a list of their
        integer parts. In addition, a leading ``v``/``V`` is ignored,
        ``+build`` metadata is ignored, and a ``-prerelease`` suffix sorts
        before the same version without one. Parts that are not integers
        never raise; they sort after numeric parts in the same position.
        """
        def identifier_key(part: str) -> tuple:
            # Uniform (kind, number, text) triples keep every comparison
            # between like types, so mixed input cannot raise TypeError.
            try:
                return (0, int(part), "")
            except ValueError:
                return (1, 0, part)

        public, _, _build = version.partition("+")
        core, _, prerelease = public.partition("-")
        if core[:1] in ("v", "V"):
            core = core[1:]

        core_key = tuple(identifier_key(p) for p in core.split("."))
        pre_key = (
            tuple(identifier_key(p) for p in prerelease.split("."))
            if prerelease
            else ()
        )
        # A release (flag 1) outranks any pre-release (flag 0) of the same core.
        return (core_key, 0 if prerelease else 1, pre_key)

    def register(self, feature_set: "FeatureSet") -> None:
        """
        Register a new feature set.

        Parameters
        ----------
        feature_set : FeatureSet
            Feature set to register.

        Raises
        ------
        ValueError
            If feature set with same name and version exists.
        """
        name = feature_set.name
        version = feature_set.version

        versions = self._registry.setdefault(name, {})
        if version in versions:
            raise ValueError(
                f"Feature set '{name}' version '{version}' already registered."
            )

        versions[version] = feature_set

    def get(
        self,
        name: str,
        version: str
    ) -> Optional["FeatureSet"]:
        """
        Get a feature set by name and version.

        Parameters
        ----------
        name : str
            Feature set name.
        version : str
            Version string.

        Returns
        -------
        FeatureSet or None
            The feature set, or None if not found.
        """
        return self._registry.get(name, {}).get(version)

    def get_latest(self, name: str) -> Optional["FeatureSet"]:
        """
        Get the latest version of a feature set.

        Versions are ordered with a semver-like key, so strings such as
        ``"2.0.0-rc1"`` or ``"v1.2"`` are ranked rather than raising.

        Parameters
        ----------
        name : str
            Feature set name.

        Returns
        -------
        FeatureSet or None
            Latest version, or None if not found.
        """
        versions = self._registry.get(name)
        if not versions:
            return None

        # Stable sort + last element: among versions with equal keys the
        # most recently registered one wins.
        ordered = sorted(versions, key=self._version_key)
        return versions[ordered[-1]]

    def list_all(self) -> List["FeatureSet"]:
        """
        List all registered feature sets.

        Returns
        -------
        List[FeatureSet]
            All feature sets, grouped by name in registration order.
        """
        return [
            feature_set
            for versions in self._registry.values()
            for feature_set in versions.values()
        ]

    def list_versions(self, name: str) -> List[str]:
        """
        List all versions of a feature set.

        Parameters
        ----------
        name : str
            Feature set name.

        Returns
        -------
        List[str]
            Available versions in registration order (empty if unknown).
        """
        return list(self._registry.get(name, {}))

    def compare(
        self,
        set1: "FeatureSet",
        set2: "FeatureSet"
    ) -> Dict[str, List[str]]:
        """
        Compare two feature sets.

        Parameters
        ----------
        set1 : FeatureSet
            First feature set.
        set2 : FeatureSet
            Second feature set.

        Returns
        -------
        Dict[str, List[str]]
            Dictionary with 'added', 'removed', and 'unchanged' keys.
            Each list is duplicate-free and deterministically ordered:
            'added' follows ``set2``'s feature order, 'removed' and
            'unchanged' follow ``set1``'s.
        """
        # dict.fromkeys de-duplicates while keeping first-seen order.
        first = list(dict.fromkeys(set1.features_included))
        second = list(dict.fromkeys(set2.features_included))
        in_first = set(first)
        in_second = set(second)

        return {
            "added": [f for f in second if f not in in_first],
            "removed": [f for f in first if f not in in_second],
            "unchanged": [f for f in first if f in in_second],
        }
@@ -0,0 +1,239 @@
1
+ from dataclasses import dataclass, field
2
+ from enum import Enum
3
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional
4
+
5
+ import numpy as np
6
+
7
+ from customer_retention.core.compat import DataFrame, is_numeric_dtype, pd
8
+
9
+ if TYPE_CHECKING:
10
+ from customer_retention.analysis.auto_explorer.findings import FeatureAvailabilityMetadata
11
+
12
+
13
class SelectionMethod(Enum):
    """Identifiers for the available feature-selection strategies.

    Each member's value is the same upper-case string as its name.
    """

    VARIANCE = "VARIANCE"
    CORRELATION = "CORRELATION"
    MUTUAL_INFO = "MUTUAL_INFO"
    IMPORTANCE = "IMPORTANCE"
    RECURSIVE = "RECURSIVE"
    L1_SELECTION = "L1_SELECTION"
20
+
21
+
22
@dataclass
class FeatureSelectionResult:
    """Outcome of a feature-selection run.

    Bundles the reduced frame with the bookkeeping that explains which
    features were kept, which were dropped, and why.
    """

    # Frame restricted to the retained columns.
    df: DataFrame
    # Names of the features that survived selection.
    selected_features: List[str]
    # Names of the features that were removed.
    dropped_features: List[str]
    # Maps each dropped feature to a human-readable reason.
    drop_reasons: Dict[str, str]
    # Strategy that produced this result.
    method_used: SelectionMethod
    # Optional per-feature scores; left as None when not provided.
    importance_scores: Optional[Dict[str, float]] = None
30
+
31
+
32
@dataclass
class AvailabilityRecommendation:
    """Suggested handling options for a column with limited availability."""

    column: str
    issue_type: str
    coverage_pct: float
    first_valid_date: Optional[str]
    last_valid_date: Optional[str]
    options: List[Dict[str, Any]] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return the recommendation as a plain dictionary keyed by field name.

        Values are returned as-is (no copying), so ``options`` is the
        same list object held by the instance.
        """
        exported = (
            "column",
            "issue_type",
            "coverage_pct",
            "first_valid_date",
            "last_valid_date",
            "options",
        )
        return {attr: getattr(self, attr) for attr in exported}
50
+
51
+
52
class FeatureSelector:
    """Select features by variance and/or pairwise correlation.

    ``fit`` decides which columns to keep, ``transform`` applies that
    decision to a frame.  Only ``SelectionMethod.VARIANCE`` and
    ``SelectionMethod.CORRELATION`` apply a primary filter; for any other
    method ``fit`` runs just the optional correlation filter and the
    ``max_features`` cap.  Features named in ``preserve_features`` are
    never dropped by any step.
    """

    def __init__(
        self,
        method: SelectionMethod = SelectionMethod.VARIANCE,
        variance_threshold: float = 0.01,
        correlation_threshold: float = 0.95,
        target_column: Optional[str] = None,
        preserve_features: Optional[List[str]] = None,
        max_features: Optional[int] = None,
        apply_correlation_filter: bool = False,
    ):
        self.method = method
        self.variance_threshold = variance_threshold
        self.correlation_threshold = correlation_threshold
        self.target_column = target_column
        self.preserve_features = preserve_features or []
        self.max_features = max_features
        self.apply_correlation_filter = apply_correlation_filter

        self.selected_features: List[str] = []
        self.dropped_features: List[str] = []
        self.drop_reasons: Dict[str, str] = {}
        self._is_fitted = False

    def fit(self, df: DataFrame) -> "FeatureSelector":
        """Determine which features to keep; returns ``self``.

        Every column except ``target_column`` starts as selected.  State
        from a previous ``fit`` call is discarded.
        """
        feature_cols = [c for c in df.columns if c != self.target_column]

        self.selected_features = feature_cols.copy()
        self.dropped_features = []
        self.drop_reasons = {}

        if self.method == SelectionMethod.VARIANCE:
            self._apply_variance_selection(df, feature_cols)
        elif self.method == SelectionMethod.CORRELATION:
            self._apply_correlation_selection(df, feature_cols)

        if self.apply_correlation_filter and self.method != SelectionMethod.CORRELATION:
            self._apply_correlation_selection(df, self.selected_features.copy())

        self._apply_max_features_limit(df)

        self._is_fitted = True
        return self

    def transform(self, df: DataFrame) -> FeatureSelectionResult:
        """Return a copy of ``df`` restricted to the selected features.

        The target column is appended when it is present in ``df``;
        selected features missing from ``df`` are silently skipped.

        Raises
        ------
        ValueError
            If ``fit`` has not been called.
        """
        if not self._is_fitted:
            raise ValueError("Selector not fitted. Call fit() first.")

        cols_to_keep = self.selected_features.copy()
        if self.target_column and self.target_column in df.columns:
            cols_to_keep.append(self.target_column)

        cols_to_keep = [c for c in cols_to_keep if c in df.columns]
        result_df = df[cols_to_keep].copy()

        return FeatureSelectionResult(
            df=result_df,
            selected_features=self.selected_features.copy(),
            dropped_features=self.dropped_features.copy(),
            drop_reasons=self.drop_reasons.copy(),
            method_used=self.method,
        )

    def fit_transform(self, df: DataFrame) -> FeatureSelectionResult:
        """Fit on ``df`` and immediately transform it."""
        self.fit(df)
        return self.transform(df)

    def _drop(self, feature: str, reason: str) -> None:
        """Move ``feature`` from selected to dropped and record ``reason``."""
        if feature in self.selected_features:
            self.selected_features.remove(feature)
            self.dropped_features.append(feature)
            self.drop_reasons[feature] = reason

    def _apply_max_features_limit(self, df: DataFrame) -> None:
        """Trim the selection to at most ``max_features`` columns.

        Preserved features always stay and count against the limit; the
        remaining slots go to the non-preserved features with the
        highest variance.  Non-numeric columns and columns whose variance
        is NaN rank last.  Variance is computed per numeric column, so
        non-numeric columns no longer make ``DataFrame.var()`` raise.
        If the preserved features alone exceed the limit, they are all
        kept and every other feature is dropped.
        """
        if not self.max_features or len(self.selected_features) <= self.max_features:
            return

        candidates = [f for f in self.selected_features if f not in self.preserve_features]
        preserved_count = len(self.selected_features) - len(candidates)
        slots = max(self.max_features - preserved_count, 0)

        def rank_key(feature):
            series = df[feature]
            variance = series.var() if is_numeric_dtype(series) else float("nan")
            if pd.isna(variance):
                return (1, 0.0)
            return (0, -float(variance))

        # sorted() is stable: equal variances keep column order.
        ranked = sorted(candidates, key=rank_key)
        losers = set(ranked[slots:])
        for feature in candidates:
            if feature in losers:
                self._drop(feature, "max_features limit")

    def _apply_variance_selection(self, df: DataFrame, features: List[str]) -> None:
        """Drop numeric features whose variance is NaN or below the threshold.

        Preserved and non-numeric features are left untouched.
        """
        for feature in features:
            if feature in self.preserve_features:
                continue

            series = df[feature]
            if not is_numeric_dtype(series):
                continue

            variance = series.var()
            if pd.isna(variance) or variance < self.variance_threshold:
                self._drop(feature, f"low variance ({variance:.6f})")

    def _apply_correlation_selection(self, df: DataFrame, features: List[str]) -> None:
        """Drop one feature of every numeric pair correlated above the threshold.

        Pairs are visited in column order over the upper triangle of the
        absolute correlation matrix.  A pair is skipped once either
        member is already marked for removal: the redundancy is resolved,
        and dropping the survivor as well would remove a feature that may
        not be correlated with anything still selected.
        """
        numeric_features = [
            f
            for f in features
            if f in df.columns and is_numeric_dtype(df[f]) and f in self.selected_features
        ]

        if len(numeric_features) < 2:
            return

        corr_matrix = df[numeric_features].corr().abs()

        # Keep only entries strictly above the diagonal so each pair is seen once.
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

        # dict used as an insertion-ordered set so the order of
        # dropped_features is reproducible across runs.
        to_drop: Dict[Any, None] = {}
        for column in upper.columns:
            correlated = upper.index[upper[column] > self.correlation_threshold].tolist()
            for corr_feature in correlated:
                if column in to_drop or corr_feature in to_drop:
                    continue
                loser = self._choose_correlated_drop(df, column, corr_feature)
                if loser is not None:
                    to_drop[loser] = None

        for feature in to_drop:
            self._drop(feature, f"high correlation (> {self.correlation_threshold})")

    def _choose_correlated_drop(self, df: DataFrame, column: Any, corr_feature: Any) -> Optional[Any]:
        """Pick which member of a correlated pair to drop, or None to keep both.

        A preserved feature always wins; if both are preserved neither is
        dropped.  Otherwise the feature with the lower variance is
        dropped (``corr_feature`` on ties).
        """
        if corr_feature in self.preserve_features:
            return None if column in self.preserve_features else column
        if column in self.preserve_features:
            return corr_feature
        if df[column].var() >= df[corr_feature].var():
            return corr_feature
        return column

    def get_availability_recommendations(self, availability: Optional["FeatureAvailabilityMetadata"]) -> List[AvailabilityRecommendation]:
        """Build handling recommendations for columns with availability issues.

        Columns listed in ``new_tracking``, ``retired_tracking`` and
        ``partial_window`` are considered; columns without an entry in
        ``availability.features`` are skipped.  Returns an empty list
        when ``availability`` is None.
        """
        if availability is None:
            return []
        recommendations: List[AvailabilityRecommendation] = []
        problem_columns = availability.new_tracking + availability.retired_tracking + availability.partial_window
        for col in problem_columns:
            feat_info = availability.features.get(col)
            if feat_info is None:
                continue
            recommendations.append(AvailabilityRecommendation(
                column=col,
                issue_type=feat_info.availability_type,
                coverage_pct=feat_info.coverage_pct,
                first_valid_date=feat_info.first_valid_date,
                last_valid_date=feat_info.last_valid_date,
                options=self._build_availability_options(
                    col,
                    feat_info.availability_type,
                    feat_info.first_valid_date,
                    feat_info.last_valid_date,
                    feat_info.coverage_pct,
                ),
            ))
        return recommendations

    def _build_availability_options(self, col: str, issue_type: str, first_date: Optional[str], last_date: Optional[str], coverage_pct: float) -> List[Dict[str, Any]]:
        """List the handling options offered for one problematic column.

        "remove" and "add_indicator" are always offered; two further
        options depend on ``issue_type`` ("new_tracking", "retired" or
        "partial_window"); "impute" is added when coverage is at least 30.
        """
        options: List[Dict[str, Any]] = []
        options.append({
            "type": "remove",
            "description": f"Remove '{col}' from feature selection (recommended for most cases)",
            "preserves_data": False,
            "recommended": True,
        })
        options.append({
            "type": "add_indicator",
            "description": f"Create '{col}_available' indicator column to flag valid observations",
            "preserves_data": True,
        })
        if issue_type == "new_tracking":
            options.append({
                "type": "filter_window",
                "description": f"Filter training data to start from {first_date}",
                "preserves_data": True,
            })
            options.append({
                "type": "segment_by_cohort",
                "description": f"Train separate models: pre-{first_date} cohort (without feature) vs post-{first_date} cohort (with feature)",
                "preserves_data": True,
            })
        elif issue_type == "retired":
            options.append({
                "type": "filter_window",
                "description": f"Filter test/scoring data to end at {last_date}",
                "preserves_data": True,
            })
            options.append({
                "type": "segment_by_cohort",
                "description": "Use feature only for historical scoring; train fallback model without it for future predictions",
                "preserves_data": True,
            })
        elif issue_type == "partial_window":
            options.append({
                "type": "filter_window",
                "description": f"Use data only within {first_date} to {last_date}",
                "preserves_data": True,
            })
            options.append({
                "type": "segment_by_availability",
                "description": "Train separate models: one using this feature (within window), one without (outside window)",
                "preserves_data": True,
            })
        if coverage_pct >= 30:
            options.append({
                "type": "impute",
                "description": f"Impute missing values (median/mode) - {coverage_pct:.0f}% coverage may be sufficient",
                "preserves_data": True,
            })
        return options