churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,507 @@
1
+ """FeatureCapacityAnalyzer - estimates favorable feature-to-data ratios for modeling.
2
+
3
+ Key Concepts:
4
+ Events Per Variable (EPV): Minimum minority class samples per feature
5
+ - Conservative (EPV=20): Very stable, recommended for regulatory/high-stakes
6
+ - Moderate (EPV=10): Standard practice, widely used in literature
7
+ - Aggressive (EPV=5): With strong regularization, acceptable for exploration
8
+
9
+ Effective Features: Features that contribute independent information
10
+ - Highly correlated features (r > 0.8) count as ~1 effective feature
11
+ - Use eigenvalue analysis to estimate true dimensionality
12
+
13
+ Guidelines Based on Common Statistical Practice:
14
+ - Harrell (2015): EPV >= 10-20 for logistic regression
15
+ - Peduzzi et al. (1996): EPV >= 10 to avoid coefficient bias
16
+ - Tree models: More flexible, but still benefit from adequate data
17
+ """
18
+
19
+ from dataclasses import dataclass
20
+ from typing import Any, Dict, List, Optional, Tuple
21
+
22
+ import numpy as np
23
+
24
+ from customer_retention.core.compat import pd
25
+
26
+
27
@dataclass
class EffectiveFeaturesResult:
    """Outcome of the effective (independent) feature analysis.

    Holds the raw feature count, the estimated effective count, the
    features flagged as redundant, one representative per group, the
    groups themselves and, optionally, the correlation matrix used.
    """

    total_count: int
    effective_count: float
    redundant_features: List[str]
    representative_features: List[str]
    feature_clusters: List[List[str]]
    correlation_matrix: Optional[pd.DataFrame] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return a compact summary dict.

        The cluster contents and the correlation matrix are not included;
        only the number of clusters is reported under ``n_clusters``.
        """
        exported = (
            "total_count",
            "effective_count",
            "redundant_features",
            "representative_features",
        )
        summary: Dict[str, Any] = {name: getattr(self, name) for name in exported}
        summary["n_clusters"] = len(self.feature_clusters)
        return summary
45
+
46
+
47
@dataclass
class ModelComplexityGuidance:
    """Model-complexity advice derived from the available data.

    Carries per-model-family feature ceilings, the suggested model type
    and two lists of human-readable recommendations.
    """

    max_features_linear: int
    max_features_tree: int
    max_features_regularized: int
    recommended_model_type: str
    model_recommendations: List[str]
    recommendations: List[str]

    def to_dict(self) -> Dict[str, Any]:
        """Return every field as a plain dict, in declaration order."""
        return dict(
            max_features_linear=self.max_features_linear,
            max_features_tree=self.max_features_tree,
            max_features_regularized=self.max_features_regularized,
            recommended_model_type=self.recommended_model_type,
            model_recommendations=self.model_recommendations,
            recommendations=self.recommendations,
        )
66
+
67
+
68
@dataclass
class FeatureCapacityResult:
    """Outcome of a feature capacity analysis for one dataset."""

    total_samples: int
    minority_class_samples: int
    total_features: int
    effective_features: float
    recommended_features_conservative: int
    recommended_features_moderate: int
    recommended_features_aggressive: int
    events_per_variable: float
    samples_per_feature: float
    capacity_status: str  # "adequate", "limited", "inadequate"
    recommendations: List[str]
    effective_features_result: Optional[EffectiveFeaturesResult] = None
    complexity_guidance: Optional[ModelComplexityGuidance] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the scalar fields and recommendations as a plain dict.

        The nested ``effective_features_result`` and ``complexity_guidance``
        objects are not part of the output.
        """
        exported = (
            "total_samples",
            "minority_class_samples",
            "total_features",
            "effective_features",
            "recommended_features_conservative",
            "recommended_features_moderate",
            "recommended_features_aggressive",
            "events_per_variable",
            "samples_per_feature",
            "capacity_status",
            "recommendations",
        )
        return {name: getattr(self, name) for name in exported}
99
+
100
+
101
@dataclass
class SegmentCapacityResult:
    """Outcome of a per-segment feature capacity analysis."""

    segment_capacities: Dict[str, FeatureCapacityResult]
    recommended_strategy: str  # "single_model", "segment_models", "hybrid"
    strategy_reason: str
    viable_segments: List[str]
    insufficient_segments: List[str]
    recommendations: List[str]

    def to_dict(self) -> Dict[str, Any]:
        """Return the strategy summary plus each segment's capacity dict.

        Per-segment results are serialised with their own ``to_dict`` and
        placed under ``segment_details``, keyed by segment name.
        """
        payload: Dict[str, Any] = {
            "recommended_strategy": self.recommended_strategy,
            "strategy_reason": self.strategy_reason,
            "viable_segments": self.viable_segments,
            "insufficient_segments": self.insufficient_segments,
            "recommendations": self.recommendations,
        }
        details: Dict[str, Any] = {}
        for segment_name, capacity in self.segment_capacities.items():
            details[segment_name] = capacity.to_dict()
        payload["segment_details"] = details
        return payload
120
+
121
+
122
+ class FeatureCapacityAnalyzer:
123
+ """Analyzes feature capacity relative to available data.
124
+
125
+ Determines how many features can be reliably used given:
126
+ - Total sample size
127
+ - Minority class events (for classification)
128
+ - Feature correlation structure
129
+ - Model type assumptions
130
+
131
+ Key Assumptions:
132
+ 1. EPV (Events Per Variable) of 10-20 needed for stable logistic regression
133
+ 2. Tree models are more flexible but still benefit from adequate data
134
+ 3. Highly correlated features provide redundant information
135
+ 4. Regularization allows using more features with less data
136
+ """
137
+
138
+ EPV_CONSERVATIVE = 20 # Very stable, regulatory-grade
139
+ EPV_MODERATE = 10 # Standard practice
140
+ EPV_AGGRESSIVE = 5 # With strong regularization
141
+ CORRELATION_THRESHOLD = 0.8 # Features above this are considered redundant
142
+ MIN_SEGMENT_EVENTS = 50 # Minimum events for viable segment model
143
+
144
def analyze(
    self,
    df: pd.DataFrame,
    feature_cols: List[str],
    target_col: str,
) -> FeatureCapacityResult:
    """Analyze feature capacity for a dataset.

    Args:
        df: Data holding ``target_col`` and the candidate feature columns.
        feature_cols: Candidate feature names. Every entry counts towards
            the feature total used for EPV and samples-per-feature.
        target_col: Column whose value counts give the minority-class size.

    Returns:
        A FeatureCapacityResult with the sample and feature counts, EPV,
        samples per feature, the recommended feature counts for each EPV
        level, the capacity status, recommendations, and the nested
        effective-feature and complexity-guidance results.
    """
    n_samples = len(df)
    n_features = len(feature_cols)

    # Minority class size. An empty or all-missing target leaves no counted
    # classes; min() of the empty counts is NaN and int(NaN) would raise, so
    # report zero minority events instead of crashing.
    class_counts = df[target_col].value_counts()
    minority_samples = int(class_counts.min()) if len(class_counts) > 0 else 0

    # Events per variable and samples per feature (0 when there are no features)
    epv = minority_samples / n_features if n_features > 0 else 0
    samples_per_feature = n_samples / n_features if n_features > 0 else 0

    # Effective (independent) feature estimate
    eff_result = self.calculate_effective_features(df, feature_cols)
    effective_features = eff_result.effective_count

    # Feature budgets at each EPV level, truncated to whole features
    rec_conservative = int(minority_samples / self.EPV_CONSERVATIVE)
    rec_moderate = int(minority_samples / self.EPV_MODERATE)
    rec_aggressive = int(minority_samples / self.EPV_AGGRESSIVE)

    capacity_status = self._determine_capacity_status(epv, n_features, effective_features)

    recommendations = self._generate_recommendations(
        epv, n_features, effective_features, minority_samples, capacity_status
    )

    complexity_guidance = self.get_complexity_guidance(n_samples, minority_samples, n_features)

    return FeatureCapacityResult(
        total_samples=n_samples,
        minority_class_samples=minority_samples,
        total_features=n_features,
        effective_features=effective_features,
        recommended_features_conservative=rec_conservative,
        recommended_features_moderate=rec_moderate,
        recommended_features_aggressive=rec_aggressive,
        events_per_variable=epv,
        samples_per_feature=samples_per_feature,
        capacity_status=capacity_status,
        recommendations=recommendations,
        effective_features_result=eff_result,
        complexity_guidance=complexity_guidance,
    )
198
+
199
def calculate_effective_features(
    self,
    df: pd.DataFrame,
    feature_cols: List[str],
) -> EffectiveFeaturesResult:
    """Calculate effective number of independent features.

    Groups features whose absolute pairwise correlation reaches
    ``CORRELATION_THRESHOLD`` and estimates the dimensionality of the
    feature space from the eigenvalues of the correlation matrix.

    Args:
        df: Data holding the feature columns.
        feature_cols: Candidate feature names; names missing from ``df``
            are ignored.

    Returns:
        An EffectiveFeaturesResult. ``effective_count`` is the number of
        leading eigenvalues needed to reach 95% of the total variance,
        capped at the number of valid columns. When the correlation matrix
        is unusable (non-finite entries or a failed decomposition) it falls
        back to the number of representative features.
    """
    valid_cols = [c for c in feature_cols if c in df.columns]
    if len(valid_cols) < 2:
        # Nothing to correlate: every present feature stands for itself.
        return EffectiveFeaturesResult(
            total_count=len(valid_cols),
            effective_count=float(len(valid_cols)),
            redundant_features=[],
            representative_features=valid_cols,
            feature_clusters=[valid_cols] if valid_cols else [],
        )

    # NOTE(review): the columns are passed to corr() as-is; how non-numeric
    # columns are treated depends on the pandas version - confirm that
    # callers pre-filter to numeric features.
    corr_matrix = df[valid_cols].corr()

    # Greedy grouping: each not-yet-grouped column collects every later,
    # not-yet-grouped column it is highly correlated with.
    redundant = set()
    feature_clusters = []
    processed = set()

    for i, col1 in enumerate(valid_cols):
        if col1 in processed:
            continue

        cluster = [col1]
        for col2 in valid_cols[i + 1:]:
            if col2 in processed:
                continue
            # NaN correlations (e.g. constant columns) compare False here.
            if abs(corr_matrix.loc[col1, col2]) >= self.CORRELATION_THRESHOLD:
                cluster.append(col2)
                redundant.add(col2)

        if len(cluster) > 1:
            feature_clusters.append(cluster)
        processed.update(cluster)

    # Representatives: the first member of each cluster, then every
    # unclustered column in its original order.
    clustered = {col for cluster in feature_clusters for col in cluster}
    representative = [cluster[0] for cluster in feature_clusters]
    representative.extend(col for col in valid_cols if col not in clustered)

    # Default used whenever the eigenvalue estimate cannot be trusted.
    effective_count = float(len(representative))
    try:
        corr_values = np.asarray(corr_matrix.values, dtype=float)
        # A correlation matrix with NaN/inf entries gives meaningless (or
        # failing) eigenvalues, so only decompose a fully finite matrix.
        if np.isfinite(corr_values).all():
            eigenvalues = np.sort(np.linalg.eigvalsh(corr_values))[::-1]
            if np.isfinite(eigenvalues).all():
                # Number of leading eigenvalues needed for 95% of the variance
                cumulative = np.cumsum(eigenvalues)
                needed = int(np.searchsorted(cumulative, 0.95 * eigenvalues.sum())) + 1
                effective_count = float(min(needed, len(valid_cols)))
    except Exception:
        # Deliberate best-effort: any failure in the decomposition keeps
        # the cluster-based estimate rather than aborting the analysis.
        effective_count = float(len(representative))

    return EffectiveFeaturesResult(
        total_count=len(valid_cols),
        effective_count=effective_count,
        # Column order instead of set order keeps the output deterministic.
        redundant_features=[col for col in valid_cols if col in redundant],
        representative_features=representative,
        feature_clusters=feature_clusters,
        correlation_matrix=corr_matrix,
    )
278
+
279
def analyze_segment_capacity(
    self,
    df: pd.DataFrame,
    feature_cols: List[str],
    target_col: str,
    segment_col: str,
) -> SegmentCapacityResult:
    """Analyze feature capacity for each segment to guide segmented modeling.

    Every distinct value of ``segment_col`` is analyzed with ``self.analyze``
    on the rows belonging to it. A segment is viable when its capacity status
    is ``"adequate"``; all other segments are listed as insufficient.

    Args:
        df: Full dataset.
        feature_cols: Feature columns passed through to ``self.analyze``.
        target_col: Target column passed through to ``self.analyze``.
        segment_col: Column whose distinct values define the segments.

    Returns:
        SegmentCapacityResult with the per-segment capacities (keyed by the
        string form of the segment value), the viable and insufficient
        segment names, the recommended strategy with its reason, and the
        generated recommendations.
    """
    segment_capacities = {}
    viable_segments = []
    insufficient_segments = []

    segment_series = df[segment_col]
    missing_handled = False

    for segment_value in segment_series.unique():
        is_missing = pd.api.types.is_scalar(segment_value) and pd.isna(segment_value)
        if is_missing:
            # Missing labels never compare equal to themselves, so an equality
            # mask would select zero rows; match them with isna() instead and
            # only once even if several missing markers (None, NaN) appear.
            if missing_handled:
                continue
            missing_handled = True
            segment_df = df[segment_series.isna()]
        else:
            segment_df = df[segment_series == segment_value]

        capacity = self.analyze(segment_df, feature_cols, target_col)
        segment_key = str(segment_value)
        segment_capacities[segment_key] = capacity

        if capacity.capacity_status == "adequate":
            viable_segments.append(segment_key)
        else:
            insufficient_segments.append(segment_key)

    # Determine recommended strategy
    strategy, reason = self._determine_segment_strategy(
        segment_capacities, viable_segments, insufficient_segments
    )

    # Generate recommendations
    recommendations = self._generate_segment_recommendations(
        segment_capacities, viable_segments, insufficient_segments, strategy
    )

    return SegmentCapacityResult(
        segment_capacities=segment_capacities,
        recommended_strategy=strategy,
        strategy_reason=reason,
        viable_segments=viable_segments,
        insufficient_segments=insufficient_segments,
        recommendations=recommendations,
    )
319
+
320
def get_complexity_guidance(
    self,
    n_samples: int,
    n_minority: int,
    n_features: int,
) -> ModelComplexityGuidance:
    """Suggest a model family and feature budgets for the available data.

    Args:
        n_samples: Total number of rows.
        n_minority: Number of minority-class rows (events).
        n_features: Number of candidate features.

    Returns:
        ModelComplexityGuidance with the feature budgets for linear, tree and
        regularized models, the recommended model type with its notes, and a
        one-item list describing the events-per-variable (EPV) level.
    """
    # Feature budgets. Linear and regularized budgets divide the event count
    # by the class-level EPV thresholds; the tree budget allows one feature
    # per 30 samples.
    max_linear = int(n_minority / self.EPV_MODERATE)
    regularized_budget = int(n_minority / self.EPV_AGGRESSIVE)
    tree_budget = int(n_samples / 30)

    # Ordered rules: the first one that applies decides the model family.
    model_rules = (
        (
            n_minority < 50,
            "simple_linear",
            [
                "Very limited events - use simple logistic regression with 1-3 features",
                "Consider exact logistic regression for very small samples",
                "Cross-validation may be unreliable - use bootstrap or leave-one-out",
            ],
        ),
        (
            n_features <= max_linear,
            "linear",
            [
                "Adequate data for standard logistic regression",
                "Can use all features without regularization",
                "Consider tree models for comparison",
            ],
        ),
        (
            n_features <= regularized_budget,
            "regularized_linear",
            [
                "Use L1/L2 regularization (Lasso, Ridge, Elastic Net)",
                "Cross-validate regularization strength",
                "Tree-based models are also well-suited",
            ],
        ),
        (
            True,
            "tree_ensemble",
            [
                "Feature count exceeds linear model capacity",
                "Use Random Forest, XGBoost, or LightGBM",
                "Consider feature selection before linear models",
            ],
        ),
    )
    model_type, model_notes = next(
        (kind, notes) for applies, kind, notes in model_rules if applies
    )

    # Events per variable; treated as unbounded when there are no features.
    if n_features > 0:
        epv = n_minority / n_features
    else:
        epv = float("inf")

    # Ordered EPV tiers: the first upper bound that epv falls under wins.
    epv_tiers = (
        (5, f"Critical: EPV={epv:.1f} is very low. Reduce features to {max_linear} or fewer."),
        (10, f"Limited: EPV={epv:.1f}. Use regularization or reduce to {max_linear} features."),
        (20, f"Moderate: EPV={epv:.1f}. Adequate for regularized models."),
    )
    epv_message = next(
        (text for upper_bound, text in epv_tiers if epv < upper_bound),
        f"Adequate: EPV={epv:.1f}. Sufficient data for robust modeling.",
    )

    return ModelComplexityGuidance(
        max_features_linear=max_linear,
        max_features_tree=tree_budget,
        max_features_regularized=regularized_budget,
        recommended_model_type=model_type,
        model_recommendations=model_notes,
        recommendations=[epv_message],
    )
396
+
397
def _determine_capacity_status(
    self,
    epv: float,
    n_features: int,
    effective_features: float,
) -> str:
    """Classify overall capacity from the events-per-variable ratio.

    Only ``epv`` drives the result; ``n_features`` and ``effective_features``
    are accepted but not consulted.

    Returns:
        ``"adequate"`` when ``epv`` reaches ``self.EPV_MODERATE``,
        ``"limited"`` when it reaches ``self.EPV_AGGRESSIVE``, and
        ``"inadequate"`` otherwise.
    """
    if epv >= self.EPV_MODERATE:
        return "adequate"
    return "limited" if epv >= self.EPV_AGGRESSIVE else "inadequate"
410
+
411
def _generate_recommendations(
    self,
    epv: float,
    n_features: int,
    effective_features: float,
    minority_samples: int,
    capacity_status: str,
) -> List[str]:
    """Build the list of actionable recommendations for one capacity analysis.

    Up to three messages are produced, in this order: an EPV warning when
    ``epv`` is below ``self.EPV_MODERATE``, a redundancy note when the
    effective feature count is under 70% of ``n_features``, and model
    selection advice for the ``"inadequate"`` and ``"limited"`` statuses.
    """
    # Model-selection advice keyed by capacity status; other statuses get none.
    status_advice = {
        "inadequate": (
            "Consider: (1) Feature selection, (2) PCA for dimensionality reduction, "
            "(3) Collecting more data, (4) Simple 2-3 feature model."
        ),
        "limited": "Use regularized models (Lasso, Ridge) or tree ensembles.",
    }

    messages = []

    # EPV warning: below the aggressive threshold the feature budget is spelled
    # out; between the two thresholds regularization is suggested instead.
    below_minimum = epv < self.EPV_AGGRESSIVE
    if below_minimum:
        max_features = int(minority_samples / self.EPV_MODERATE)
        messages.append(
            f"⚠️ EPV={epv:.1f} is below minimum. Reduce to {max_features} features or collect more data."
        )
    if not below_minimum and epv < self.EPV_MODERATE:
        messages.append(
            f"EPV={epv:.1f} is limited. Use strong regularization (L1/Lasso)."
        )

    # Redundancy note when fewer than 70% of the features are effective.
    if effective_features < n_features * 0.7:
        redundant = n_features - int(effective_features)
        messages.append(
            f"~{redundant} features are redundant due to high correlation. Consider removing."
        )

    advice = status_advice.get(capacity_status)
    if advice is not None:
        messages.append(advice)

    return messages
452
+
453
def _determine_segment_strategy(
    self,
    capacities: Dict[str, FeatureCapacityResult],
    viable: List[str],
    insufficient: List[str],
) -> Tuple[str, str]:
    """Pick a segmentation strategy from the viable/insufficient split.

    ``capacities`` is accepted but not consulted; only the sizes of the two
    lists matter.

    Returns:
        A ``(strategy, reason)`` pair where strategy is ``"segment_models"``
        when nothing is insufficient, ``"hybrid"`` when at least half of the
        segments are viable, and ``"single_model"`` otherwise.
    """
    viable_count = len(viable)
    insufficient_count = len(insufficient)
    segment_count = viable_count + insufficient_count

    if insufficient_count == 0:
        return "segment_models", "All segments have adequate data for separate models."

    if viable_count == 0:
        return "single_model", "No segments have adequate data for separate models."

    if viable_count >= segment_count * 0.5:
        reason = (
            f"{viable_count}/{segment_count} segments viable. "
            "Use segment models for large segments, pooled model for small."
        )
        return "hybrid", reason

    reason = (
        f"Only {viable_count}/{segment_count} segments have adequate data. "
        "Single model recommended."
    )
    return "single_model", reason
472
+
473
def _generate_segment_recommendations(
    self,
    capacities: Dict[str, FeatureCapacityResult],
    viable: List[str],
    insufficient: List[str],
    strategy: str,
) -> List[str]:
    """Turn the chosen segmentation strategy into human-readable advice.

    Args:
        capacities: Per-segment capacity results keyed by segment name; only
            ``minority_class_samples`` of the insufficient segments is read.
        viable: Names of segments that can support their own model.
        insufficient: Names of segments that cannot.
        strategy: ``"segment_models"``, ``"hybrid"`` or anything else, which
            is handled as the single-model case.

    Returns:
        The recommendation messages for the given strategy.
    """
    if strategy == "segment_models":
        return [
            f"✅ All {len(viable)} segments have sufficient data for independent models.",
            "Consider: Separate models may capture segment-specific patterns better.",
        ]

    if strategy == "hybrid":
        return [
            f"Build separate models for: {', '.join(viable)}",
            f"Pool small segments into single model: {', '.join(insufficient)}",
        ]

    # Single-model case: summarize how few events the small segments hold,
    # then give the pooled-model advice.
    advice = []
    if insufficient:
        total_events = sum(
            capacities[name].minority_class_samples for name in insufficient
        )
        advice.append(
            f"Small segments ({', '.join(insufficient)}) have {total_events} total events."
        )
    advice.append("Use a single model with segment as a feature for stratification.")
    return advice