churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,483 @@
1
+ """
2
+ Distribution analysis module for exploratory data analysis.
3
+
4
+ This module provides functions for analyzing distributions and recommending
5
+ appropriate transformations based on distribution characteristics.
6
+ """
7
+
8
+ from dataclasses import dataclass, field
9
+ from enum import Enum
10
+ from typing import Any, Dict, List, Optional
11
+
12
+ import numpy as np
13
+
14
+ from customer_retention.core.compat import Series, pd
15
+
16
+
17
class DistributionTransformationType(Enum):
    """Types of transformations for skewed distributions.

    Each member's value is the lowercase string identifier emitted by
    ``TransformationRecommendation.to_dict``.
    """
    NONE = "none"
    LOG_TRANSFORM = "log_transform"
    SQRT_TRANSFORM = "sqrt_transform"
    BOX_COX = "box_cox"
    # Historical (misspelled) member name. It stays first so it remains the
    # canonical member: existing callers, ``.name`` and lookups by value keep
    # behaving exactly as before.
    YERO_JOHNSON = "yeo_johnson"
    # Correctly spelled alias. Because it shares the value above, Enum treats
    # it as an alias of YERO_JOHNSON rather than as a new member, so iteration
    # and ``len()`` are unaffected.
    YEO_JOHNSON = "yeo_johnson"
    CAP_OUTLIERS = "cap_outliers"
    CAP_THEN_LOG = "cap_then_log"
    ZERO_INFLATION_HANDLING = "zero_inflation_handling"
27
+
28
+
29
+ @dataclass
30
+ class DistributionAnalysis:
31
+ """Result of distribution analysis for a numeric column."""
32
+ column_name: str
33
+ count: int
34
+ mean: float
35
+ std: float
36
+ min_value: float
37
+ max_value: float
38
+ median: float
39
+ q1: float
40
+ q3: float
41
+ iqr: float
42
+ skewness: float
43
+ kurtosis: float
44
+ zero_count: int
45
+ zero_percentage: float
46
+ negative_count: int
47
+ negative_percentage: float
48
+ outlier_count_iqr: int
49
+ outlier_percentage: float
50
+ percentiles: Dict[str, float] = field(default_factory=dict)
51
+
52
+ @property
53
+ def is_highly_skewed(self) -> bool:
54
+ """Check if distribution is highly skewed."""
55
+ return abs(self.skewness) > 2.0
56
+
57
+ @property
58
+ def is_moderately_skewed(self) -> bool:
59
+ """Check if distribution is moderately skewed."""
60
+ return 1.0 < abs(self.skewness) <= 2.0
61
+
62
+ @property
63
+ def has_zero_inflation(self) -> bool:
64
+ """Check if distribution has significant zero inflation."""
65
+ return self.zero_percentage > 30.0
66
+
67
+ @property
68
+ def has_heavy_tails(self) -> bool:
69
+ """Check if distribution has heavy tails (high kurtosis)."""
70
+ return self.kurtosis > 3.0
71
+
72
+ def to_dict(self) -> Dict[str, Any]:
73
+ """Convert to dictionary for display."""
74
+ return {
75
+ "column": self.column_name,
76
+ "count": self.count,
77
+ "mean": round(self.mean, 4),
78
+ "std": round(self.std, 4),
79
+ "min": round(self.min_value, 4),
80
+ "max": round(self.max_value, 4),
81
+ "median": round(self.median, 4),
82
+ "skewness": round(self.skewness, 4),
83
+ "kurtosis": round(self.kurtosis, 4),
84
+ "zero_pct": round(self.zero_percentage, 2),
85
+ "outlier_pct": round(self.outlier_percentage, 2),
86
+ "is_highly_skewed": self.is_highly_skewed,
87
+ "has_zero_inflation": self.has_zero_inflation
88
+ }
89
+
90
+
91
+ @dataclass
92
+ class TransformationRecommendation:
93
+ """Recommendation for transforming a column."""
94
+ column_name: str
95
+ recommended_transform: DistributionTransformationType
96
+ reason: str
97
+ priority: str # "high", "medium", "low"
98
+ parameters: Dict[str, Any] = field(default_factory=dict)
99
+ alternative_transforms: List[DistributionTransformationType] = field(default_factory=list)
100
+ warnings: List[str] = field(default_factory=list)
101
+
102
+ def to_dict(self) -> Dict[str, Any]:
103
+ """Convert to dictionary for display."""
104
+ return {
105
+ "column": self.column_name,
106
+ "transform": self.recommended_transform.value,
107
+ "reason": self.reason,
108
+ "priority": self.priority,
109
+ "parameters": self.parameters,
110
+ "alternatives": [t.value for t in self.alternative_transforms],
111
+ "warnings": self.warnings
112
+ }
113
+
114
+
115
+ class DistributionAnalyzer:
116
+ """
117
+ Analyzer for numeric distribution characteristics.
118
+
119
+ Provides methods for comprehensive distribution analysis and
120
+ transformation recommendations.
121
+ """
122
+
123
+ # Thresholds
124
+ HIGH_SKEWNESS_THRESHOLD = 2.0
125
+ MODERATE_SKEWNESS_THRESHOLD = 1.0
126
+ ZERO_INFLATION_THRESHOLD = 30.0
127
+ OUTLIER_THRESHOLD = 5.0
128
+ HIGH_KURTOSIS_THRESHOLD = 7.0
129
+
130
def analyze_distribution(
    self,
    series: Series,
    column_name: str
) -> DistributionAnalysis:
    """
    Comprehensive distribution analysis for a single column.

    Missing values are dropped first; every statistic is computed on the
    remaining values only.

    Parameters
    ----------
    series : Series
        Numeric data to analyze
    column_name : str
        Name of the column

    Returns
    -------
    DistributionAnalysis
        Detailed distribution statistics. When no non-null values remain,
        every statistic is zero and ``percentiles`` is empty.
    """
    clean_series = series.dropna()
    count = len(clean_series)

    if count == 0:
        # Nothing to measure: return an all-zero placeholder so callers do
        # not need a special case for empty columns.
        return DistributionAnalysis(
            column_name=column_name,
            count=0,
            mean=0.0,
            std=0.0,
            min_value=0.0,
            max_value=0.0,
            median=0.0,
            q1=0.0,
            q3=0.0,
            iqr=0.0,
            skewness=0.0,
            kurtosis=0.0,
            zero_count=0,
            zero_percentage=0.0,
            negative_count=0,
            negative_percentage=0.0,
            outlier_count_iqr=0,
            outlier_percentage=0.0
        )

    # From here on count >= 1, so the percentage divisions below are safe.
    mean = float(clean_series.mean())
    std = float(clean_series.std())
    min_value = float(clean_series.min())
    max_value = float(clean_series.max())
    median = float(clean_series.median())

    q1 = float(clean_series.quantile(0.25))
    q3 = float(clean_series.quantile(0.75))
    iqr = q3 - q1

    # Shape statistics are best-effort: any failure falls back to 0.0.
    try:
        skewness = float(clean_series.skew())
        kurtosis = float(clean_series.kurtosis())
    except Exception:
        skewness = 0.0
        kurtosis = 0.0
    # With too few samples the shape statistics come back as NaN instead of
    # raising, so apply the same 0.0 fallback rather than leaking NaN into
    # the result and its display dictionary.
    # NOTE(review): relies on pandas semantics for skew()/kurtosis() --
    # confirm for other backends exposed through the compat layer.
    if np.isnan(skewness):
        skewness = 0.0
    if np.isnan(kurtosis):
        kurtosis = 0.0

    # Zero analysis
    zero_count = int((clean_series == 0).sum())
    zero_percentage = zero_count / count * 100

    # Negative analysis
    negative_count = int((clean_series < 0).sum())
    negative_percentage = negative_count / count * 100

    # Outlier analysis (IQR method): values beyond 1.5 * IQR from the quartiles
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    outlier_mask = (clean_series < lower_bound) | (clean_series > upper_bound)
    outlier_count_iqr = int(outlier_mask.sum())
    outlier_percentage = outlier_count_iqr / count * 100

    # Percentiles (p25/p50/p75 reuse the quartiles and median computed above)
    percentiles = {
        "p1": float(clean_series.quantile(0.01)),
        "p5": float(clean_series.quantile(0.05)),
        "p10": float(clean_series.quantile(0.10)),
        "p25": float(q1),
        "p50": float(median),
        "p75": float(q3),
        "p90": float(clean_series.quantile(0.90)),
        "p95": float(clean_series.quantile(0.95)),
        "p99": float(clean_series.quantile(0.99))
    }

    return DistributionAnalysis(
        column_name=column_name,
        count=count,
        mean=mean,
        std=std,
        min_value=min_value,
        max_value=max_value,
        median=median,
        q1=q1,
        q3=q3,
        iqr=iqr,
        skewness=skewness,
        kurtosis=kurtosis,
        zero_count=zero_count,
        zero_percentage=zero_percentage,
        negative_count=negative_count,
        negative_percentage=negative_percentage,
        outlier_count_iqr=outlier_count_iqr,
        outlier_percentage=outlier_percentage,
        percentiles=percentiles
    )
241
+
242
+ def recommend_transformation(
243
+ self,
244
+ analysis: DistributionAnalysis
245
+ ) -> TransformationRecommendation:
246
+ """
247
+ Recommend transformation strategy based on distribution analysis.
248
+
249
+ Parameters
250
+ ----------
251
+ analysis : DistributionAnalysis
252
+ Distribution analysis results
253
+
254
+ Returns
255
+ -------
256
+ TransformationRecommendation
257
+ Recommended transformation with rationale
258
+ """
259
+ warnings = []
260
+ alternatives = []
261
+
262
+ # Decision tree for transformation recommendation
263
+ if analysis.has_zero_inflation and analysis.is_highly_skewed:
264
+ # Zero-inflated and highly skewed
265
+ recommended = DistributionTransformationType.ZERO_INFLATION_HANDLING
266
+ reason = f"Zero-inflation ({analysis.zero_percentage:.1f}%) combined with high skewness ({analysis.skewness:.2f})"
267
+ priority = "high"
268
+ parameters = {
269
+ "strategy": "separate_indicator",
270
+ "transform_non_zero": "log"
271
+ }
272
+ alternatives = [DistributionTransformationType.CAP_THEN_LOG]
273
+ warnings.append("Consider creating a binary indicator for zeros plus log transform of non-zero values")
274
+
275
+ elif analysis.has_zero_inflation:
276
+ # Zero-inflated but not highly skewed
277
+ recommended = DistributionTransformationType.ZERO_INFLATION_HANDLING
278
+ reason = f"Significant zero-inflation ({analysis.zero_percentage:.1f}%)"
279
+ priority = "medium"
280
+ parameters = {"strategy": "binary_indicator"}
281
+ alternatives = [DistributionTransformationType.SQRT_TRANSFORM]
282
+ warnings.append("Many zero values may indicate a mixture distribution")
283
+
284
+ elif analysis.negative_count > 0 and analysis.is_highly_skewed:
285
+ # Has negatives and highly skewed - use Yeo-Johnson
286
+ recommended = DistributionTransformationType.YERO_JOHNSON
287
+ reason = f"High skewness ({analysis.skewness:.2f}) with negative values present"
288
+ priority = "high"
289
+ parameters = {}
290
+ alternatives = [DistributionTransformationType.CAP_OUTLIERS]
291
+ warnings.append("Yeo-Johnson handles negative values unlike log/sqrt")
292
+
293
+ elif analysis.is_highly_skewed and analysis.outlier_percentage > self.OUTLIER_THRESHOLD:
294
+ # Highly skewed with many outliers
295
+ recommended = DistributionTransformationType.CAP_THEN_LOG
296
+ reason = f"High skewness ({analysis.skewness:.2f}) with significant outliers ({analysis.outlier_percentage:.1f}%)"
297
+ priority = "high"
298
+ parameters = {
299
+ "cap_method": "iqr",
300
+ "cap_multiplier": 1.5
301
+ }
302
+ alternatives = [DistributionTransformationType.LOG_TRANSFORM, DistributionTransformationType.BOX_COX]
303
+
304
+ elif analysis.is_highly_skewed:
305
+ # Highly skewed without major outliers
306
+ if analysis.min_value > 0:
307
+ recommended = DistributionTransformationType.LOG_TRANSFORM
308
+ reason = f"High positive skewness ({analysis.skewness:.2f}) with all positive values"
309
+ priority = "high"
310
+ parameters = {"base": "natural", "offset": 0}
311
+ alternatives = [DistributionTransformationType.BOX_COX, DistributionTransformationType.SQRT_TRANSFORM]
312
+ else:
313
+ recommended = DistributionTransformationType.YERO_JOHNSON
314
+ reason = f"High skewness ({analysis.skewness:.2f}) with non-positive values"
315
+ priority = "high"
316
+ parameters = {}
317
+ alternatives = [DistributionTransformationType.BOX_COX]
318
+
319
+ elif analysis.is_moderately_skewed:
320
+ # Moderately skewed
321
+ if analysis.min_value >= 0:
322
+ recommended = DistributionTransformationType.SQRT_TRANSFORM
323
+ reason = f"Moderate skewness ({analysis.skewness:.2f})"
324
+ priority = "medium"
325
+ parameters = {}
326
+ alternatives = [DistributionTransformationType.LOG_TRANSFORM]
327
+ else:
328
+ recommended = DistributionTransformationType.YERO_JOHNSON
329
+ reason = f"Moderate skewness ({analysis.skewness:.2f}) with negative values"
330
+ priority = "medium"
331
+ parameters = {}
332
+ alternatives = []
333
+
334
+ elif analysis.outlier_percentage > self.OUTLIER_THRESHOLD:
335
+ # Not skewed but has outliers
336
+ recommended = DistributionTransformationType.CAP_OUTLIERS
337
+ reason = f"Significant outliers ({analysis.outlier_percentage:.1f}%) despite low skewness"
338
+ priority = "medium"
339
+ parameters = {
340
+ "method": "iqr",
341
+ "multiplier": 1.5
342
+ }
343
+ alternatives = []
344
+ warnings.append("Consider investigating outlier causes before capping")
345
+
346
+ else:
347
+ # Distribution is relatively normal
348
+ recommended = DistributionTransformationType.NONE
349
+ reason = f"Distribution is approximately normal (skewness: {analysis.skewness:.2f})"
350
+ priority = "low"
351
+ parameters = {}
352
+ alternatives = []
353
+
354
+ return TransformationRecommendation(
355
+ column_name=analysis.column_name,
356
+ recommended_transform=recommended,
357
+ reason=reason,
358
+ priority=priority,
359
+ parameters=parameters,
360
+ alternative_transforms=alternatives,
361
+ warnings=warnings
362
+ )
363
+
364
def analyze_dataframe(
    self,
    df: pd.DataFrame,
    numeric_columns: Optional[List[str]] = None
) -> Dict[str, DistributionAnalysis]:
    """
    Run the per-column distribution analysis over a DataFrame.

    Parameters
    ----------
    df : DataFrame
        Data whose columns are analyzed.
    numeric_columns : List[str], optional
        Names of the columns to analyze. When None, every column whose
        dtype is selected by ``np.number`` is used.

    Returns
    -------
    Dict[str, DistributionAnalysis]
        One ``analyze_distribution`` result per analyzed column, keyed by
        column name. Requested names that are not columns of ``df`` are
        left out of the result rather than raising.
    """
    selected = (
        df.select_dtypes(include=[np.number]).columns.tolist()
        if numeric_columns is None
        else numeric_columns
    )

    # The membership filter matters for caller-supplied names; columns
    # discovered from ``df`` itself always pass it.
    return {
        name: self.analyze_distribution(df[name], name)
        for name in selected
        if name in df.columns
    }
393
+
394
def get_all_recommendations(
    self,
    df: pd.DataFrame,
    numeric_columns: Optional[List[str]] = None
) -> List[TransformationRecommendation]:
    """
    Collect the actionable transformation recommendations for a DataFrame.

    Parameters
    ----------
    df : DataFrame
        Data to analyze.
    numeric_columns : List[str], optional
        Columns to analyze; forwarded unchanged to ``analyze_dataframe``.

    Returns
    -------
    List[TransformationRecommendation]
        Recommendations whose transform is not
        ``DistributionTransformationType.NONE``, ordered "high", then
        "medium", then "low". Any other priority value sorts last.
        Recommendations with equal priority keep the order in which their
        columns were analyzed.
    """
    priority_rank = {"high": 0, "medium": 1, "low": 2}

    candidates = (
        self.recommend_transformation(distribution)
        for distribution in self.analyze_dataframe(df, numeric_columns).values()
    )
    actionable = [
        rec
        for rec in candidates
        if rec.recommended_transform != DistributionTransformationType.NONE
    ]

    # sorted() is stable, so ties stay in column-analysis order.
    return sorted(actionable, key=lambda rec: priority_rank.get(rec.priority, 3))
427
+
428
def generate_report(
    self,
    df: pd.DataFrame,
    numeric_columns: Optional[List[str]] = None
) -> Dict[str, Any]:
    """
    Assemble a dictionary report of distribution analyses and recommendations.

    Parameters
    ----------
    df : DataFrame
        Data to analyze.
    numeric_columns : List[str], optional
        Columns to analyze; forwarded unchanged to ``analyze_dataframe``
        and ``get_all_recommendations``.

    Returns
    -------
    Dict[str, Any]
        A dictionary with four keys:

        - ``"summary"``: column counts per category.
        - ``"categories"``: the column names behind those counts.
        - ``"analyses"``: ``to_dict()`` of every analysis, keyed by column.
        - ``"recommendations"``: ``to_dict()`` of every recommendation.

    Notes
    -----
    Each column lands in exactly one of the highly skewed, moderately
    skewed, or approximately normal lists. Zero inflation is tracked
    independently, so a column can also appear in ``"zero_inflated"``.

    ``get_all_recommendations`` performs its own ``analyze_dataframe``
    call, so the columns are analyzed twice per report.
    """
    per_column = self.analyze_dataframe(df, numeric_columns)
    suggested = self.get_all_recommendations(df, numeric_columns)

    strongly_skewed: List[str] = []
    mildly_skewed: List[str] = []
    near_normal: List[str] = []
    zero_heavy: List[str] = []

    for name, distribution in per_column.items():
        # Zero inflation is orthogonal to the skewness bucket.
        if distribution.has_zero_inflation:
            zero_heavy.append(name)

        # High skew takes precedence over moderate skew.
        if distribution.is_highly_skewed:
            bucket = strongly_skewed
        elif distribution.is_moderately_skewed:
            bucket = mildly_skewed
        else:
            bucket = near_normal
        bucket.append(name)

    summary = {
        "total_columns": len(per_column),
        "highly_skewed_count": len(strongly_skewed),
        "moderately_skewed_count": len(mildly_skewed),
        "normal_count": len(near_normal),
        "zero_inflated_count": len(zero_heavy),
    }
    categories = {
        "highly_skewed": strongly_skewed,
        "moderately_skewed": mildly_skewed,
        "approximately_normal": near_normal,
        "zero_inflated": zero_heavy,
    }

    return {
        "summary": summary,
        "categories": categories,
        "analyses": {
            name: distribution.to_dict()
            for name, distribution in per_column.items()
        },
        "recommendations": [rec.to_dict() for rec in suggested],
    }