churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,2619 @@
1
+ from datetime import datetime
2
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional
3
+
4
+ import numpy as np
5
+ import plotly.express as px
6
+ import plotly.graph_objects as go
7
+
8
+ from customer_retention.core.compat import DataFrame, Series, ensure_pandas_series, to_pandas
9
+
10
+ from .number_formatter import NumberFormatter
11
+
12
+ if TYPE_CHECKING:
13
+ from customer_retention.stages.profiling.segment_analyzer import SegmentationResult
14
+ from customer_retention.stages.profiling.temporal_analyzer import TemporalAnalysis
15
+ from customer_retention.stages.temporal.cutoff_analyzer import CutoffAnalysis
16
+
17
+
18
+ class ChartBuilder:
19
+ DOW_NAMES = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
20
+
21
+ def __init__(self, theme: str = "plotly_white"):
22
+ self.theme = theme
23
+ self.colors = {
24
+ "primary": "#1f77b4",
25
+ "secondary": "#ff7f0e",
26
+ "success": "#2ca02c",
27
+ "warning": "#ffbb00",
28
+ "danger": "#d62728",
29
+ "info": "#17becf"
30
+ }
31
+
32
+ def _get_quality_colors(self, values: List[float], high: float = 80, mid: float = 60) -> List[str]:
33
+ return [
34
+ self.colors["success"] if v > high else self.colors["warning"] if v > mid else self.colors["danger"]
35
+ for v in values
36
+ ]
37
+
38
+ def _get_iv_colors(self, iv_values: List[float]) -> List[str]:
39
+ return [
40
+ self.colors["danger"] if iv > 0.5 else
41
+ self.colors["success"] if iv > 0.3 else
42
+ self.colors["warning"] if iv > 0.1 else
43
+ self.colors["primary"]
44
+ for iv in iv_values
45
+ ]
46
+
47
+ def _get_ks_colors(self, ks_values: List[float]) -> List[str]:
48
+ return [
49
+ self.colors["success"] if ks > 0.4 else
50
+ self.colors["warning"] if ks > 0.2 else
51
+ self.colors["primary"]
52
+ for ks in ks_values
53
+ ]
54
+
55
+ def bar_chart(self, x: List[Any], y: List[Any], title: Optional[str] = None,
56
+ x_label: Optional[str] = None, y_label: Optional[str] = None,
57
+ horizontal: bool = False, color: Optional[str] = None) -> go.Figure:
58
+ marker_color = color or self.colors["primary"]
59
+ if horizontal:
60
+ fig = go.Figure(go.Bar(y=x, x=y, orientation="h", marker_color=marker_color))
61
+ else:
62
+ fig = go.Figure(go.Bar(x=x, y=y, marker_color=marker_color))
63
+ fig.update_layout(
64
+ title=title,
65
+ xaxis_title=x_label,
66
+ yaxis_title=y_label,
67
+ template=self.theme
68
+ )
69
+ return fig
70
+
71
+ def column_type_distribution(self, type_counts: Dict[str, int]) -> go.Figure:
72
+ if not type_counts:
73
+ return go.Figure()
74
+ fig = px.pie(
75
+ values=list(type_counts.values()),
76
+ names=list(type_counts.keys()),
77
+ title="Column Type Distribution",
78
+ hole=0.4
79
+ )
80
+ fig.update_layout(template=self.theme)
81
+ return fig
82
+
83
+ def data_quality_scorecard(self, quality_scores: Dict[str, float]) -> go.Figure:
84
+ columns = list(quality_scores.keys())
85
+ scores = list(quality_scores.values())
86
+ fig = go.Figure(go.Bar(y=columns, x=scores, orientation="h", marker_color=self._get_quality_colors(scores)))
87
+ fig.update_layout(
88
+ title="Data Quality Scores by Column",
89
+ xaxis_title="Quality Score (0-100)",
90
+ template=self.theme,
91
+ height=max(400, len(columns) * 25)
92
+ )
93
+ return fig
94
+
95
+ def missing_value_bars(self, null_percentages: Dict[str, float]) -> go.Figure:
96
+ columns = list(null_percentages.keys())
97
+ pcts = list(null_percentages.values())
98
+ colors = [self.colors["danger"] if p > 20 else self.colors["warning"] if p > 5 else self.colors["success"] for p in pcts]
99
+ fig = go.Figure(go.Bar(x=columns, y=pcts, marker_color=colors))
100
+ fig.update_layout(title="Missing Values by Column", yaxis_title="Missing %", template=self.theme)
101
+ return fig
102
+
103
+ def histogram_with_stats(self, series: Series, title: Optional[str] = None) -> go.Figure:
104
+ series = ensure_pandas_series(series)
105
+ clean = series.dropna()
106
+ mean_val = clean.mean()
107
+ median_val = clean.median()
108
+ fig = go.Figure()
109
+ fig.add_trace(go.Histogram(x=clean, nbinsx=30, name="Distribution"))
110
+ fig.add_vline(x=mean_val, line_dash="dash", line_color=self.colors["primary"], annotation_text=f"Mean: {mean_val:.2f}")
111
+ fig.add_vline(x=median_val, line_dash="dot", line_color=self.colors["secondary"], annotation_text=f"Median: {median_val:.2f}")
112
+ fig.update_layout(
113
+ title=title or f"Distribution of {series.name}",
114
+ xaxis_title=series.name,
115
+ yaxis_title="Count",
116
+ template=self.theme
117
+ )
118
+ return fig
119
+
120
+ def box_plot(self, series: Series, title: Optional[str] = None) -> go.Figure:
121
+ series = ensure_pandas_series(series)
122
+ fig = px.box(y=series.dropna(), title=title or f"Box Plot: {series.name}")
123
+ fig.update_layout(template=self.theme)
124
+ return fig
125
+
126
+ def outlier_visualization(self, series: Series, method: str = "iqr") -> go.Figure:
127
+ series = ensure_pandas_series(series)
128
+ clean = series.dropna().reset_index(drop=True)
129
+ q1, q3 = clean.quantile(0.25), clean.quantile(0.75)
130
+ iqr = q3 - q1
131
+ lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
132
+ is_outlier = (clean < lower) | (clean > upper)
133
+ fig = go.Figure()
134
+ fig.add_trace(go.Scatter(x=clean[~is_outlier].index, y=clean[~is_outlier], mode="markers", name="Normal", marker_color=self.colors["primary"]))
135
+ fig.add_trace(go.Scatter(x=clean[is_outlier].index, y=clean[is_outlier], mode="markers", name="Outliers", marker_color=self.colors["danger"]))
136
+ fig.add_hline(y=upper, line_dash="dash", line_color="gray", annotation_text="Upper Bound")
137
+ fig.add_hline(y=lower, line_dash="dash", line_color="gray", annotation_text="Lower Bound")
138
+ fig.update_layout(title=f"Outlier Detection: {series.name}", template=self.theme)
139
+ return fig
140
+
141
+ def category_bar_chart(self, series: Series, top_n: int = 20) -> go.Figure:
142
+ series = ensure_pandas_series(series)
143
+ value_counts = series.value_counts().head(top_n)
144
+ fig = go.Figure(go.Bar(x=value_counts.index.astype(str), y=value_counts.values, marker_color=self.colors["primary"]))
145
+ fig.update_layout(
146
+ title=f"Top {top_n} Categories: {series.name}",
147
+ xaxis_title="Category",
148
+ yaxis_title="Count",
149
+ template=self.theme
150
+ )
151
+ return fig
152
+
153
+ def correlation_heatmap(self, df: DataFrame, method: str = "pearson") -> go.Figure:
154
+ df = to_pandas(df)
155
+ corr = df.corr(method=method)
156
+ fig = go.Figure(go.Heatmap(z=corr.values, x=corr.columns, y=corr.columns, colorscale="RdBu", zmid=0))
157
+ fig.update_layout(
158
+ title=f"Correlation Matrix ({method})",
159
+ template=self.theme,
160
+ height=max(400, len(corr.columns) * 25)
161
+ )
162
+ return fig
163
+
164
+ def target_correlation_bars(self, correlations: Dict[str, float], target_name: str) -> go.Figure:
165
+ cols = list(correlations.keys())
166
+ vals = list(correlations.values())
167
+ colors = [self.colors["success"] if v > 0 else self.colors["danger"] for v in vals]
168
+ fig = go.Figure(go.Bar(y=cols, x=vals, orientation="h", marker_color=colors))
169
+ fig.update_layout(
170
+ title=f"Correlation with Target: {target_name}",
171
+ xaxis_title="Correlation",
172
+ template=self.theme,
173
+ height=max(400, len(cols) * 25)
174
+ )
175
+ return fig
176
+
177
+ def roc_curve(self, fpr, tpr, auc_score: float) -> go.Figure:
178
+ fig = go.Figure()
179
+ fig.add_trace(go.Scatter(x=fpr, y=tpr, mode="lines", name=f"ROC (AUC={auc_score:.3f})"))
180
+ fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode="lines", line_dash="dash", name="Random"))
181
+ fig.update_layout(
182
+ title="ROC Curve",
183
+ xaxis_title="False Positive Rate",
184
+ yaxis_title="True Positive Rate",
185
+ template=self.theme
186
+ )
187
+ return fig
188
+
189
+ def precision_recall_curve(
190
+ self,
191
+ precision,
192
+ recall,
193
+ pr_auc: float,
194
+ baseline: Optional[float] = None,
195
+ title: Optional[str] = None,
196
+ ) -> go.Figure:
197
+ fig = go.Figure()
198
+ fig.add_trace(go.Scatter(
199
+ x=recall, y=precision,
200
+ mode="lines",
201
+ name=f"PR (AUC={pr_auc:.3f})",
202
+ line={"color": self.colors["primary"], "width": 2}
203
+ ))
204
+
205
+ if baseline is not None:
206
+ fig.add_hline(
207
+ y=baseline,
208
+ line_dash="dash",
209
+ line_color="gray",
210
+ annotation_text=f"Baseline: {baseline:.2f}",
211
+ annotation_position="right"
212
+ )
213
+
214
+ fig.update_layout(
215
+ title=title or "Precision-Recall Curve",
216
+ xaxis_title="Recall",
217
+ yaxis_title="Precision",
218
+ xaxis_range=[0, 1],
219
+ yaxis_range=[0, 1.05],
220
+ template=self.theme
221
+ )
222
+ return fig
223
+
224
    def model_comparison_grid(self, model_results: Dict[str, Dict[str, Any]], y_test: Any,
                              class_labels: Optional[List[str]] = None, title: Optional[str] = None) -> go.Figure:
        """Build a 3-row comparison grid (confusion matrix / ROC / PR), one column per model.

        Args:
            model_results: mapping of model name -> {"y_pred": ..., "y_pred_proba": ...}.
            y_test: true labels shared by every model's predictions.
            class_labels: confusion-matrix axis labels; defaults to ["0", "1"].
            title: figure title; defaults to "Model Comparison".
        """
        from plotly.subplots import make_subplots
        model_names, n_models = list(model_results.keys()), len(model_results)
        class_labels = class_labels or ["0", "1"]
        # Row-major title layout: a full row of model names per chart type.
        subplot_titles = [f"{name[:15]}<br>{row}" for row in ["Confusion Matrix", "ROC Curve", "Precision-Recall"] for name in model_names]
        fig = make_subplots(rows=3, cols=n_models, subplot_titles=subplot_titles, vertical_spacing=0.12, horizontal_spacing=0.08,
                            specs=[[{"type": "heatmap"} for _ in range(n_models)], [{"type": "xy"} for _ in range(n_models)], [{"type": "xy"} for _ in range(n_models)]])
        model_colors = [self.colors["primary"], self.colors["secondary"], self.colors["success"], self.colors["info"], self.colors["warning"]]
        # Positive-class prevalence, used as the no-skill baseline in the PR row.
        # NOTE(review): assumes y_test is binary 0/1 — confirm with callers.
        baseline = np.mean(y_test)
        for i, model_name in enumerate(model_names):
            col, color = i + 1, model_colors[i % len(model_colors)]
            y_pred, y_pred_proba = model_results[model_name]["y_pred"], model_results[model_name]["y_pred_proba"]
            self._add_confusion_matrix_to_grid(fig, y_test, y_pred, class_labels, col)
            self._add_roc_curve_to_grid(fig, y_test, y_pred_proba, color, col, n_models)
            self._add_pr_curve_to_grid(fig, y_test, y_pred_proba, color, col, n_models, baseline)
        self._update_comparison_grid_axes(fig, n_models)
        # Fixed per-cell sizing: 300px per row, 350px per model column.
        fig.update_layout(title=title or "Model Comparison", height=300 * 3 + 100, width=350 * n_models + 50, template=self.theme, showlegend=False)
        return fig
243
+
244
+ def _add_confusion_matrix_to_grid(self, fig: go.Figure, y_test: Any, y_pred: Any, class_labels: List[str], col: int) -> None:
245
+ from sklearn.metrics import confusion_matrix
246
+ cm = confusion_matrix(y_test, y_pred)
247
+ cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
248
+ cm_text = [[f"{cm[i][j]}<br>({cm_normalized[i][j]:.0%})" for j in range(len(class_labels))] for i in range(len(class_labels))]
249
+ fig.add_trace(go.Heatmap(z=cm, x=class_labels, y=class_labels, colorscale="Blues", text=cm_text, texttemplate="%{text}",
250
+ textfont={"size": 11}, showscale=False, hovertemplate="Actual: %{y}<br>Predicted: %{x}<br>Count: %{z}<extra></extra>"), row=1, col=col)
251
+
252
    def _add_roc_curve_to_grid(self, fig: go.Figure, y_test: Any, y_pred_proba: Any, color: str, col: int, n_models: int) -> None:
        """Add a ROC curve, chance diagonal, and AUC annotation at row 2, column *col*.

        The AUC annotation must be anchored to its own subplot axes, so the axis
        reference is derived from plotly's row-major axis numbering: row 1 owns
        axes x1..x{n_models}, hence the row-2 subplot in column c uses axis
        x{c + n_models}.
        """
        from sklearn.metrics import roc_auc_score, roc_curve
        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
        auc = roc_auc_score(y_test, y_pred_proba)
        fig.add_trace(go.Scatter(x=fpr, y=tpr, mode="lines", line={"color": color, "width": 2}, name=f"AUC={auc:.3f}", showlegend=False,
                                 hovertemplate="FPR: %{x:.2f}<br>TPR: %{y:.2f}<extra></extra>"), row=2, col=col)
        # Dashed gray diagonal = random-classifier reference.
        fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode="lines", line={"color": "gray", "width": 1, "dash": "dash"}, showlegend=False, hoverinfo="skip"), row=2, col=col)
        # col > 1: axis index col + n_models; col == 1 with several models: n_models + 1;
        # single-model grid: row-2 axes are simply "x2"/"y2".
        xref = f"x{col + n_models}" if col > 1 else "x" + str(n_models + 1) if n_models > 1 else "x2"
        yref = f"y{col + n_models}" if col > 1 else "y" + str(n_models + 1) if n_models > 1 else "y2"
        fig.add_annotation(x=0.95, y=0.05, xref=xref, yref=yref, text=f"AUC={auc:.3f}", showarrow=False, font={"size": 11, "color": color}, bgcolor="rgba(255,255,255,0.8)", xanchor="right")
262
+
263
    def _add_pr_curve_to_grid(self, fig: go.Figure, y_test: Any, y_pred_proba: Any, color: str, col: int, n_models: int, baseline: float) -> None:
        """Add a precision-recall curve, baseline line, and PR-AUC annotation at row 3, column *col*.

        *baseline* is the no-skill precision (positive-class rate) drawn as a
        dashed horizontal reference.
        """
        from sklearn.metrics import average_precision_score, precision_recall_curve
        precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
        pr_auc = average_precision_score(y_test, y_pred_proba)
        fig.add_trace(go.Scatter(x=recall, y=precision, mode="lines", line={"color": color, "width": 2}, name=f"PR-AUC={pr_auc:.3f}", showlegend=False,
                                 hovertemplate="Recall: %{x:.2f}<br>Precision: %{y:.2f}<extra></extra>"), row=3, col=col)
        fig.add_trace(go.Scatter(x=[0, 1], y=[baseline, baseline], mode="lines", line={"color": "gray", "width": 1, "dash": "dash"}, showlegend=False, hoverinfo="skip"), row=3, col=col)
        # Row 3 sits after two full rows of axes, so its subplot in column c
        # uses axis index c + 2 * n_models (plotly numbers axes row-major, and
        # the very first axis is written "x"/"y" with no suffix).
        pr_row_offset = 2 * n_models
        xref = f"x{col + pr_row_offset}" if col + pr_row_offset > 1 else "x"
        yref = f"y{col + pr_row_offset}" if col + pr_row_offset > 1 else "y"
        fig.add_annotation(x=0.05, y=0.05, xref=xref, yref=yref, text=f"PR-AUC={pr_auc:.3f}", showarrow=False, font={"size": 11, "color": color}, bgcolor="rgba(255,255,255,0.8)", xanchor="left")
274
+
275
+ def _update_comparison_grid_axes(self, fig: go.Figure, n_models: int) -> None:
276
+ for i in range(n_models):
277
+ col = i + 1
278
+ fig.update_xaxes(title_text="Predicted", row=1, col=col)
279
+ fig.update_yaxes(title_text="Actual", row=1, col=col)
280
+ fig.update_xaxes(title_text="FPR", row=2, col=col, range=[0, 1])
281
+ fig.update_yaxes(title_text="TPR", row=2, col=col, range=[0, 1.02])
282
+ fig.update_xaxes(title_text="Recall", row=3, col=col, range=[0, 1])
283
+ fig.update_yaxes(title_text="Precision", row=3, col=col, range=[0, 1.05])
284
+
285
+ def confusion_matrix_heatmap(self, cm, labels: Optional[List[str]] = None) -> go.Figure:
286
+ cm_array = np.array(cm)
287
+ if labels is None:
288
+ labels = [str(i) for i in range(len(cm_array))]
289
+ fig = go.Figure(go.Heatmap(
290
+ z=cm_array,
291
+ x=labels,
292
+ y=labels,
293
+ colorscale="Blues",
294
+ text=cm_array,
295
+ texttemplate="%{text}"
296
+ ))
297
+ fig.update_layout(
298
+ title="Confusion Matrix",
299
+ xaxis_title="Predicted",
300
+ yaxis_title="Actual",
301
+ template=self.theme
302
+ )
303
+ return fig
304
+
305
+ def feature_importance_plot(self, importance_df: DataFrame) -> go.Figure:
306
+ importance_df = to_pandas(importance_df)
307
+ fig = go.Figure(go.Bar(
308
+ y=importance_df["feature"],
309
+ x=importance_df["importance"],
310
+ orientation="h",
311
+ marker_color=self.colors["primary"]
312
+ ))
313
+ fig.update_layout(
314
+ title="Feature Importance",
315
+ xaxis_title="Importance",
316
+ template=self.theme,
317
+ height=max(400, len(importance_df) * 25)
318
+ )
319
+ return fig
320
+
321
+ def lift_curve(self, percentiles, lift_values) -> go.Figure:
322
+ fig = go.Figure()
323
+ fig.add_trace(go.Scatter(x=percentiles, y=lift_values, mode="lines+markers", name="Model Lift"))
324
+ fig.add_hline(y=1, line_dash="dash", line_color="gray", annotation_text="Baseline")
325
+ fig.update_layout(
326
+ title="Lift Curve",
327
+ xaxis_title="Percentile",
328
+ yaxis_title="Lift",
329
+ template=self.theme
330
+ )
331
+ return fig
332
+
333
+ def time_series_plot(self, df: DataFrame, date_col: str, value_col: str) -> go.Figure:
334
+ df = to_pandas(df)
335
+ fig = px.line(df, x=date_col, y=value_col)
336
+ fig.update_layout(title=f"{value_col} over Time", template=self.theme)
337
+ return fig
338
+
339
+ def cohort_retention_heatmap(self, retention_matrix: DataFrame) -> go.Figure:
340
+ retention_matrix = to_pandas(retention_matrix)
341
+ fig = go.Figure(go.Heatmap(
342
+ z=retention_matrix.values,
343
+ x=retention_matrix.columns,
344
+ y=retention_matrix.index,
345
+ colorscale="Greens",
346
+ text=np.round(retention_matrix.values, 2),
347
+ texttemplate="%{text:.0%}"
348
+ ))
349
+ fig.update_layout(
350
+ title="Cohort Retention",
351
+ xaxis_title="Months Since Start",
352
+ yaxis_title="Cohort",
353
+ template=self.theme
354
+ )
355
+ return fig
356
+
357
+ def histogram(self, series: Series, title: Optional[str] = None, nbins: int = 30) -> go.Figure:
358
+ series = ensure_pandas_series(series)
359
+ fig = go.Figure(go.Histogram(x=series.dropna(), nbinsx=nbins, marker_color=self.colors["primary"]))
360
+ fig.update_layout(
361
+ title=title or f"Distribution of {series.name}",
362
+ xaxis_title=series.name,
363
+ yaxis_title="Count",
364
+ template=self.theme
365
+ )
366
+ return fig
367
+
368
+ def heatmap(self, z: Any, x_labels: List[str], y_labels: List[str],
369
+ title: Optional[str] = None, colorscale: str = "RdBu") -> go.Figure:
370
+ z_array = np.array(z) if not isinstance(z, np.ndarray) else z
371
+ fig = go.Figure(go.Heatmap(
372
+ z=z_array, x=x_labels, y=y_labels,
373
+ colorscale=colorscale, zmid=0 if colorscale == "RdBu" else None
374
+ ))
375
+ fig.update_layout(
376
+ title=title,
377
+ template=self.theme,
378
+ height=max(400, len(y_labels) * 25)
379
+ )
380
+ return fig
381
+
382
+ def scatter_matrix(
383
+ self,
384
+ df: DataFrame,
385
+ title: Optional[str] = None,
386
+ height: Optional[int] = None,
387
+ width: Optional[int] = None,
388
+ color_column: Optional[Series] = None,
389
+ color_map: Optional[Dict[str, str]] = None,
390
+ ) -> go.Figure:
391
+ df = to_pandas(df)
392
+ n_cols = len(df.columns)
393
+ auto_height = max(500, n_cols * 150)
394
+
395
+ if color_column is not None:
396
+ plot_df = df.copy()
397
+ plot_df["_color_"] = ensure_pandas_series(color_column).values
398
+ default_colors = {"Retained": "#2ECC71", "Churned": "#E74C3C"}
399
+ colors = color_map or default_colors
400
+ fig = px.scatter_matrix(
401
+ plot_df, dimensions=df.columns.tolist(), color="_color_",
402
+ title=title, color_discrete_map=colors
403
+ )
404
+ fig.update_traces(marker=dict(opacity=0.6, size=5))
405
+ else:
406
+ fig = px.scatter_matrix(df, title=title)
407
+
408
+ fig.update_layout(template=self.theme, height=height or auto_height, autosize=True)
409
+ if width:
410
+ fig.update_layout(width=width)
411
+ fig.update_traces(diagonal_visible=False, showupperhalf=False)
412
+ return fig
413
+
414
    def multi_line_chart(self, data: List[Dict[str, Any]], x_key: str, y_key: str,
                         name_key: str, title: Optional[str] = None,
                         x_title: Optional[str] = None, y_title: Optional[str] = None) -> go.Figure:
        """Overlay one line per entry in *data*, plus a fixed y=x reference diagonal.

        Each dict supplies its x values (series[x_key]), y values (series[y_key])
        and legend label (series[name_key]).

        NOTE(review): the dashed "Random" diagonal from (0,0) to (1,1) is added
        unconditionally — sensible for ROC-style overlays on the unit square,
        but misleading for data on other scales. Confirm all callers expect it.
        """
        fig = go.Figure()
        for series in data:
            fig.add_trace(go.Scatter(
                x=series[x_key], y=series[y_key],
                mode="lines", name=series[name_key]
            ))
        fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode="lines", line_dash="dash",
                                 line_color="gray", name="Random"))
        fig.update_layout(title=title, xaxis_title=x_title, yaxis_title=y_title, template=self.theme)
        return fig
427
+
428
+ def temporal_distribution(
429
+ self,
430
+ analysis: "TemporalAnalysis",
431
+ title: Optional[str] = None,
432
+ chart_type: str = "bar",
433
+ ) -> go.Figure:
434
+ period_counts = analysis.period_counts
435
+ if period_counts.empty:
436
+ fig = go.Figure()
437
+ fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
438
+ return fig
439
+
440
+ x_values = period_counts["period"].astype(str)
441
+ y_values = period_counts["count"]
442
+
443
+ fig = go.Figure()
444
+ if chart_type == "line":
445
+ fig.add_trace(go.Scatter(
446
+ x=x_values, y=y_values,
447
+ mode="lines+markers",
448
+ line={"color": self.colors["primary"], "width": 2},
449
+ marker={"size": 6},
450
+ name="Record Count"
451
+ ))
452
+ else:
453
+ fig.add_trace(go.Bar(
454
+ x=x_values, y=y_values,
455
+ marker_color=self.colors["primary"],
456
+ name="Record Count"
457
+ ))
458
+
459
+ mean_count = y_values.mean()
460
+ fig.add_hline(
461
+ y=mean_count,
462
+ line_dash="dash",
463
+ line_color=self.colors["secondary"],
464
+ annotation_text=f"Avg: {mean_count:.0f}",
465
+ annotation_position="top right"
466
+ )
467
+
468
+ granularity_label = analysis.granularity.value.capitalize()
469
+ default_title = f"Records by {granularity_label}"
470
+ fig.update_layout(
471
+ title=title or default_title,
472
+ xaxis_title=granularity_label,
473
+ yaxis_title="Count",
474
+ template=self.theme,
475
+ xaxis_tickangle=-45 if len(x_values) > 12 else 0
476
+ )
477
+ return fig
478
+
479
+ def temporal_trend(
480
+ self,
481
+ analysis: "TemporalAnalysis",
482
+ title: Optional[str] = None,
483
+ show_trend: bool = True,
484
+ ) -> go.Figure:
485
+ period_counts = analysis.period_counts
486
+ if period_counts.empty:
487
+ fig = go.Figure()
488
+ fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
489
+ return fig
490
+
491
+ x_values = list(range(len(period_counts)))
492
+ x_labels = period_counts["period"].astype(str)
493
+ y_values = period_counts["count"].values
494
+
495
+ fig = go.Figure()
496
+ fig.add_trace(go.Scatter(
497
+ x=x_labels, y=y_values,
498
+ mode="lines+markers",
499
+ line={"color": self.colors["primary"], "width": 2},
500
+ marker={"size": 8},
501
+ name="Actual"
502
+ ))
503
+
504
+ if show_trend and len(x_values) >= 2:
505
+ z = np.polyfit(x_values, y_values, 1)
506
+ trend_line = np.poly1d(z)(x_values)
507
+ slope_pct = ((trend_line[-1] - trend_line[0]) / trend_line[0] * 100) if trend_line[0] != 0 else 0
508
+ trend_direction = "increasing" if z[0] > 0 else "decreasing"
509
+ trend_color = self.colors["success"] if z[0] > 0 else self.colors["danger"]
510
+
511
+ fig.add_trace(go.Scatter(
512
+ x=x_labels, y=trend_line,
513
+ mode="lines",
514
+ line={"color": trend_color, "width": 2, "dash": "dash"},
515
+ name=f"Trend ({trend_direction}, {abs(slope_pct):.1f}%)"
516
+ ))
517
+
518
+ granularity_label = analysis.granularity.value.capitalize()
519
+ default_title = f"Temporal Trend by {granularity_label}"
520
+ fig.update_layout(
521
+ title=title or default_title,
522
+ xaxis_title=granularity_label,
523
+ yaxis_title="Count",
524
+ template=self.theme,
525
+ xaxis_tickangle=-45 if len(x_labels) > 12 else 0,
526
+ showlegend=True
527
+ )
528
+ return fig
529
+
530
+ def temporal_heatmap(
531
+ self,
532
+ dates: Series,
533
+ title: Optional[str] = None,
534
+ ) -> go.Figure:
535
+ import pandas as pd
536
+ dates = ensure_pandas_series(dates)
537
+ parsed = pd.to_datetime(dates, errors="coerce").dropna()
538
+
539
+ if len(parsed) == 0:
540
+ fig = go.Figure()
541
+ fig.add_annotation(text="No valid dates", x=0.5, y=0.5, showarrow=False)
542
+ return fig
543
+
544
+ counts = parsed.dt.dayofweek.value_counts().reindex(range(7), fill_value=0)
545
+
546
+ fig = go.Figure(go.Bar(
547
+ x=self.DOW_NAMES,
548
+ y=counts.values,
549
+ marker_color=[self.colors["info"] if i < 5 else self.colors["warning"] for i in range(7)]
550
+ ))
551
+
552
+ fig.update_layout(
553
+ title=title or "Records by Day of Week",
554
+ xaxis_title="Day of Week",
555
+ yaxis_title="Count",
556
+ template=self.theme
557
+ )
558
+ return fig
559
+
560
+ def year_month_heatmap(
561
+ self,
562
+ pivot_df: "DataFrame",
563
+ title: Optional[str] = None,
564
+ ) -> go.Figure:
565
+ pivot_df = to_pandas(pivot_df)
566
+ if pivot_df.empty:
567
+ fig = go.Figure()
568
+ fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
569
+ return fig
570
+
571
+ fig = go.Figure(go.Heatmap(
572
+ z=pivot_df.values,
573
+ x=pivot_df.columns.tolist(),
574
+ y=pivot_df.index.astype(str).tolist(),
575
+ colorscale="Blues",
576
+ text=pivot_df.values,
577
+ texttemplate="%{text:,}",
578
+ textfont={"size": 10},
579
+ hovertemplate="Year: %{y}<br>Month: %{x}<br>Count: %{z:,}<extra></extra>"
580
+ ))
581
+
582
+ fig.update_layout(
583
+ title=title or "Records by Year and Month",
584
+ xaxis_title="Month",
585
+ yaxis_title="Year",
586
+ template=self.theme,
587
+ height=max(300, len(pivot_df) * 40 + 100)
588
+ )
589
+ return fig
590
+
591
+ def cumulative_growth_chart(
592
+ self,
593
+ cumulative_series: Series,
594
+ title: Optional[str] = None,
595
+ ) -> go.Figure:
596
+ """Create a cumulative growth chart."""
597
+ cumulative_series = ensure_pandas_series(cumulative_series)
598
+ if len(cumulative_series) == 0:
599
+ fig = go.Figure()
600
+ fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
601
+ return fig
602
+
603
+ fig = go.Figure()
604
+ fig.add_trace(go.Scatter(
605
+ x=[str(p) for p in cumulative_series.index],
606
+ y=cumulative_series.values,
607
+ mode="lines+markers",
608
+ fill="tozeroy",
609
+ line={"color": self.colors["primary"], "width": 2},
610
+ marker={"size": 6},
611
+ name="Cumulative Count"
612
+ ))
613
+
614
+ fig.update_layout(
615
+ title=title or "Cumulative Records Over Time",
616
+ xaxis_title="Period",
617
+ yaxis_title="Cumulative Count",
618
+ template=self.theme,
619
+ xaxis_tickangle=-45
620
+ )
621
+ return fig
622
+
623
+ def year_over_year_lines(
624
+ self,
625
+ pivot_df: "DataFrame",
626
+ title: Optional[str] = None,
627
+ ) -> go.Figure:
628
+ """Create year-over-year comparison line chart."""
629
+ pivot_df = to_pandas(pivot_df)
630
+ if pivot_df.empty:
631
+ fig = go.Figure()
632
+ fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
633
+ return fig
634
+
635
+ colors = px.colors.qualitative.Set1
636
+ fig = go.Figure()
637
+
638
+ for i, year in enumerate(pivot_df.index):
639
+ fig.add_trace(go.Scatter(
640
+ x=pivot_df.columns.tolist(),
641
+ y=pivot_df.loc[year].values,
642
+ mode="lines+markers",
643
+ name=str(year),
644
+ line={"color": colors[i % len(colors)], "width": 2},
645
+ marker={"size": 8}
646
+ ))
647
+
648
+ fig.update_layout(
649
+ title=title or "Year-over-Year Comparison",
650
+ xaxis_title="Month",
651
+ yaxis_title="Count",
652
+ template=self.theme,
653
+ showlegend=True,
654
+ legend={"title": "Year"}
655
+ )
656
+ return fig
657
+
658
    def growth_summary_indicators(
        self,
        growth_data: Dict[str, Any],
        title: Optional[str] = None,
    ) -> go.Figure:
        """Create growth summary with key indicators using compact number formatting.

        Expects *growth_data* to carry keys "has_data", "overall_growth_pct",
        "avg_monthly_growth", "trend_direction" and "trend_slope".
        NOTE(review): schema inferred from the lookups below — confirm against
        the producer of growth_data.
        """
        if not growth_data.get("has_data"):
            fig = go.Figure()
            fig.add_annotation(text="Insufficient data", x=0.5, y=0.5, showarrow=False)
            return fig

        formatter = NumberFormatter()
        fig = go.Figure()

        # Define indicator positions (x_center, label, value, unit suffix);
        # x positions are in paper coordinates (0..1).
        indicators = [
            (0.15, "Overall Growth", growth_data["overall_growth_pct"], "%"),
            (0.5, "Avg Monthly", growth_data["avg_monthly_growth"], "%/mo"),
            (0.85, f"Trend: {growth_data['trend_direction'].upper()}", growth_data["trend_slope"], "/mo"),
        ]

        for x_pos, label, value, suffix in indicators:
            # Green for non-negative, red for negative; sign is re-attached
            # manually because the value is formatted from its absolute value.
            color = self.colors["success"] if value >= 0 else self.colors["danger"]
            formatted_value = formatter.compact(abs(value))
            sign = "+" if value >= 0 else "-"
            display_text = f"{sign}{formatted_value}{suffix}"

            # Value annotation
            fig.add_annotation(
                x=x_pos, y=0.55,
                text=display_text,
                font={"size": 36, "color": color, "family": "Arial Black"},
                showarrow=False,
                xref="paper", yref="paper"
            )
            # Label annotation
            fig.add_annotation(
                x=x_pos, y=0.15,
                text=label,
                font={"size": 14, "color": "#666666"},
                showarrow=False,
                xref="paper", yref="paper"
            )

        # Axes hidden: the figure is purely annotation-based.
        fig.update_layout(
            title={"text": title or "Growth Summary", "font": {"size": 16}},
            template=self.theme,
            height=180,
            margin={"t": 60, "b": 20, "l": 20, "r": 20},
            xaxis={"visible": False},
            yaxis={"visible": False}
        )
        return fig
711
+
712
    def segment_overview(
        self,
        result: "SegmentationResult",
        title: Optional[str] = None,
    ) -> go.Figure:
        """Create overview of segments showing size and target rate.

        Draws a pie of segment sizes, plus a target-rate bar chart when any
        profile carries a target rate. Returns a placeholder figure when the
        result has no profiles.
        """
        from plotly.subplots import make_subplots

        profiles = result.profiles
        if not profiles:
            fig = go.Figure()
            fig.add_annotation(text="No segments found", x=0.5, y=0.5, showarrow=False)
            return fig

        segment_names = [f"Segment {p.segment_id}" for p in profiles]
        sizes = [p.size_pct for p in profiles]
        target_rates = [p.target_rate for p in profiles]
        # The second subplot only exists if at least one segment has a target rate.
        has_target = any(tr is not None for tr in target_rates)

        fig = make_subplots(
            rows=1, cols=2 if has_target else 1,
            specs=[[{"type": "pie"}, {"type": "bar"}]] if has_target else [[{"type": "pie"}]],
            subplot_titles=["Segment Sizes", "Target Rate by Segment"] if has_target else ["Segment Sizes"],
        )

        colors = px.colors.qualitative.Set2[:len(profiles)]
        fig.add_trace(
            go.Pie(
                labels=segment_names,
                values=sizes,
                marker_colors=colors,
                textinfo="label+percent",
                hovertemplate="<b>%{label}</b><br>Size: %{value:.1f}%<extra></extra>",
            ),
            row=1, col=1
        )

        if has_target:
            # Segments without a rate are plotted as 0 so bars stay aligned.
            target_rates_clean = [tr if tr is not None else 0 for tr in target_rates]
            fig.add_trace(
                go.Bar(
                    x=segment_names,
                    y=[tr * 100 for tr in target_rates_clean],
                    marker_color=colors,
                    text=[f"{tr*100:.1f}%" for tr in target_rates_clean],
                    textposition="outside",
                    hovertemplate="<b>%{x}</b><br>Target Rate: %{y:.1f}%<extra></extra>",
                ),
                row=1, col=2
            )
            # 30% headroom so "outside" bar labels are not clipped.
            max_rate = max(target_rates_clean) * 100
            y_max = max_rate * 1.3 if max_rate > 0 else 10
            fig.update_yaxes(title_text="Target Rate (%)", row=1, col=2, range=[0, y_max])

        fig.update_layout(
            title=title or f"Segment Overview ({result.n_segments} segments)",
            template=self.theme,
            height=400,
            showlegend=False,
        )
        return fig
773
+
774
+ def segment_feature_comparison(
775
+ self,
776
+ result: "SegmentationResult",
777
+ features: Optional[List[str]] = None,
778
+ title: Optional[str] = None,
779
+ ) -> go.Figure:
780
+ """Compare feature distributions across segments using grouped bars."""
781
+ profiles = result.profiles
782
+ if not profiles:
783
+ fig = go.Figure()
784
+ fig.add_annotation(text="No segments found", x=0.5, y=0.5, showarrow=False)
785
+ return fig
786
+
787
+ all_features = set()
788
+ for p in profiles:
789
+ all_features.update(p.defining_features.keys())
790
+
791
+ if features:
792
+ all_features = [f for f in features if f in all_features]
793
+ else:
794
+ all_features = sorted(all_features)[:8]
795
+
796
+ if not all_features:
797
+ fig = go.Figure()
798
+ fig.add_annotation(text="No features to compare", x=0.5, y=0.5, showarrow=False)
799
+ return fig
800
+
801
+ colors = px.colors.qualitative.Set2[:len(profiles)]
802
+ fig = go.Figure()
803
+
804
+ for i, profile in enumerate(profiles):
805
+ means = []
806
+ for feat in all_features:
807
+ feat_data = profile.defining_features.get(feat, {})
808
+ means.append(feat_data.get("mean", 0))
809
+
810
+ fig.add_trace(go.Bar(
811
+ name=f"Segment {profile.segment_id}",
812
+ x=list(all_features),
813
+ y=means,
814
+ marker_color=colors[i],
815
+ ))
816
+
817
+ fig.update_layout(
818
+ title=title or "Feature Comparison Across Segments",
819
+ xaxis_title="Feature",
820
+ yaxis_title="Mean Value",
821
+ barmode="group",
822
+ template=self.theme,
823
+ height=400,
824
+ legend={"title": "Segment"},
825
+ )
826
+ return fig
827
+
828
    def segment_recommendation_card(
        self,
        result: "SegmentationResult",
        title: Optional[str] = None,
    ) -> go.Figure:
        """Display segmentation recommendation with rationale.

        Renders an annotation-only "card": headline recommendation, confidence,
        key metrics, and up to four rationale bullets. Unknown recommendation
        codes fall back to the raw code in the info colour.
        """
        recommendation_colors = {
            "single_model": self.colors["success"],
            "consider_segmentation": self.colors["warning"],
            "strong_segmentation": self.colors["danger"],
        }
        recommendation_labels = {
            "single_model": "Single Model Recommended",
            "consider_segmentation": "Consider Segmentation",
            "strong_segmentation": "Segmentation Strongly Recommended",
        }

        rec_color = recommendation_colors.get(result.recommendation, self.colors["info"])
        rec_label = recommendation_labels.get(result.recommendation, result.recommendation)

        fig = go.Figure()

        # Recommendation header
        fig.add_annotation(
            x=0.5, y=0.85,
            text=rec_label,
            font={"size": 24, "color": rec_color, "family": "Arial Black"},
            showarrow=False,
            xref="paper", yref="paper"
        )

        # Confidence indicator
        fig.add_annotation(
            x=0.5, y=0.65,
            text=f"Confidence: {result.confidence*100:.0f}%",
            font={"size": 16, "color": "#666666"},
            showarrow=False,
            xref="paper", yref="paper"
        )

        # Key metrics — target-variance figure only shown when available.
        metrics_text = (
            f"Segments: {result.n_segments} | "
            f"Quality: {result.quality_score:.2f} | "
            f"Target Variance: {result.target_variance_ratio:.2f}"
            if result.target_variance_ratio is not None
            else f"Segments: {result.n_segments} | Quality: {result.quality_score:.2f}"
        )
        fig.add_annotation(
            x=0.5, y=0.48,
            text=metrics_text,
            font={"size": 14, "color": "#888888"},
            showarrow=False,
            xref="paper", yref="paper"
        )

        # Rationale — at most the first four bullets to keep the card compact.
        rationale_text = "<br>".join(f"• {r}" for r in result.rationale[:4])
        fig.add_annotation(
            x=0.5, y=0.2,
            text=rationale_text,
            font={"size": 12, "color": "#666666"},
            showarrow=False,
            xref="paper", yref="paper",
            align="center"
        )

        # Axes hidden: the figure is annotation-only.
        fig.update_layout(
            title=title or "Segmentation Recommendation",
            template=self.theme,
            height=280,
            margin={"t": 50, "b": 20, "l": 20, "r": 20},
            xaxis={"visible": False, "range": [0, 1]},
            yaxis={"visible": False, "range": [0, 1]},
        )
        return fig
904
+
905
+ # =========================================================================
906
+ # Advanced Time Series Visualizations
907
+ # =========================================================================
908
+
909
+ def sparkline(
910
+ self,
911
+ values: List[float],
912
+ title: Optional[str] = None,
913
+ show_endpoints: bool = True,
914
+ show_min_max: bool = True,
915
+ height: int = 60,
916
+ width: int = 200,
917
+ ) -> go.Figure:
918
+ """Create a compact sparkline for inline time series display.
919
+
920
+ Sparklines are small, word-sized graphics that show trends at a glance.
921
+ Ideal for dashboards and tables where space is limited.
922
+ """
923
+ x = list(range(len(values)))
924
+
925
+ fig = go.Figure()
926
+ fig.add_trace(go.Scatter(
927
+ x=x, y=values,
928
+ mode="lines",
929
+ line={"color": self.colors["primary"], "width": 1.5},
930
+ hoverinfo="y"
931
+ ))
932
+
933
+ if show_endpoints and len(values) >= 2:
934
+ fig.add_trace(go.Scatter(
935
+ x=[0, len(values) - 1],
936
+ y=[values[0], values[-1]],
937
+ mode="markers",
938
+ marker={"color": self.colors["primary"], "size": 6},
939
+ hoverinfo="y"
940
+ ))
941
+
942
+ if show_min_max and len(values) >= 2:
943
+ min_idx, max_idx = int(np.argmin(values)), int(np.argmax(values))
944
+ fig.add_trace(go.Scatter(
945
+ x=[min_idx], y=[values[min_idx]],
946
+ mode="markers",
947
+ marker={"color": self.colors["danger"], "size": 5},
948
+ hovertemplate=f"Min: {values[min_idx]:.2f}<extra></extra>"
949
+ ))
950
+ fig.add_trace(go.Scatter(
951
+ x=[max_idx], y=[values[max_idx]],
952
+ mode="markers",
953
+ marker={"color": self.colors["success"], "size": 5},
954
+ hovertemplate=f"Max: {values[max_idx]:.2f}<extra></extra>"
955
+ ))
956
+
957
+ fig.update_layout(
958
+ title={"text": title, "font": {"size": 10}} if title else None,
959
+ height=height,
960
+ width=width,
961
+ margin={"t": 20 if title else 5, "b": 5, "l": 5, "r": 5},
962
+ xaxis={"visible": False},
963
+ yaxis={"visible": False},
964
+ showlegend=False,
965
+ template=self.theme,
966
+ )
967
+ return fig
968
+
969
+ def sparkline_grid(
970
+ self,
971
+ data: Dict[str, List[float]],
972
+ columns: int = 4,
973
+ sparkline_height: int = 60,
974
+ sparkline_width: int = 180,
975
+ ) -> go.Figure:
976
+ """Create a grid of sparklines for multiple time series comparison."""
977
+ from plotly.subplots import make_subplots
978
+
979
+ names = list(data.keys())
980
+ n_rows = (len(names) + columns - 1) // columns
981
+
982
+ fig = make_subplots(
983
+ rows=n_rows, cols=columns,
984
+ subplot_titles=names,
985
+ vertical_spacing=0.15,
986
+ horizontal_spacing=0.08,
987
+ )
988
+
989
+ for i, (name, values) in enumerate(data.items()):
990
+ row, col = (i // columns) + 1, (i % columns) + 1
991
+ x = list(range(len(values)))
992
+
993
+ fig.add_trace(
994
+ go.Scatter(x=x, y=values, mode="lines",
995
+ line={"color": self.colors["primary"], "width": 1.5},
996
+ showlegend=False),
997
+ row=row, col=col
998
+ )
999
+
1000
+ if len(values) >= 2:
1001
+ trend = values[-1] - values[0]
1002
+ color = self.colors["success"] if trend >= 0 else self.colors["danger"]
1003
+ fig.add_trace(
1004
+ go.Scatter(x=[len(values) - 1], y=[values[-1]], mode="markers",
1005
+ marker={"color": color, "size": 6}, showlegend=False),
1006
+ row=row, col=col
1007
+ )
1008
+
1009
+ fig.update_xaxes(visible=False)
1010
+ fig.update_yaxes(visible=False)
1011
+ fig.update_layout(
1012
+ height=n_rows * sparkline_height + 50,
1013
+ template=self.theme,
1014
+ margin={"t": 40, "b": 20},
1015
+ )
1016
+ return fig
1017
+
1018
+ def calendar_heatmap(
1019
+ self,
1020
+ dates: Series,
1021
+ values: Optional[Series] = None,
1022
+ title: Optional[str] = None,
1023
+ colorscale: str = "Blues",
1024
+ ) -> go.Figure:
1025
+ """Create a calendar heatmap showing patterns by day-of-week and week-of-year.
1026
+
1027
+ Similar to GitHub contribution graphs. Shows temporal patterns at a glance.
1028
+ If values not provided, shows count of occurrences per day.
1029
+ """
1030
+ import pandas as pd
1031
+ dates = ensure_pandas_series(dates)
1032
+ parsed = pd.to_datetime(dates, errors="coerce")
1033
+
1034
+ if values is not None:
1035
+ values = ensure_pandas_series(values)
1036
+ df_cal = pd.DataFrame({"date": parsed, "value": values}).dropna()
1037
+ daily = df_cal.groupby(df_cal["date"].dt.date)["value"].sum()
1038
+ else:
1039
+ daily = parsed.dropna().dt.date.value_counts().sort_index()
1040
+
1041
+ if len(daily) == 0:
1042
+ fig = go.Figure()
1043
+ fig.add_annotation(text="No valid dates", x=0.5, y=0.5, showarrow=False)
1044
+ return fig
1045
+
1046
+ df_daily = pd.DataFrame({"date": pd.to_datetime(daily.index), "value": daily.values})
1047
+ df_daily["week"] = df_daily["date"].dt.isocalendar().week
1048
+ df_daily["year"] = df_daily["date"].dt.year
1049
+ df_daily["dow"] = df_daily["date"].dt.dayofweek
1050
+ df_daily["year_week"] = df_daily["year"].astype(str) + "-W" + df_daily["week"].astype(str).str.zfill(2)
1051
+
1052
+ pivot = df_daily.pivot_table(index="dow", columns="year_week", values="value", aggfunc="sum")
1053
+
1054
+ fig = go.Figure(go.Heatmap(
1055
+ z=pivot.values,
1056
+ x=pivot.columns.tolist(),
1057
+ y=[self.DOW_NAMES[i] for i in pivot.index],
1058
+ colorscale=colorscale,
1059
+ hovertemplate="Week: %{x}<br>Day: %{y}<br>Value: %{z:,.0f}<extra></extra>",
1060
+ ))
1061
+
1062
+ fig.update_layout(
1063
+ title=title or "Calendar Heatmap",
1064
+ xaxis_title="Week",
1065
+ yaxis_title="Day of Week",
1066
+ template=self.theme,
1067
+ height=250,
1068
+ xaxis={"tickangle": -45, "dtick": 4},
1069
+ )
1070
+ return fig
1071
+
1072
+ def monthly_calendar_heatmap(
1073
+ self,
1074
+ dates: Series,
1075
+ values: Optional[Series] = None,
1076
+ title: Optional[str] = None,
1077
+ ) -> go.Figure:
1078
+ """Create a month x day-of-week heatmap for pattern discovery."""
1079
+ import pandas as pd
1080
+ dates = ensure_pandas_series(dates)
1081
+ parsed = pd.to_datetime(dates, errors="coerce").dropna()
1082
+
1083
+ if values is not None:
1084
+ values = ensure_pandas_series(values)
1085
+ df_cal = pd.DataFrame({"date": parsed, "value": values}).dropna()
1086
+ df_cal["month"] = df_cal["date"].dt.month
1087
+ df_cal["dow"] = df_cal["date"].dt.dayofweek
1088
+ pivot = df_cal.pivot_table(index="dow", columns="month", values="value", aggfunc="mean")
1089
+ else:
1090
+ df_cal = pd.DataFrame({"date": parsed})
1091
+ df_cal["month"] = df_cal["date"].dt.month
1092
+ df_cal["dow"] = df_cal["date"].dt.dayofweek
1093
+ pivot = df_cal.groupby(["dow", "month"]).size().unstack(fill_value=0)
1094
+
1095
+ month_labels = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
1096
+ "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
1097
+
1098
+ fig = go.Figure(go.Heatmap(
1099
+ z=pivot.values,
1100
+ x=[month_labels[i-1] for i in pivot.columns],
1101
+ y=[self.DOW_NAMES[i] for i in pivot.index],
1102
+ colorscale="YlOrRd",
1103
+ hovertemplate="Month: %{x}<br>Day: %{y}<br>Value: %{z:,.1f}<extra></extra>",
1104
+ ))
1105
+
1106
+ fig.update_layout(
1107
+ title=title or "Activity by Month and Day of Week",
1108
+ template=self.theme,
1109
+ height=280,
1110
+ )
1111
+ return fig
1112
+
1113
+ def time_series_with_anomalies(
1114
+ self,
1115
+ dates: Series,
1116
+ values: Series,
1117
+ window: int = 7,
1118
+ n_std: float = 2.0,
1119
+ title: Optional[str] = None,
1120
+ ) -> go.Figure:
1121
+ """Create time series plot with anomaly detection bands.
1122
+
1123
+ Uses rolling mean ± n_std * rolling_std to define normal bounds.
1124
+ Points outside bounds are highlighted as anomalies.
1125
+ """
1126
+ import pandas as pd
1127
+ dates = ensure_pandas_series(dates)
1128
+ values = ensure_pandas_series(values)
1129
+
1130
+ df = pd.DataFrame({"date": pd.to_datetime(dates), "value": values}).dropna()
1131
+ df = df.sort_values("date")
1132
+
1133
+ df["rolling_mean"] = df["value"].rolling(window=window, center=True, min_periods=1).mean()
1134
+ df["rolling_std"] = df["value"].rolling(window=window, center=True, min_periods=1).std()
1135
+ df["upper"] = df["rolling_mean"] + n_std * df["rolling_std"]
1136
+ df["lower"] = df["rolling_mean"] - n_std * df["rolling_std"]
1137
+ df["is_anomaly"] = (df["value"] > df["upper"]) | (df["value"] < df["lower"])
1138
+
1139
+ anomaly_count = df["is_anomaly"].sum()
1140
+ anomaly_pct = anomaly_count / len(df) * 100
1141
+
1142
+ fig = go.Figure()
1143
+
1144
+ # Confidence band
1145
+ fig.add_trace(go.Scatter(
1146
+ x=pd.concat([df["date"], df["date"][::-1]]),
1147
+ y=pd.concat([df["upper"], df["lower"][::-1]]),
1148
+ fill="toself",
1149
+ fillcolor="rgba(31, 119, 180, 0.2)",
1150
+ line={"color": "rgba(255,255,255,0)"},
1151
+ name=f"Normal Range (±{n_std}σ)",
1152
+ hoverinfo="skip",
1153
+ ))
1154
+
1155
+ # Rolling mean
1156
+ fig.add_trace(go.Scatter(
1157
+ x=df["date"], y=df["rolling_mean"],
1158
+ mode="lines",
1159
+ line={"color": self.colors["info"], "width": 1, "dash": "dash"},
1160
+ name="Rolling Mean",
1161
+ ))
1162
+
1163
+ # Normal points
1164
+ normal = df[~df["is_anomaly"]]
1165
+ fig.add_trace(go.Scatter(
1166
+ x=normal["date"], y=normal["value"],
1167
+ mode="lines+markers",
1168
+ line={"color": self.colors["primary"], "width": 1.5},
1169
+ marker={"size": 4},
1170
+ name="Normal",
1171
+ ))
1172
+
1173
+ # Anomaly points
1174
+ anomalies = df[df["is_anomaly"]]
1175
+ if len(anomalies) > 0:
1176
+ fig.add_trace(go.Scatter(
1177
+ x=anomalies["date"], y=anomalies["value"],
1178
+ mode="markers",
1179
+ marker={"color": self.colors["danger"], "size": 10, "symbol": "x"},
1180
+ name=f"Anomalies ({anomaly_count})",
1181
+ ))
1182
+
1183
+ fig.update_layout(
1184
+ title=title or f"Time Series with Anomalies ({anomaly_pct:.1f}% anomalous)",
1185
+ xaxis_title="Date",
1186
+ yaxis_title="Value",
1187
+ template=self.theme,
1188
+ height=400,
1189
+ legend={"orientation": "h", "y": -0.15},
1190
+ )
1191
+ return fig
1192
+
1193
+ def waterfall_chart(
1194
+ self,
1195
+ categories: List[str],
1196
+ values: List[float],
1197
+ title: Optional[str] = None,
1198
+ initial_label: str = "Start",
1199
+ final_label: str = "End",
1200
+ ) -> go.Figure:
1201
+ """Create a waterfall chart showing cumulative impact.
1202
+
1203
+ Shows how sequential changes contribute to a final result.
1204
+ Useful for explaining score breakdowns or cumulative effects.
1205
+ """
1206
+ measures = ["absolute"] + ["relative"] * len(values) + ["total"]
1207
+ x_labels = [initial_label] + categories + [final_label]
1208
+
1209
+ initial_value = 0
1210
+ cumulative = initial_value
1211
+ y_values = [initial_value]
1212
+ text_values = [f"{initial_value:,.0f}"]
1213
+
1214
+ for v in values:
1215
+ y_values.append(v)
1216
+ cumulative += v
1217
+ sign = "+" if v >= 0 else ""
1218
+ text_values.append(f"{sign}{v:,.0f}")
1219
+
1220
+ y_values.append(cumulative)
1221
+ text_values.append(f"{cumulative:,.0f}")
1222
+
1223
+ colors = [self.colors["info"]] # Initial
1224
+ for v in values:
1225
+ colors.append(self.colors["success"] if v >= 0 else self.colors["danger"])
1226
+ colors.append(self.colors["primary"]) # Total
1227
+
1228
+ fig = go.Figure(go.Waterfall(
1229
+ x=x_labels,
1230
+ y=y_values,
1231
+ measure=measures,
1232
+ text=text_values,
1233
+ textposition="outside",
1234
+ connector={"line": {"color": "gray", "width": 1, "dash": "dot"}},
1235
+ increasing={"marker": {"color": self.colors["success"]}},
1236
+ decreasing={"marker": {"color": self.colors["danger"]}},
1237
+ totals={"marker": {"color": self.colors["primary"]}},
1238
+ ))
1239
+
1240
+ fig.update_layout(
1241
+ title=title or "Waterfall Chart",
1242
+ template=self.theme,
1243
+ height=400,
1244
+ showlegend=False,
1245
+ )
1246
+ return fig
1247
+
1248
+ def quality_waterfall(
1249
+ self,
1250
+ check_results: List[Dict[str, Any]],
1251
+ max_score: int = 100,
1252
+ title: Optional[str] = None,
1253
+ ) -> go.Figure:
1254
+ """Create a waterfall chart specifically for quality score breakdown.
1255
+
1256
+ Shows how each check contributes to or detracts from the total score.
1257
+
1258
+ Args:
1259
+ check_results: List of dicts with 'name', 'passed', 'weight' keys
1260
+ max_score: Maximum possible score (default 100)
1261
+ title: Chart title
1262
+ """
1263
+ categories = []
1264
+ values = []
1265
+
1266
+ for check in check_results:
1267
+ categories.append(check["name"])
1268
+ if check["passed"]:
1269
+ values.append(0) # No penalty
1270
+ else:
1271
+ penalty = -check["weight"] * (max_score / sum(c["weight"] for c in check_results))
1272
+ values.append(penalty)
1273
+
1274
+ return self.waterfall_chart(
1275
+ categories=categories,
1276
+ values=values,
1277
+ title=title or "Quality Score Breakdown",
1278
+ initial_label="Max Score",
1279
+ final_label="Final Score",
1280
+ )
1281
+
1282
+ def velocity_acceleration_chart(
1283
+ self,
1284
+ data: Dict[str, Dict[str, List[float]]],
1285
+ title: Optional[str] = None,
1286
+ ) -> go.Figure:
1287
+ """Create side-by-side Value/Velocity/Acceleration chart for cohort comparison.
1288
+
1289
+ Args:
1290
+ data: Dict with structure {column: {"retained": [...], "churned": [...], "velocity_retained": [...], ...}}
1291
+ title: Chart title
1292
+ """
1293
+ from plotly.subplots import make_subplots
1294
+
1295
+ columns = list(data.keys())
1296
+ n_cols = len(columns)
1297
+
1298
+ fig = make_subplots(
1299
+ rows=n_cols, cols=3,
1300
+ subplot_titles=[f"{col[:12]} - Value" for col in columns] +
1301
+ [f"{col[:12]} - Velocity" for col in columns] +
1302
+ [f"{col[:12]} - Accel." for col in columns],
1303
+ vertical_spacing=0.08,
1304
+ horizontal_spacing=0.08,
1305
+ )
1306
+
1307
+ for i, col in enumerate(columns):
1308
+ row = i + 1
1309
+ col_data = data[col]
1310
+
1311
+ # Value
1312
+ if "retained" in col_data:
1313
+ fig.add_trace(go.Scatter(
1314
+ y=col_data["retained"], mode="lines",
1315
+ line={"color": self.colors["success"], "width": 1.5},
1316
+ name="Retained", showlegend=(i == 0), legendgroup="retained"
1317
+ ), row=row, col=1)
1318
+ if "churned" in col_data:
1319
+ fig.add_trace(go.Scatter(
1320
+ y=col_data["churned"], mode="lines",
1321
+ line={"color": self.colors["danger"], "width": 1.5},
1322
+ name="Churned", showlegend=(i == 0), legendgroup="churned"
1323
+ ), row=row, col=1)
1324
+
1325
+ # Velocity
1326
+ if "velocity_retained" in col_data:
1327
+ fig.add_trace(go.Scatter(
1328
+ y=col_data["velocity_retained"], mode="lines",
1329
+ line={"color": self.colors["success"], "width": 1.5},
1330
+ showlegend=False, legendgroup="retained"
1331
+ ), row=row, col=2)
1332
+ if "velocity_churned" in col_data:
1333
+ fig.add_trace(go.Scatter(
1334
+ y=col_data["velocity_churned"], mode="lines",
1335
+ line={"color": self.colors["danger"], "width": 1.5},
1336
+ showlegend=False, legendgroup="churned"
1337
+ ), row=row, col=2)
1338
+ fig.add_hline(y=0, line_dash="dot", line_color="gray", row=row, col=2)
1339
+
1340
+ # Acceleration
1341
+ if "accel_retained" in col_data:
1342
+ fig.add_trace(go.Scatter(
1343
+ y=col_data["accel_retained"], mode="lines",
1344
+ line={"color": self.colors["success"], "width": 1.5},
1345
+ showlegend=False, legendgroup="retained"
1346
+ ), row=row, col=3)
1347
+ if "accel_churned" in col_data:
1348
+ fig.add_trace(go.Scatter(
1349
+ y=col_data["accel_churned"], mode="lines",
1350
+ line={"color": self.colors["danger"], "width": 1.5},
1351
+ showlegend=False, legendgroup="churned"
1352
+ ), row=row, col=3)
1353
+ fig.add_hline(y=0, line_dash="dot", line_color="gray", row=row, col=3)
1354
+
1355
+ fig.update_xaxes(showticklabels=False)
1356
+ fig.update_yaxes(showticklabels=False)
1357
+ fig.update_layout(
1358
+ height=150 * n_cols + 80,
1359
+ title=title or "Value → Velocity → Acceleration",
1360
+ template=self.theme,
1361
+ legend={"orientation": "h", "y": 1.02, "x": 0.5, "xanchor": "center"},
1362
+ margin={"t": 100},
1363
+ )
1364
+ return fig
1365
+
1366
+ def _create_effect_heatmap_trace(self, metric_data: Dict, variables: List[str], windows: List[str], show_colorbar: bool) -> go.Heatmap:
1367
+ z_vals = [[metric_data.get(var, {}).get(w, 0) for w in windows] for var in variables]
1368
+ text_vals = [[f"{metric_data.get(var, {}).get(w, 0):.2f}" for w in windows] for var in variables]
1369
+ return go.Heatmap(
1370
+ z=z_vals, x=windows, y=[v[:15] for v in variables],
1371
+ colorscale="RdBu_r", zmid=0, zmin=-1, zmax=1,
1372
+ text=text_vals, texttemplate="%{text}", textfont={"size": 10},
1373
+ showscale=show_colorbar, colorbar={"title": "Cohen's d"} if show_colorbar else None
1374
+ )
1375
+
1376
    def velocity_signal_heatmap(self, data: Dict[str, Dict[str, Dict[str, float]]], title: Optional[str] = None) -> go.Figure:
        """Stack two effect-size heatmaps: velocity (top) and acceleration (bottom).

        Args:
            data: {"velocity": {var: {window: d}}, "acceleration": {var: {window: d}}}.
            title: Optional title override.

        Returns:
            Figure with one Cohen's d heatmap per metric; only the bottom
            (acceleration) heatmap shows the shared colorbar.
        """
        from plotly.subplots import make_subplots
        vel_data, accel_data = data.get("velocity", {}), data.get("acceleration", {})
        if not vel_data and not accel_data:
            # Nothing to plot: return an empty, titled figure.
            fig = go.Figure()
            fig.update_layout(title=title or "No data", template=self.theme)
            return fig
        # Row/column axes come from velocity when present, else acceleration.
        # NOTE(review): windows stays [] when only acceleration data exists —
        # confirm callers always supply velocity alongside acceleration.
        variables = list(vel_data.keys()) or list(accel_data.keys())
        windows = list(next(iter(vel_data.values())).keys()) if vel_data else []
        fig = make_subplots(
            rows=2, cols=1, subplot_titles=["Velocity Effect Size (d)", "Acceleration Effect Size (d)"],
            vertical_spacing=0.15
        )
        for row_idx, metric_data in enumerate([vel_data, accel_data], start=1):
            # Colorbar only on the second (bottom) heatmap.
            fig.add_trace(
                self._create_effect_heatmap_trace(metric_data, variables, windows, row_idx == 2),
                row=row_idx, col=1
            )
        fig.update_layout(
            title=title or "Velocity & Acceleration Signal Strength",
            height=max(400, len(variables) * 80 + 200), template=self.theme
        )
        return fig
1399
+
1400
    def cohort_velocity_sparklines(self, results: List[Any], feature_name: str, title: Optional[str] = None) -> go.Figure:
        """Create a 6 x n_windows sparkline grid for one feature.

        Rows 1-3 show velocity for the retained/churned/overall cohorts;
        rows 4-6 show acceleration for the same cohorts. One column per
        analysis window.

        Args:
            results: Per-window result objects exposing ``retained_velocity``,
                ``churned_velocity``, ``overall_velocity``, ``retained_accel``,
                ``churned_accel``, ``overall_accel`` and optionally
                ``period_label`` (falls back to ``window_days``).
            feature_name: Feature name used in the default title.
            title: Optional title override.

        Returns:
            Plotly figure, or an empty titled figure when results is empty.
        """
        from plotly.subplots import make_subplots
        if not results:
            fig = go.Figure()
            fig.update_layout(title=title or f"{feature_name} - No data", template=self.theme)
            return fig
        n_windows = len(results)
        col_titles = [getattr(r, "period_label", f"{r.window_days}d") for r in results]
        row_titles = ["Retained", "Churned", "Overall", "Retained", "Churned", "Overall"]
        fig = make_subplots(
            rows=6, cols=n_windows, row_titles=row_titles, column_titles=col_titles,
            vertical_spacing=0.06, horizontal_spacing=0.03,
            row_heights=[1, 1, 1, 1, 1, 1]
        )
        # (line color, translucent fill color) per cohort.
        styles = {
            "retained": (self.colors["success"], "rgba(44, 160, 44, 0.2)"),
            "churned": (self.colors["danger"], "rgba(214, 39, 40, 0.2)"),
            "overall": (self.colors["info"], "rgba(23, 190, 207, 0.2)"),
        }
        for col_idx, r in enumerate(results, start=1):
            # Rows 1-3: velocity; rows 4-6: acceleration.
            self._add_velocity_sparkline(fig, r.retained_velocity, styles["retained"], 1, col_idx)
            self._add_velocity_sparkline(fig, r.churned_velocity, styles["churned"], 2, col_idx)
            self._add_velocity_sparkline(fig, r.overall_velocity, styles["overall"], 3, col_idx)
            self._add_velocity_sparkline(fig, r.retained_accel, styles["retained"], 4, col_idx)
            self._add_velocity_sparkline(fig, r.churned_accel, styles["churned"], 5, col_idx)
            self._add_velocity_sparkline(fig, r.overall_accel, styles["overall"], 6, col_idx)
        fig.update_xaxes(showticklabels=False, showgrid=False)
        fig.update_yaxes(showticklabels=False, showgrid=False)
        fig.update_layout(
            title=title or f"<b>{feature_name}</b>",
            height=520, template=self.theme,
            margin={"t": 60, "b": 20, "l": 80, "r": 70}
        )
        # Group labels for the two row bands, drawn in the left margin.
        fig.add_annotation(
            text="<b>Velocity</b>", textangle=-90, xref="paper", yref="paper",
            x=-0.06, y=0.77, showarrow=False, font={"size": 12}
        )
        fig.add_annotation(
            text="<b>Acceleration</b>", textangle=-90, xref="paper", yref="paper",
            x=-0.06, y=0.23, showarrow=False, font={"size": 12}
        )
        return fig
1442
+
1443
+ def _add_velocity_sparkline(
1444
+ self, fig: go.Figure, data: List[float], style: tuple, row: int, col: int
1445
+ ) -> None:
1446
+ if not data:
1447
+ return
1448
+ color, fill = style
1449
+ fig.add_trace(go.Scatter(
1450
+ y=data, mode="lines", line={"color": color, "width": 1.5},
1451
+ fill="tozeroy", fillcolor=fill, showlegend=False
1452
+ ), row=row, col=col)
1453
+
1454
+ def lag_correlation_heatmap(self, data: Dict[str, List[float]], max_lag: int = 14, title: Optional[str] = None) -> go.Figure:
1455
+ columns = list(data.keys())
1456
+ z_values = [data[col][:max_lag] for col in columns]
1457
+ lag_labels = [f"Lag {i}" for i in range(1, max_lag + 1)]
1458
+
1459
+ fig = go.Figure(go.Heatmap(
1460
+ z=z_values,
1461
+ x=lag_labels,
1462
+ y=[col[:15] for col in columns],
1463
+ colorscale="RdBu_r",
1464
+ zmid=0,
1465
+ text=[[f"{v:.2f}" for v in row] for row in z_values],
1466
+ texttemplate="%{text}",
1467
+ textfont={"size": 9},
1468
+ colorbar={"title": "Correlation"},
1469
+ ))
1470
+
1471
+ fig.update_layout(
1472
+ title=title or "Autocorrelation by Lag",
1473
+ xaxis_title="Lag (periods)",
1474
+ yaxis_title="Variable",
1475
+ template=self.theme,
1476
+ height=50 * len(columns) + 150,
1477
+ )
1478
+ return fig
1479
+
1480
    def predictive_power_chart(
        self,
        iv_values: Dict[str, float],
        ks_values: Dict[str, float],
        title: Optional[str] = None,
    ) -> go.Figure:
        """Create side-by-side IV and KS statistic bar charts.

        Args:
            iv_values: Dict with {column: iv_value}
            ks_values: Dict with {column: ks_value}
            title: Chart title

        Returns:
            Two-panel bar chart; both panels are ordered by descending IV,
            with dashed threshold lines (green = strong, orange = moderate).
        """
        from plotly.subplots import make_subplots

        # Sort by IV
        sorted_cols = sorted(iv_values.keys(), key=lambda x: iv_values[x], reverse=True)
        ivs = [iv_values[c] for c in sorted_cols]
        kss = [ks_values.get(c, 0) for c in sorted_cols]  # missing KS -> 0
        col_labels = [c[:15] for c in sorted_cols]

        fig = make_subplots(
            rows=1, cols=2,
            subplot_titles=("Information Value (IV)", "KS Statistic"),
        )

        # IV panel with threshold lines at 0.3 (green) and 0.1 (orange).
        fig.add_trace(go.Bar(
            x=col_labels, y=ivs, marker_color=self._get_iv_colors(ivs), name="IV"
        ), row=1, col=1)
        fig.add_hline(y=0.3, line_dash="dash", line_color="green", row=1, col=1)
        fig.add_hline(y=0.1, line_dash="dash", line_color="orange", row=1, col=1)

        # KS panel with threshold lines at 0.4 (green) and 0.2 (orange).
        fig.add_trace(go.Bar(
            x=col_labels, y=kss, marker_color=self._get_ks_colors(kss), name="KS"
        ), row=1, col=2)
        fig.add_hline(y=0.4, line_dash="dash", line_color="green", row=1, col=2)
        fig.add_hline(y=0.2, line_dash="dash", line_color="orange", row=1, col=2)

        fig.update_layout(
            title=title or "Variable Predictive Power",
            template=self.theme,
            height=400,
            showlegend=False,
        )
        fig.update_xaxes(tickangle=45)
        return fig
1526
+
1527
+ def momentum_comparison_chart(
1528
+ self,
1529
+ data: Dict[str, Dict[str, float]],
1530
+ title: Optional[str] = None,
1531
+ window_label: Optional[str] = None,
1532
+ ) -> go.Figure:
1533
+ columns = list(data.keys())
1534
+ first_col_data = data[columns[0]] if columns else {}
1535
+ uses_simple_keys = "retained" in first_col_data or "churned" in first_col_data
1536
+
1537
+ if uses_simple_keys:
1538
+ return self._create_simple_momentum_chart(data, columns, title, window_label)
1539
+ return self._create_multi_window_momentum_chart(data, columns, title, window_label)
1540
+
1541
    def _create_simple_momentum_chart(
        self, data: Dict, columns: List[str], title: Optional[str], window_label: Optional[str]
    ) -> go.Figure:
        """Grouped retained/churned momentum bars for one window, with a 1.0 baseline.

        Args:
            data: {column: {"retained": m, "churned": m}}; missing values
                default to the 1.0 baseline.
            columns: Column order to plot.
            title: Optional title override.
            window_label: Appended to the default title when given.
        """
        col_labels = [c[:15] for c in columns]
        fig = go.Figure()
        fig.add_trace(go.Bar(
            name="Retained", x=col_labels,
            y=[data[c].get("retained", 1) for c in columns],
            marker_color=self.colors["success"],
        ))
        fig.add_trace(go.Bar(
            name="Churned", x=col_labels,
            y=[data[c].get("churned", 1) for c in columns],
            marker_color=self.colors["danger"],
        ))
        # Momentum of 1.0 means "no change" — mark it as the baseline.
        fig.add_hline(y=1.0, line_dash="dash", line_color="gray",
                      annotation_text="baseline", annotation_position="right")
        chart_title = title or f"Momentum Comparison{f' ({window_label})' if window_label else ''}"
        fig.update_layout(
            title=chart_title, template=self.theme, height=450, barmode="group",
            legend={"orientation": "h", "y": -0.15, "x": 0.5, "xanchor": "center"},
            xaxis_title="Feature", yaxis_title="Momentum (>1 = increasing, <1 = decreasing)",
            margin={"b": 100},
        )
        return fig
1566
+
1567
    def _create_multi_window_momentum_chart(
        self, data: Dict, columns: List[str], title: Optional[str], window_label: Optional[str]
    ) -> go.Figure:
        """Two-panel momentum chart: short/medium (7/30) and medium/long (30/90) windows.

        Args:
            data: {column: {"retained_7_30": m, "churned_7_30": m,
                "retained_30_90": m, "churned_30_90": m}} (plain
                "retained"/"churned" keys are used as fallbacks).
            columns: Column order to plot.
            title: Optional title override.
            window_label: Title of the left panel (defaults to "Short/Medium").
        """
        from plotly.subplots import make_subplots

        col_labels = [c[:15] for c in columns]
        fig = make_subplots(
            rows=1, cols=2,
            subplot_titles=(window_label or "Short/Medium", "Medium/Long"),
        )
        # Left panel carries the shared legend; right panel hides its copy.
        self._add_momentum_cohort_bars(
            fig, col_labels, columns, data,
            retained_key="retained_7_30", churned_key="churned_7_30",
            col=1, show_legend=True,
        )
        self._add_momentum_cohort_bars(
            fig, col_labels, columns, data,
            retained_key="retained_30_90", churned_key="churned_30_90",
            col=2, show_legend=False,
        )
        fig.update_layout(
            title=title or "Momentum by Retention Status",
            template=self.theme, height=450, barmode="group",
            legend={"orientation": "h", "y": -0.15, "x": 0.5, "xanchor": "center"},
            margin={"b": 100},
        )
        return fig
1594
+
1595
    def _add_momentum_cohort_bars(
        self, fig: go.Figure, col_labels: List[str], columns: List[str],
        data: Dict, retained_key: str, churned_key: str, col: int, show_legend: bool
    ) -> None:
        """Add retained/churned bar pairs plus a 1.0 baseline to one subplot column.

        Falls back from the window-specific key (e.g. "retained_7_30") to the
        plain cohort key, and finally to the 1.0 baseline when both are absent.
        """
        fig.add_trace(go.Bar(
            name="Retained", x=col_labels,
            y=[data[c].get(retained_key, data[c].get("retained", 1)) for c in columns],
            marker_color=self.colors["success"], showlegend=show_legend,
        ), row=1, col=col)
        fig.add_trace(go.Bar(
            name="Churned", x=col_labels,
            y=[data[c].get(churned_key, data[c].get("churned", 1)) for c in columns],
            marker_color=self.colors["danger"], showlegend=show_legend,
        ), row=1, col=col)
        # Momentum of 1.0 means "no change".
        fig.add_hline(y=1.0, line_dash="dash", line_color="gray", row=1, col=col)
1610
+
1611
    def cohort_sparklines(
        self,
        data: Dict[str, Dict[str, List[float]]],
        feature_name: str,
        period_effects: Optional[Dict[str, float]] = None,
    ) -> go.Figure:
        """Create 3x3 sparkline grid: cohorts (rows) × time periods (cols) for one feature.

        Args:
            data: {"retained"/"churned"/"overall": {"weekly"/"monthly"/"yearly": [values]}}.
                Missing cohorts or periods simply leave their cell empty.
            feature_name: Feature name shown as the figure title.
            period_effects: Optional {period: effect size} used to annotate
                the column titles (e.g. "Weekly (d=0.42)").

        Returns:
            Plotly figure with axis ticks/grids hidden (sparkline style).
        """
        from plotly.subplots import make_subplots

        cohorts = ["retained", "churned", "overall"]
        periods = ["weekly", "monthly", "yearly"]
        row_titles = ["Retained", "Churned", "Overall"]
        col_titles = self._build_period_titles(periods, period_effects)

        fig = make_subplots(
            rows=3, cols=3,
            row_titles=row_titles,
            column_titles=col_titles,
            vertical_spacing=0.08,
            horizontal_spacing=0.06,
        )

        # (line color, translucent fill color) per cohort.
        styles = {
            "retained": (self.colors["success"], "rgba(44, 160, 44, 0.2)"),
            "churned": (self.colors["danger"], "rgba(214, 39, 40, 0.2)"),
            "overall": (self.colors["info"], "rgba(23, 190, 207, 0.2)"),
        }

        for row_idx, cohort in enumerate(cohorts):
            if cohort not in data:
                continue
            color, fill = styles[cohort]
            for col_idx, period in enumerate(periods):
                if period in data[cohort]:
                    fig.add_trace(go.Scatter(
                        y=data[cohort][period], mode="lines",
                        line={"color": color, "width": 1.5},
                        fill="tozeroy", fillcolor=fill, showlegend=False,
                    ), row=row_idx + 1, col=col_idx + 1)

        fig.update_xaxes(showticklabels=False, showgrid=False)
        fig.update_yaxes(showticklabels=False, showgrid=False)
        fig.update_layout(
            title=f"<b>{feature_name}</b>",
            height=280,
            template=self.theme,
            margin={"t": 50, "b": 20, "l": 70, "r": 20},
        )
        return fig
1660
+
1661
+ def _build_period_titles(self, periods: List[str], effects: Optional[Dict[str, float]]) -> List[str]:
1662
+ labels = {"weekly": "Weekly", "monthly": "Monthly", "yearly": "Yearly"}
1663
+ if not effects:
1664
+ return [labels[p] for p in periods]
1665
+ return [f"{labels[p]} (d={effects.get(p, 0):.2f})" for p in periods]
1666
+
1667
    def analyze_cohort_trends(
        self,
        data: Dict[str, Dict[str, List[float]]],
        feature_name: str,
    ) -> Dict[str, Any]:
        """Analyze separation between retained and churned trends across time periods.

        Args:
            data: {"retained"/"churned"/...: {"weekly"/"monthly"/"yearly": [values]}}.
            feature_name: Name used in the recommendation text and actions.

        Returns:
            Dict with per-period analyses ("periods"), the best-separating
            period ("best_period", by absolute divergence), a pooled Cohen's d
            ("overall_effect_size"), a one-line human-readable
            "recommendation", and machine-readable "actions".
        """
        periods_analysis = {}
        for period in ["weekly", "monthly", "yearly"]:
            # Only analyze periods where both cohorts have series.
            if self._has_cohort_period_data(data, period):
                periods_analysis[period] = self._analyze_period(data, period)

        best_period = self._find_best_period(periods_analysis)
        recommendation = self._generate_trend_recommendation(feature_name, periods_analysis, best_period)
        actions = self._generate_actions(feature_name, periods_analysis, best_period)
        overall_d = self._compute_overall_effect_size(data)

        return {
            "feature": feature_name,
            "periods": periods_analysis,
            "best_period": best_period,
            "overall_effect_size": overall_d,
            "recommendation": recommendation,
            "actions": actions,
        }
1691
+
1692
+ def _compute_overall_effect_size(self, data: Dict[str, Dict[str, List[float]]]) -> float:
1693
+ if "retained" not in data or "churned" not in data:
1694
+ return 0.0
1695
+ all_retained = [v for period_data in data["retained"].values() for v in period_data]
1696
+ all_churned = [v for period_data in data["churned"].values() for v in period_data]
1697
+ if len(all_retained) < 2 or len(all_churned) < 2:
1698
+ return 0.0
1699
+ return self._compute_cohens_d(np.array(all_retained), np.array(all_churned))
1700
+
1701
+ def _has_cohort_period_data(self, data: Dict, period: str) -> bool:
1702
+ return ("retained" in data and period in data["retained"] and
1703
+ "churned" in data and period in data["churned"])
1704
+
1705
+ @staticmethod
1706
+ def _classify_slope(slope: float) -> str:
1707
+ if slope > 0.01:
1708
+ return "up"
1709
+ return "down" if slope < -0.01 else "flat"
1710
+
1711
+ def _compute_period_trends(self, retained: np.ndarray, churned: np.ndarray) -> Dict[str, Any]:
1712
+ ret_trend = self._compute_trend_slope(retained)
1713
+ churn_trend = self._compute_trend_slope(churned)
1714
+ return {
1715
+ "retained_trend": self._classify_slope(ret_trend),
1716
+ "churned_trend": self._classify_slope(churn_trend),
1717
+ "opposite_trends": (ret_trend > 0 and churn_trend < 0) or (ret_trend < 0 and churn_trend > 0),
1718
+ }
1719
+
1720
+ @staticmethod
1721
+ def _compute_period_variance(retained: np.ndarray, churned: np.ndarray) -> Dict[str, Any]:
1722
+ ret_var, churn_var = float(np.var(retained)), float(np.var(churned))
1723
+ variance_ratio = ret_var / churn_var if churn_var > 0.001 else (10.0 if ret_var > 0.001 else 1.0)
1724
+ return {
1725
+ "variance_ratio": float(variance_ratio),
1726
+ "high_variance": bool(ret_var > 1.0 or churn_var > 1.0),
1727
+ }
1728
+
1729
+ def _analyze_period(self, data: Dict, period: str) -> Dict[str, Any]:
1730
+ retained = np.array(data["retained"][period])
1731
+ churned = np.array(data["churned"][period])
1732
+ result = {
1733
+ "divergence": self._compute_divergence(retained, churned),
1734
+ "effect_size": self._compute_cohens_d(retained, churned),
1735
+ "seasonality_detected": self._detect_seasonality(retained) or self._detect_seasonality(churned),
1736
+ }
1737
+ result.update(self._compute_period_trends(retained, churned))
1738
+ result.update(self._compute_period_variance(retained, churned))
1739
+ return result
1740
+
1741
+ def _compute_trend_slope(self, values: np.ndarray) -> float:
1742
+ if len(values) < 2:
1743
+ return 0.0
1744
+ x = np.arange(len(values))
1745
+ return float(np.polyfit(x, values, 1)[0])
1746
+
1747
+ def _compute_divergence(self, retained: np.ndarray, churned: np.ndarray) -> float:
1748
+ if len(retained) == 0 or len(churned) == 0:
1749
+ return 0.0
1750
+ combined_std = max(np.std(np.concatenate([retained, churned])), 0.001)
1751
+ return float(abs(np.mean(retained) - np.mean(churned)) / combined_std)
1752
+
1753
+ def _compute_cohens_d(self, retained: np.ndarray, churned: np.ndarray) -> float:
1754
+ if len(retained) < 2 or len(churned) < 2:
1755
+ return 0.0
1756
+ pooled_std = np.sqrt((np.var(retained) + np.var(churned)) / 2)
1757
+ if pooled_std < 0.001:
1758
+ return 0.0
1759
+ return float((np.mean(retained) - np.mean(churned)) / pooled_std)
1760
+
1761
+ def _find_best_period(self, periods: Dict[str, Dict]) -> Optional[str]:
1762
+ if not periods:
1763
+ return None
1764
+ return max(periods.keys(), key=lambda p: abs(periods[p].get("divergence", 0)))
1765
+
1766
+ def _generate_trend_recommendation(self, feature: str, periods: Dict, best: Optional[str]) -> str:
1767
+ if not best or best not in periods:
1768
+ return f"Insufficient data for {feature} trend analysis"
1769
+
1770
+ analysis = periods[best]
1771
+ div, eff = analysis["divergence"], abs(analysis["effect_size"])
1772
+ opposite = analysis["opposite_trends"]
1773
+
1774
+ if div > 1.5 or eff > 0.8:
1775
+ strength = "Strong"
1776
+ action = "high-priority feature for churn prediction"
1777
+ elif div > 0.8 or eff > 0.5:
1778
+ strength = "Moderate"
1779
+ action = "useful discriminator between cohorts"
1780
+ elif div > 0.3 or eff > 0.2:
1781
+ strength = "Weak"
1782
+ action = "consider combining with other features"
1783
+ else:
1784
+ return f"{feature}: No significant separation between retained and churned"
1785
+
1786
+ trend_note = " with opposite trend directions" if opposite else ""
1787
+ period_label = {"weekly": "Weekly", "monthly": "Monthly", "yearly": "Yearly"}[best]
1788
+ return f"{feature}: {strength} separation (d={eff:.2f}) at {period_label} scale{trend_note} - {action}"
1789
+
1790
+ def _detect_seasonality(self, values: np.ndarray) -> bool:
1791
+ if len(values) < 6:
1792
+ return False
1793
+ detrended = values - np.linspace(values[0], values[-1], len(values))
1794
+ autocorr = np.correlate(detrended, detrended, mode='full')
1795
+ autocorr = autocorr[len(autocorr) // 2:]
1796
+ if len(autocorr) < 3 or autocorr[0] < 0.001:
1797
+ return False
1798
+ normalized = autocorr / autocorr[0]
1799
+ peaks = [i for i in range(2, len(normalized) - 1)
1800
+ if normalized[i] > normalized[i-1] and normalized[i] > normalized[i+1]]
1801
+ return any(normalized[p] > 0.3 for p in peaks[:3]) if peaks else False
1802
+
1803
+ def _generate_actions(self, feature: str, periods: Dict, best: Optional[str]) -> List[Dict[str, Any]]:
1804
+ actions = []
1805
+ if not periods:
1806
+ return actions
1807
+
1808
+ any_seasonality = any(p.get("seasonality_detected") for p in periods.values())
1809
+ any_high_variance = any(p.get("high_variance") for p in periods.values())
1810
+
1811
+ if best and periods.get(best, {}).get("opposite_trends"):
1812
+ actions.append({
1813
+ "action_type": "add_trend_feature",
1814
+ "feature": feature,
1815
+ "reason": f"Opposite trends detected at {best} scale",
1816
+ "params": {"period": best, "method": "slope"},
1817
+ })
1818
+
1819
+ if any_seasonality:
1820
+ period_with_season = next((k for k, v in periods.items() if v.get("seasonality_detected")), None)
1821
+ actions.append({
1822
+ "action_type": "add_time_indicator",
1823
+ "feature": feature,
1824
+ "reason": f"Seasonality detected at {period_with_season} scale",
1825
+ "params": {"period": period_with_season, "indicators": ["cyclical_encoding"]},
1826
+ })
1827
+
1828
+ if any_high_variance:
1829
+ max_var_period = max(periods.keys(), key=lambda k: periods[k].get("variance_ratio", 1.0))
1830
+ var_ratio = periods[max_var_period].get("variance_ratio", 1.0)
1831
+ if var_ratio > 2.0:
1832
+ actions.append({
1833
+ "action_type": "robust_scale",
1834
+ "feature": feature,
1835
+ "reason": f"High variance ratio ({var_ratio:.1f}x) between cohorts",
1836
+ "params": {"method": "robust_scaler"},
1837
+ })
1838
+ elif any_high_variance:
1839
+ actions.append({
1840
+ "action_type": "normalize",
1841
+ "feature": feature,
1842
+ "reason": "High variance in temporal trends",
1843
+ "params": {"method": "standard_scaler"},
1844
+ })
1845
+
1846
+ return actions
1847
+
1848
    def descriptive_stats_tiles(
        self,
        df: DataFrame,
        findings: Any,
        max_columns: int = 12,
        columns_per_row: int = 4,
    ) -> go.Figure:
        """Create a grid of mini chart tiles showing descriptive statistics for each column.

        Each tile shows a type-appropriate visualization:
        - Numeric: histogram with mean/median markers and key stats
        - Categorical: top categories bar chart with cardinality
        - Binary: pie chart with class balance
        - Datetime: date range indicator
        - Identifier: uniqueness gauge

        Args:
            df: DataFrame to visualize
            findings: ExplorationFindings object with column metadata
            max_columns: Maximum number of columns to display
            columns_per_row: Number of tiles per row

        Returns:
            Plotly figure with one subplot tile per displayed column.
        """
        from plotly.subplots import make_subplots

        df = to_pandas(df)
        formatter = NumberFormatter()

        # Exclude temporal metadata columns from visualization
        temporal_metadata_cols = {"feature_timestamp", "label_timestamp", "label_available_flag"}
        available_cols = {k: v for k, v in findings.columns.items() if k not in temporal_metadata_cols}

        # Select columns to display (prioritize by type)
        type_priority = ['target', 'binary', 'numeric_continuous', 'numeric_discrete',
                         'categorical_nominal', 'categorical_ordinal', 'datetime', 'identifier']
        sorted_cols = []
        for col_type in type_priority:
            for name, col in available_cols.items():
                if col.inferred_type.value == col_type and name not in sorted_cols:
                    sorted_cols.append(name)
        # Append any remaining columns whose type is not in the priority list.
        for name in available_cols.keys():
            if name not in sorted_cols:
                sorted_cols.append(name)
        display_cols = sorted_cols[:max_columns]

        # Ceiling division for the row count.
        # NOTE(review): an empty display_cols makes n_cols 0 and this division
        # fails — confirm findings always yields at least one visible column.
        n_cols = min(columns_per_row, len(display_cols))
        n_rows = (len(display_cols) + n_cols - 1) // n_cols

        fig = make_subplots(
            rows=n_rows, cols=n_cols,
            subplot_titles=[f"<b>{c[:20]}</b>" for c in display_cols],
            vertical_spacing=0.12,
            horizontal_spacing=0.08,
            specs=[[{"type": "xy"} for _ in range(n_cols)] for _ in range(n_rows)]
        )

        for i, col_name in enumerate(display_cols):
            # Map the flat tile index onto the 1-based (row, col) grid.
            row, col = (i // n_cols) + 1, (i % n_cols) + 1
            col_finding = findings.columns.get(col_name)
            col_type = col_finding.inferred_type.value if col_finding else "unknown"
            series = df[col_name] if col_name in df.columns else None

            # Column present in findings but absent from the frame: skip its tile.
            if series is None:
                continue

            self._add_column_tile(fig, series, col_finding, col_type, row, col, formatter, n_cols)

        fig.update_layout(
            height=250 * n_rows,
            template=self.theme,
            showlegend=False,
            margin={"t": 40, "b": 20, "l": 40, "r": 20},
        )

        return fig
1922
+
1923
+ def dataset_at_a_glance(
1924
+ self,
1925
+ df: DataFrame,
1926
+ findings: Any,
1927
+ source_path: str = "",
1928
+ granularity: str = "entity",
1929
+ max_columns: int = 12,
1930
+ columns_per_row: int = 4,
1931
+ ) -> go.Figure:
1932
+ """Create a unified dataset overview with key metrics and column distribution tiles.
1933
+
1934
+ Combines dataset-level stats (rows, columns, format, granularity) with
1935
+ small multiples of column distributions for a complete first look.
1936
+
1937
+ Args:
1938
+ df: DataFrame to visualize
1939
+ findings: ExplorationFindings object with column metadata
1940
+ source_path: Path to data source (for format detection)
1941
+ granularity: Dataset granularity ("entity" or "event")
1942
+ max_columns: Maximum number of column tiles to display
1943
+ columns_per_row: Number of tiles per row
1944
+ """
1945
+ from pathlib import Path
1946
+
1947
+ from plotly.subplots import make_subplots
1948
+
1949
+ df = to_pandas(df)
1950
+ formatter = NumberFormatter()
1951
+
1952
+ memory_mb = df.memory_usage(deep=True).sum() / 1024**2
1953
+
1954
+ # Detect format from path
1955
+ path = Path(source_path) if source_path else Path("data.csv")
1956
+ fmt = path.suffix.lstrip('.').upper() or "CSV"
1957
+ if fmt == "":
1958
+ fmt = "CSV"
1959
+
1960
+ # Exclude temporal metadata columns from visualization
1961
+ temporal_metadata_cols = {"feature_timestamp", "label_timestamp", "label_available_flag"}
1962
+ available_cols = {k: v for k, v in findings.columns.items() if k not in temporal_metadata_cols}
1963
+
1964
+ # Select columns to display (prioritize by type)
1965
+ type_priority = ['target', 'binary', 'numeric_continuous', 'numeric_discrete',
1966
+ 'categorical_nominal', 'categorical_ordinal', 'datetime', 'identifier']
1967
+ sorted_cols = []
1968
+ for col_type in type_priority:
1969
+ for name, col in available_cols.items():
1970
+ if col.inferred_type.value == col_type and name not in sorted_cols:
1971
+ sorted_cols.append(name)
1972
+ for name in available_cols.keys():
1973
+ if name not in sorted_cols:
1974
+ sorted_cols.append(name)
1975
+ display_cols = sorted_cols[:max_columns]
1976
+
1977
+ n_cols = min(columns_per_row, len(display_cols))
1978
+ n_tile_rows = (len(display_cols) + n_cols - 1) // n_cols
1979
+
1980
+ # Build specs: 1 header row + tile rows
1981
+ header_specs = [{"type": "indicator"} for _ in range(n_cols)]
1982
+ tile_specs = [[{"type": "xy"} for _ in range(n_cols)] for _ in range(n_tile_rows)]
1983
+
1984
+ # Subplot titles: empty for header, column names for tiles
1985
+ titles = [""] * n_cols + [f"<b>{c[:18]}</b>" for c in display_cols]
1986
+
1987
+ fig = make_subplots(
1988
+ rows=1 + n_tile_rows,
1989
+ cols=n_cols,
1990
+ row_heights=[0.15] + [0.85 / n_tile_rows] * n_tile_rows,
1991
+ specs=[header_specs] + tile_specs,
1992
+ subplot_titles=titles,
1993
+ vertical_spacing=0.08,
1994
+ horizontal_spacing=0.06,
1995
+ )
1996
+
1997
+ # Header row: Order is Rows, Columns, Structure, Format, Memory
1998
+ # Use annotations for all to ensure consistent appearance
1999
+ structure_label = "Event" if granularity.lower() == "event" else "Entity"
2000
+ memory_str = f"{memory_mb:.1f} MB"
2001
+
2002
+ # Calculate header column positions for paper coordinates
2003
+ h_spacing = 0.06
2004
+ col_width = (1.0 - h_spacing * (n_cols - 1)) / n_cols
2005
+
2006
+ def get_header_x(col_idx: int) -> float:
2007
+ """Get x center position for header column (1-indexed)."""
2008
+ return (col_idx - 1) * (col_width + h_spacing) + col_width / 2
2009
+
2010
+ # Header data: (label, value)
2011
+ header_items = [
2012
+ ("Rows", f"{findings.row_count:,}"),
2013
+ ("Columns", str(findings.column_count)),
2014
+ ("Structure", structure_label),
2015
+ ("Format", fmt),
2016
+ ("Memory", memory_str),
2017
+ ]
2018
+
2019
+ # Add placeholder indicators (needed for subplot structure)
2020
+ for i in range(min(n_cols, len(header_items))):
2021
+ fig.add_trace(go.Indicator(
2022
+ mode="number", value=0,
2023
+ number={"font": {"size": 1, "color": "rgba(0,0,0,0)"}}
2024
+ ), row=1, col=i+1)
2025
+
2026
+ # Add labels (small, gray, top) and values (large, blue, below) as annotations
2027
+ label_y = 0.96
2028
+ value_y = 0.92
2029
+
2030
+ for i, (label, value) in enumerate(header_items[:n_cols]):
2031
+ x_pos = get_header_x(i + 1)
2032
+
2033
+ # Label
2034
+ fig.add_annotation(
2035
+ x=x_pos, y=label_y,
2036
+ xref="paper", yref="paper",
2037
+ text=label, showarrow=False,
2038
+ font={"size": 12, "color": "#666"},
2039
+ xanchor="center", yanchor="middle"
2040
+ )
2041
+
2042
+ # Value
2043
+ fig.add_annotation(
2044
+ x=x_pos, y=value_y,
2045
+ xref="paper", yref="paper",
2046
+ text=value, showarrow=False,
2047
+ font={"size": 28, "color": self.colors["primary"]},
2048
+ xanchor="center", yanchor="middle"
2049
+ )
2050
+
2051
+ # Column tiles (starting from row 2)
2052
+ for i, col_name in enumerate(display_cols):
2053
+ tile_row = (i // n_cols) + 2 # +2 because row 1 is header
2054
+ tile_col = (i % n_cols) + 1
2055
+ col_finding = findings.columns.get(col_name)
2056
+ col_type = col_finding.inferred_type.value if col_finding else "unknown"
2057
+ series = df[col_name] if col_name in df.columns else None
2058
+
2059
+ if series is None:
2060
+ continue
2061
+
2062
+ self._add_column_tile(fig, series, col_finding, col_type, tile_row, tile_col, formatter, n_cols)
2063
+
2064
+ fig.update_layout(
2065
+ height=120 + 220 * n_tile_rows,
2066
+ template=self.theme,
2067
+ showlegend=False,
2068
+ margin={"t": 30, "b": 20, "l": 40, "r": 20},
2069
+ )
2070
+
2071
+ return fig
2072
+
2073
+ def _add_column_tile(
2074
+ self,
2075
+ fig: go.Figure,
2076
+ series: Series,
2077
+ col_finding: Any,
2078
+ col_type: str,
2079
+ row: int,
2080
+ col: int,
2081
+ formatter: "NumberFormatter",
2082
+ n_cols: int = 4,
2083
+ ) -> None:
2084
+ """Add a single column tile to the subplot grid."""
2085
+ series = ensure_pandas_series(series)
2086
+ metrics = col_finding.universal_metrics if col_finding else {}
2087
+ type_metrics = col_finding.type_metrics if col_finding else {}
2088
+
2089
+ if col_type in ('numeric_continuous', 'numeric_discrete'):
2090
+ self._add_numeric_tile(fig, series, metrics, type_metrics, row, col, n_cols, formatter)
2091
+ elif col_type in ('categorical_nominal', 'categorical_ordinal', 'categorical_cyclical'):
2092
+ self._add_categorical_tile(fig, series, metrics, row, col, n_cols, formatter)
2093
+ elif col_type == 'binary':
2094
+ self._add_binary_tile(fig, series, metrics, row, col, n_cols, formatter)
2095
+ elif col_type in ('datetime', 'date'):
2096
+ self._add_datetime_tile(fig, series, metrics, row, col, n_cols)
2097
+ elif col_type == 'identifier':
2098
+ self._add_identifier_tile(fig, series, metrics, row, col, n_cols, formatter)
2099
+ elif col_type == 'target':
2100
+ self._add_target_tile(fig, series, metrics, row, col, n_cols, formatter)
2101
+ else:
2102
+ self._add_generic_tile(fig, series, metrics, row, col, n_cols, formatter)
2103
+
2104
+ def _get_axis_ref(self, row: int, col: int, n_cols: int, axis: str = "x") -> str:
2105
+ """Get the correct axis reference for subplot annotations."""
2106
+ # Calculate linear index (0-based)
2107
+ idx = (row - 1) * n_cols + col
2108
+ # First subplot uses 'x'/'y', others use 'x2', 'x3', etc.
2109
+ if idx == 1:
2110
+ return axis
2111
+ return f"{axis}{idx}"
2112
+
2113
+ def _add_numeric_tile(
2114
+ self, fig: go.Figure, series: Series, metrics: Dict, type_metrics: Dict,
2115
+ row: int, col: int, n_cols: int, formatter: "NumberFormatter"
2116
+ ) -> None:
2117
+ """Add numeric column tile with histogram and stats."""
2118
+ clean = series.dropna()
2119
+ if len(clean) == 0:
2120
+ return
2121
+
2122
+ mean_val = type_metrics.get('mean', clean.mean())
2123
+ median_val = type_metrics.get('median', clean.median())
2124
+ std_val = type_metrics.get('std', clean.std())
2125
+ null_pct = metrics.get('null_percentage', 0)
2126
+
2127
+ fig.add_trace(go.Histogram(
2128
+ x=clean, nbinsx=20,
2129
+ marker_color=self.colors["primary"],
2130
+ opacity=0.7,
2131
+ hovertemplate="Range: %{x}<br>Count: %{y}<extra></extra>"
2132
+ ), row=row, col=col)
2133
+
2134
+ xaxis_ref = self._get_axis_ref(row, col, n_cols, 'x')
2135
+ yaxis_ref = self._get_axis_ref(row, col, n_cols, 'y')
2136
+ fig.add_shape(type="line", x0=mean_val, x1=mean_val, y0=0, y1=1,
2137
+ xref=xaxis_ref, yref=f"{yaxis_ref} domain",
2138
+ line={"color": self.colors["secondary"], "width": 2, "dash": "dash"})
2139
+ fig.add_shape(type="line", x0=median_val, x1=median_val, y0=0, y1=1,
2140
+ xref=xaxis_ref, yref=f"{yaxis_ref} domain",
2141
+ line={"color": self.colors["success"], "width": 2, "dash": "dot"})
2142
+
2143
+ stats_text = (f"μ={formatter.compact(mean_val)} | "
2144
+ f"σ={formatter.compact(std_val)}" +
2145
+ (f"<br>null={null_pct:.0f}%" if null_pct > 0 else ""))
2146
+ fig.add_annotation(
2147
+ x=0.98, y=0.98, xref=f"{xaxis_ref} domain", yref=f"{yaxis_ref} domain",
2148
+ text=stats_text, showarrow=False,
2149
+ font={"size": 9, "color": "#666"},
2150
+ bgcolor="rgba(255,255,255,0.8)",
2151
+ xanchor="right", yanchor="top"
2152
+ )
2153
+
2154
+ def _add_categorical_tile(
2155
+ self, fig: go.Figure, series: Series, metrics: Dict,
2156
+ row: int, col: int, n_cols: int, formatter: "NumberFormatter"
2157
+ ) -> None:
2158
+ """Add categorical column tile with top categories bar."""
2159
+ value_counts = series.value_counts().head(5)
2160
+
2161
+ # Gradient colors to show rank
2162
+ colors = [self.colors["info"]] + [self.colors["primary"]] * (len(value_counts) - 1)
2163
+
2164
+ fig.add_trace(go.Bar(
2165
+ x=value_counts.values,
2166
+ y=[str(v)[:10] for v in value_counts.index],
2167
+ orientation='h',
2168
+ marker_color=colors[:len(value_counts)],
2169
+ hovertemplate="%{y}: %{x:,}<extra></extra>"
2170
+ ), row=row, col=col)
2171
+
2172
+ def _add_binary_tile(
2173
+ self, fig: go.Figure, series: Series, metrics: Dict,
2174
+ row: int, col: int, n_cols: int, formatter: "NumberFormatter"
2175
+ ) -> None:
2176
+ """Add binary column tile with horizontal bars showing labels clearly."""
2177
+ value_counts = series.value_counts()
2178
+ if len(value_counts) == 0:
2179
+ return
2180
+
2181
+ labels = [str(v) for v in value_counts.index]
2182
+ values = value_counts.values.tolist()
2183
+ total = sum(values)
2184
+ percentages = [v/total*100 for v in values]
2185
+
2186
+ balance_ratio = max(values) / min(values) if min(values) > 0 else float('inf')
2187
+ balance_color = (self.colors["success"] if balance_ratio < 3
2188
+ else self.colors["warning"] if balance_ratio < 10
2189
+ else self.colors["danger"])
2190
+
2191
+ # Horizontal bars with labels on y-axis
2192
+ colors = [self.colors["primary"], self.colors["secondary"]]
2193
+ fig.add_trace(go.Bar(
2194
+ y=labels[:2],
2195
+ x=percentages[:2],
2196
+ orientation='h',
2197
+ marker_color=colors[:len(labels)],
2198
+ text=[f"{p:.0f}%" for p in percentages[:2]],
2199
+ textposition="inside",
2200
+ textfont={"size": 11, "color": "white"},
2201
+ hovertemplate="%{y}: %{x:.1f}%<extra></extra>",
2202
+ showlegend=False
2203
+ ), row=row, col=col)
2204
+
2205
+ ratio_text = f"{balance_ratio:.1f}:1"
2206
+ xref = f"{self._get_axis_ref(row, col, n_cols, 'x')} domain"
2207
+ yref = f"{self._get_axis_ref(row, col, n_cols, 'y')} domain"
2208
+ fig.add_annotation(
2209
+ x=0.98, y=0.98, xref=xref, yref=yref,
2210
+ text=ratio_text, showarrow=False,
2211
+ font={"size": 10, "color": balance_color, "family": "Arial Black"},
2212
+ xanchor="right", yanchor="top"
2213
+ )
2214
+
2215
+ def _add_datetime_tile(
2216
+ self, fig: go.Figure, series: Series, metrics: Dict,
2217
+ row: int, col: int, n_cols: int
2218
+ ) -> None:
2219
+ """Add datetime column tile with date range visualization."""
2220
+ import warnings
2221
+
2222
+ import pandas as pd
2223
+ with warnings.catch_warnings():
2224
+ warnings.simplefilter("ignore")
2225
+ dates = pd.to_datetime(series, errors='coerce').dropna()
2226
+ if len(dates) == 0:
2227
+ return
2228
+
2229
+ # Monthly distribution as area chart for cleaner look
2230
+ counts = dates.dt.to_period('M').value_counts().sort_index()
2231
+ x_labels = [str(p) for p in counts.index]
2232
+ fig.add_trace(go.Scatter(
2233
+ x=x_labels,
2234
+ y=counts.values,
2235
+ mode='lines',
2236
+ fill='tozeroy',
2237
+ line={"color": self.colors["info"]},
2238
+ fillcolor="rgba(23, 190, 207, 0.3)",
2239
+ hovertemplate="%{x}: %{y:,}<extra></extra>"
2240
+ ), row=row, col=col)
2241
+
2242
+ # Force categorical x-axis to prevent Plotly from interpreting as dates
2243
+ xaxis_name = f"xaxis{(row - 1) * n_cols + col}" if (row - 1) * n_cols + col > 1 else "xaxis"
2244
+ fig.update_layout(**{xaxis_name: {"type": "category", "tickangle": -45}})
2245
+
2246
+ def _add_identifier_tile(
2247
+ self, fig: go.Figure, series: Series, metrics: Dict,
2248
+ row: int, col: int, n_cols: int, formatter: "NumberFormatter"
2249
+ ) -> None:
2250
+ """Add identifier column tile with uniqueness gauge."""
2251
+ total = len(series)
2252
+ unique = metrics.get('distinct_count', series.nunique())
2253
+ unique_pct = (unique / total * 100) if total > 0 else 0
2254
+
2255
+ gauge_color = (self.colors["success"] if unique_pct >= 99
2256
+ else self.colors["warning"] if unique_pct >= 95
2257
+ else self.colors["danger"])
2258
+
2259
+ # Progress bar style for uniqueness
2260
+ fig.add_trace(go.Bar(
2261
+ x=[unique_pct], y=[""],
2262
+ orientation='h',
2263
+ marker_color=gauge_color,
2264
+ text=f"{unique_pct:.1f}% unique",
2265
+ textposition="inside",
2266
+ textfont={"color": "white", "size": 11},
2267
+ hovertemplate=f"Unique: {unique:,} / {total:,}<extra></extra>",
2268
+ showlegend=False
2269
+ ), row=row, col=col)
2270
+
2271
+ fig.add_trace(go.Bar(
2272
+ x=[100 - unique_pct], y=[""],
2273
+ orientation='h',
2274
+ marker_color="#ecf0f1",
2275
+ hoverinfo="skip",
2276
+ showlegend=False
2277
+ ), row=row, col=col)
2278
+
2279
+ def _add_target_tile(
2280
+ self, fig: go.Figure, series: Series, metrics: Dict,
2281
+ row: int, col: int, n_cols: int, formatter: "NumberFormatter"
2282
+ ) -> None:
2283
+ """Add target column tile with horizontal bars showing class distribution."""
2284
+ value_counts = series.value_counts()
2285
+ total = len(series)
2286
+
2287
+ colors_list = [self.colors["success"], self.colors["danger"]] + \
2288
+ [self.colors["warning"], self.colors["info"]]
2289
+
2290
+ labels = [str(v) for v in value_counts.head(4).index]
2291
+ percentages = [(c / total * 100) for c in value_counts.head(4).values]
2292
+
2293
+ # Horizontal bars with labels on y-axis
2294
+ fig.add_trace(go.Bar(
2295
+ y=labels,
2296
+ x=percentages,
2297
+ orientation='h',
2298
+ marker_color=colors_list[:len(labels)],
2299
+ text=[f"{p:.0f}%" for p in percentages],
2300
+ textposition="inside",
2301
+ textfont={"size": 11, "color": "white"},
2302
+ hovertemplate="%{y}: %{x:.1f}%<extra></extra>",
2303
+ showlegend=False
2304
+ ), row=row, col=col)
2305
+
2306
+ xref = f"{self._get_axis_ref(row, col, n_cols, 'x')} domain"
2307
+ yref = f"{self._get_axis_ref(row, col, n_cols, 'y')} domain"
2308
+ if len(value_counts) == 2:
2309
+ ratio = value_counts.max() / value_counts.min() if value_counts.min() > 0 else float('inf')
2310
+ ratio_color = (self.colors["success"] if ratio < 3
2311
+ else self.colors["warning"] if ratio < 10
2312
+ else self.colors["danger"])
2313
+ fig.add_annotation(
2314
+ x=0.98, y=0.98, xref=xref, yref=yref,
2315
+ text=f"{ratio:.1f}:1",
2316
+ showarrow=False, font={"size": 10, "color": ratio_color, "family": "Arial Black"},
2317
+ xanchor="right", yanchor="top"
2318
+ )
2319
+
2320
+ def _add_generic_tile(
2321
+ self, fig: go.Figure, series: Series, metrics: Dict,
2322
+ row: int, col: int, n_cols: int, formatter: "NumberFormatter"
2323
+ ) -> None:
2324
+ """Add generic tile for unknown column types."""
2325
+ value_counts = series.value_counts().head(5)
2326
+
2327
+ fig.add_trace(go.Bar(
2328
+ x=value_counts.values,
2329
+ y=[str(v)[:10] for v in value_counts.index],
2330
+ orientation='h',
2331
+ marker_color=self.colors["primary"],
2332
+ hovertemplate="%{y}: %{x:,}<extra></extra>"
2333
+ ), row=row, col=col)
2334
+
2335
    def cutoff_selection_chart(
        self, cutoff_analysis: "CutoffAnalysis", suggested_cutoff: Optional[datetime] = None,
        current_cutoff: Optional[datetime] = None, title: str = "Point-in-Time Cutoff Selection"
    ) -> go.Figure:
        """Visualize how a point-in-time cutoff date splits data into train/score sets.

        Draws stacked area bands (train below the train_pct line, score above it),
        5%-step milestone markers, and optional vertical markers for a suggested
        cutoff and the registry's current cutoff.

        Args:
            cutoff_analysis: Object exposing ``to_dataframe()`` (with ``date``,
                ``train_pct`` and ``score_pct`` columns),
                ``get_percentage_milestones(step=...)`` and
                ``get_split_at_date(date)``.
            suggested_cutoff: Proposed cutoff; drawn as a dashed line when it
                falls inside the data's date range.
            current_cutoff: Registry cutoff; drawn as a dotted line when in
                range, otherwise only noted in the legend.
            title: NOTE(review) — currently unused; ``update_layout`` below
                hardcodes its own title text. Confirm whether callers expect
                this parameter to take effect.

        Returns:
            Plotly figure; a bare annotated figure when no temporal data exists.
        """
        df = cutoff_analysis.to_dataframe()
        if len(df) == 0:
            return go.Figure().add_annotation(text="No temporal data available", showarrow=False)

        # Get data date range to check if cutoffs are within bounds
        min_date = df["date"].min()
        max_date = df["date"].max()

        fig = go.Figure()

        # Add 100% baseline first (invisible, for fill reference)
        fig.add_trace(go.Scatter(
            x=df["date"], y=[100] * len(df), name="_baseline",
            mode="lines", line={"color": "rgba(0,0,0,0)", "width": 0},
            showlegend=False, hoverinfo="skip"
        ))

        # Score area fills from 100% down to train_pct line
        # (tonexty fills against the invisible baseline added just above).
        fig.add_trace(go.Scatter(
            x=df["date"], y=df["train_pct"], name="Score Set %",
            mode="lines", line={"color": self.colors["warning"], "width": 2},
            fill="tonexty", fillcolor="rgba(255, 193, 7, 0.3)",
            hovertemplate="Cutoff: %{x|%Y-%m-%d}<br>Score: %{customdata:.1f}%<extra></extra>",
            customdata=df["score_pct"], showlegend=True
        ))

        # Train area fills from train_pct down to 0
        fig.add_trace(go.Scatter(
            x=df["date"], y=df["train_pct"], name="Training Set %",
            mode="lines", line={"color": self.colors["success"], "width": 2},
            fill="tozeroy", fillcolor="rgba(40, 167, 69, 0.3)",
            hovertemplate="Cutoff: %{x|%Y-%m-%d}<br>Train: %{y:.1f}%<extra></extra>",
            showlegend=True
        ))

        # Labeled markers at every 5% of train share, as visual reference points.
        milestones = cutoff_analysis.get_percentage_milestones(step=5)
        if milestones:
            milestone_dates = [m["date"] for m in milestones]
            milestone_pcts = [m["train_pct"] for m in milestones]
            fig.add_trace(go.Scatter(
                x=milestone_dates, y=milestone_pcts, name="Train % Reference",
                mode="markers+text", marker={"size": 8, "color": self.colors["success"], "symbol": "circle"},
                text=[f"{int(p)}%" for p in milestone_pcts], textposition="top center",
                textfont={"size": 8, "color": self.colors["success"]},
                hovertemplate="Date: %{x|%Y-%m-%d}<br>Train: %{y:.0f}%<extra></extra>",
                showlegend=False
            ))

        # Add cutoff lines - only if within data range
        if suggested_cutoff:
            split = cutoff_analysis.get_split_at_date(suggested_cutoff)
            # Check if suggested cutoff is within data range
            if min_date <= suggested_cutoff <= max_date:
                fig.add_vline(
                    x=suggested_cutoff, line={"color": self.colors["info"], "dash": "dash", "width": 2}
                )
                # Add text annotation label on chart for selected cutoff
                fig.add_annotation(
                    x=suggested_cutoff, y=1.02, xref="x", yref="paper",
                    text=f"Selected: {suggested_cutoff.strftime('%Y-%m-%d')}",
                    showarrow=False, font={"size": 9, "color": self.colors["info"]},
                    xanchor="center", yanchor="bottom"
                )
                # Add legend entry with visible line sample (a null trace is the
                # standard plotly trick to get a legend swatch without data).
                fig.add_trace(go.Scatter(
                    x=[None], y=[None], mode="lines",
                    line={"color": self.colors["info"], "dash": "dash", "width": 2},
                    name=f"Selected: {suggested_cutoff.strftime('%Y-%m-%d')} ({split['train_pct']:.0f}% train)",
                    showlegend=True
                ))

        if current_cutoff:
            split = cutoff_analysis.get_split_at_date(current_cutoff)
            # Check if registry cutoff is within data range
            cutoff_in_range = min_date <= current_cutoff <= max_date
            # Determine if registry and selected cutoffs are at the same position
            same_as_selected = suggested_cutoff and current_cutoff == suggested_cutoff
            if cutoff_in_range:
                fig.add_vline(
                    x=current_cutoff, line={"color": self.colors["danger"], "dash": "dot", "width": 2}
                )
                # Add text annotation label on chart for registry cutoff
                # Offset vertically if same as selected to avoid overlap
                annotation_y = 1.08 if same_as_selected else 1.02
                fig.add_annotation(
                    x=current_cutoff, y=annotation_y, xref="x", yref="paper",
                    text=f"Registry: {current_cutoff.strftime('%Y-%m-%d')}",
                    showarrow=False, font={"size": 9, "color": self.colors["danger"]},
                    xanchor="center", yanchor="bottom"
                )
                legend_label = f"Registry: {current_cutoff.strftime('%Y-%m-%d')} ({split['train_pct']:.0f}% train)"
            else:
                # Registry cutoff is outside data range
                legend_label = f"Registry: {current_cutoff.strftime('%Y-%m-%d')} (outside data range)"
            # Add legend entry (added even out of range, so the registry value
            # is always reported to the user).
            fig.add_trace(go.Scatter(
                x=[None], y=[None], mode="lines",
                line={"color": self.colors["danger"], "dash": "dot", "width": 2},
                name=legend_label,
                showlegend=True
            ))

        fig.update_layout(
            title={"text": "Train/Score Split by Cutoff Date", "x": 0.5, "xanchor": "center"},
            width=800, height=300, autosize=False, template=self.theme, showlegend=True,
            legend={
                "orientation": "h", "yanchor": "top", "y": -0.15,
                "xanchor": "center", "x": 0.5, "bgcolor": "rgba(255,255,255,0.8)",
                "font": {"size": 9}
            },
            margin={"t": 40, "b": 60, "l": 55, "r": 55},
            yaxis={"title": "Percentage", "range": [0, 100]},
            xaxis={"title": ""},
        )

        return fig
2455
+
2456
    def recency_analysis_panel(
        self, retained_recency: np.ndarray, churned_recency: np.ndarray,
        bucket_stats: list, retained_median: float, churned_median: float,
        cap_value: Optional[float] = None
    ) -> go.Figure:
        """Build a 2x2 panel comparing recency distributions of retained vs churned entities.

        Layout: retained histogram (top-left), entity counts and target rate per
        recency bucket with a secondary y-axis (top-right), churned histogram
        (bottom-left), and overlaid KDE densities with a separation score
        (bottom-right).

        Args:
            retained_recency: Days-since-last-event values for retained entities.
            churned_recency: Days-since-last-event values for churned entities.
            bucket_stats: Objects exposing ``bucket_label``, ``entity_count``
                and ``target_rate`` (fraction) per recency bucket.
            retained_median: Median recency for retained entities (reference line).
            churned_median: Median recency for churned entities (reference line).
            cap_value: Upper bound for the shared x-axes; defaults to the
                observed maximum across both groups.
        """
        from plotly.subplots import make_subplots
        from scipy.stats import gaussian_kde
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=["Retained Distribution", "Target Rate by Recency",
                            "Churned Distribution", "Density Comparison"],
            row_heights=[0.5, 0.5], column_widths=[0.5, 0.5],
            horizontal_spacing=0.08, vertical_spacing=0.15,
            specs=[[{}, {"secondary_y": True}], [{}, {}]]
        )
        color_retained, color_churned = "rgba(46,204,113,0.7)", "rgba(231,76,60,0.7)"
        # Shared x-range across the distribution panes; 5% headroom past the cap.
        cap = cap_value or max(np.max(retained_recency), np.max(churned_recency))
        x_range = [0, cap * 1.05]
        fig.add_trace(go.Histogram(
            x=retained_recency, nbinsx=30, marker_color=color_retained, showlegend=False,
            hovertemplate="Days: %{x}<br>Count: %{y}<extra></extra>"
        ), row=1, col=1)
        fig.add_vline(x=retained_median, line_dash="solid", line_color="green",
                      annotation_text=f"Med: {retained_median:.0f}d", row=1, col=1)
        fig.add_trace(go.Histogram(
            x=churned_recency, nbinsx=30, marker_color=color_churned, showlegend=False,
            hovertemplate="Days: %{x}<br>Count: %{y}<extra></extra>"
        ), row=2, col=1)
        fig.add_vline(x=churned_median, line_dash="solid", line_color="red",
                      annotation_text=f"Med: {churned_median:.0f}d", row=2, col=1)
        if bucket_stats:
            # Top-right pane: volume as bars, target rate as a line on the
            # secondary y-axis.
            labels = [b.bucket_label for b in bucket_stats]
            counts = [b.entity_count for b in bucket_stats]
            rates = [b.target_rate * 100 for b in bucket_stats]
            fig.add_trace(go.Bar(
                x=labels, y=counts, name="Entity Count", marker_color="lightsteelblue", opacity=0.7,
                hovertemplate="Bucket: %{x}<br>Count: %{y}<extra></extra>"
            ), row=1, col=2)
            fig.add_trace(go.Scatter(
                x=labels, y=rates, mode="lines+markers", name="Target Rate %",
                line={"color": "red", "width": 3}, marker={"size": 8},
                hovertemplate="Bucket: %{x}<br>Rate: %{y:.1f}%<extra></extra>"
            ), row=1, col=2, secondary_y=True)
        x_density = np.linspace(0, cap, 200)
        # KDE needs more than a handful of points; skip the density pane otherwise.
        if len(retained_recency) > 5 and len(churned_recency) > 5:
            kde_retained = gaussian_kde(retained_recency, bw_method=0.3)
            kde_churned = gaussian_kde(churned_recency, bw_method=0.3)
            fig.add_trace(go.Scatter(
                x=x_density, y=kde_retained(x_density), mode="lines", name="Retained",
                line={"color": "green", "width": 2}, fill="tozeroy", fillcolor="rgba(46,204,113,0.3)",
                hovertemplate="Days: %{x:.0f}<br>Density: %{y:.4f}<extra></extra>"
            ), row=2, col=2)
            fig.add_trace(go.Scatter(
                x=x_density, y=kde_churned(x_density), mode="lines", name="Churned",
                line={"color": "red", "width": 2}, fill="tozeroy", fillcolor="rgba(231,76,60,0.3)",
                hovertemplate="Days: %{x:.0f}<br>Density: %{y:.4f}<extra></extra>"
            ), row=2, col=2)
            fig.add_vline(x=retained_median, line_dash="dash", line_color="green", line_width=1, row=2, col=2)
            fig.add_vline(x=churned_median, line_dash="dash", line_color="red", line_width=1, row=2, col=2)
            # Separation = 1 - density overlap; "x4"/"y4" reference the fourth
            # subplot (bottom-right) in plotly's row-major axis numbering.
            separation = self._compute_distribution_separation(kde_retained, kde_churned, x_density)
            fig.add_annotation(x=0.95, y=0.95, xref="x4 domain", yref="y4 domain",
                               text=f"Separation: {separation:.0%}", showarrow=False,
                               font={"size": 11}, bgcolor="rgba(255,255,255,0.8)", xanchor="right")
        fig.update_xaxes(range=x_range, row=1, col=1)
        fig.update_xaxes(range=x_range, row=2, col=1)
        fig.update_xaxes(range=x_range, row=2, col=2)
        fig.update_xaxes(title_text="Days Since Last Event", row=2, col=1)
        fig.update_xaxes(title_text="Recency Bucket", row=1, col=2)
        fig.update_xaxes(title_text="Days Since Last Event", row=2, col=2)
        fig.update_yaxes(title_text="Count", row=1, col=1)
        fig.update_yaxes(title_text="Count", row=2, col=1)
        fig.update_yaxes(title_text="Entity Count", row=1, col=2)
        fig.update_yaxes(title_text="Target Rate %", row=1, col=2, secondary_y=True)
        fig.update_yaxes(title_text="Density", row=2, col=2)
        fig.update_layout(
            title={"text": "Recency Analysis: Distribution Comparison & Target Rate", "x": 0.5},
            template=self.theme, height=550, showlegend=True, autosize=True,
            legend={"orientation": "h", "yanchor": "top", "y": -0.08, "xanchor": "center", "x": 0.5},
            margin={"l": 60, "r": 60, "t": 50, "b": 80}
        )
        return fig
2537
+
2538
+ def _compute_distribution_separation(self, kde1, kde2, x_values: np.ndarray) -> float:
2539
+ y1, y2 = kde1(x_values), kde2(x_values)
2540
+ overlap = np.trapezoid(np.minimum(y1, y2), x_values)
2541
+ return 1.0 - overlap
2542
+
2543
    def categorical_analysis_panel(
        self, insights: list, overall_rate: float, max_features: int = 6
    ) -> go.Figure:
        """Build a 2x2 panel summarizing categorical feature / target associations.

        Panes: Cramér's V per feature (top-left), effect-strength counts
        (top-right), high/low-risk category counts per feature (bottom-left),
        and per-category target rates for the strongest feature (bottom-right).

        Args:
            insights: Objects exposing ``feature_name``, ``cramers_v``,
                ``effect_strength``, ``high_risk_categories``,
                ``low_risk_categories`` and a ``category_stats`` DataFrame
                (with ``category`` and ``retention_rate`` columns).
            overall_rate: Overall target rate as a fraction (0-1); drawn as a
                reference line and used to color category bars.
            max_features: Cap on features displayed, ranked by Cramér's V.
        """
        from plotly.subplots import make_subplots
        if not insights:
            fig = go.Figure()
            fig.add_annotation(text="No categorical features to analyze", showarrow=False,
                               xref="paper", yref="paper", x=0.5, y=0.5, font={"size": 16})
            return fig
        # Keep only the strongest associations, strongest first.
        insights = sorted(insights, key=lambda x: x.cramers_v, reverse=True)[:max_features]
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=["Feature Association Strength (Cramér's V)", "Effect Strength Distribution",
                            "High/Low Risk Category Counts", "Top Feature: Category Target Rates"],
            row_heights=[0.5, 0.5], column_widths=[0.5, 0.5],
            horizontal_spacing=0.12, vertical_spacing=0.18
        )
        features = [i.feature_name for i in insights]
        cramers_values = [i.cramers_v for i in insights]
        # Top-Left: Strength gradient (red=strong >=0.3, orange=moderate >=0.1, light blue=weak)
        strength_colors = ["#c0392b" if v >= 0.3 else "#e67e22" if v >= 0.1 else "#85c1e9" for v in cramers_values]
        fig.add_trace(go.Bar(
            y=features, x=cramers_values, orientation="h", marker_color=strength_colors,
            hovertemplate="Feature: %{y}<br>Cramér's V: %{x:.3f}<extra></extra>", showlegend=False
        ), row=1, col=1)
        fig.add_vline(x=0.3, line_dash="dash", line_color="#c0392b", annotation_text="Strong",
                      annotation_position="top right", row=1, col=1)
        fig.add_vline(x=0.1, line_dash="dash", line_color="#e67e22", annotation_text="Moderate",
                      annotation_position="top left", row=1, col=1)
        # Top-Right: Count distribution (purple palette - distinct from strength colors)
        effect_counts = {"strong": 0, "moderate": 0, "weak": 0, "negligible": 0}
        for i in insights:
            effect_counts[i.effect_strength] = effect_counts.get(i.effect_strength, 0) + 1
        effect_labels = list(effect_counts.keys())
        effect_values = list(effect_counts.values())
        # Purple gradient for counts (darker = more significant category)
        count_colors = ["#6c3483", "#8e44ad", "#a569bd", "#d2b4de"]
        fig.add_trace(go.Bar(
            x=effect_labels, y=effect_values, marker_color=count_colors, showlegend=False,
            hovertemplate="Effect: %{x}<br>Count: %{y}<extra></extra>"
        ), row=1, col=2)
        # Bottom-Left: how many categories of each feature sit above/below the
        # overall rate enough to be flagged high or low risk.
        high_risk = [len(i.high_risk_categories) for i in insights]
        low_risk = [len(i.low_risk_categories) for i in insights]
        fig.add_trace(go.Bar(
            y=features, x=high_risk, orientation="h", name="High Risk Categories",
            marker_color="rgba(231,76,60,0.7)", hovertemplate="%{y}: %{x} high-risk<extra></extra>"
        ), row=2, col=1)
        fig.add_trace(go.Bar(
            y=features, x=low_risk, orientation="h", name="Low Risk Categories",
            marker_color="rgba(46,204,113,0.7)", hovertemplate="%{y}: %{x} low-risk<extra></extra>"
        ), row=2, col=1)
        # Bottom-Right: per-category rates for the single strongest feature.
        top_insight = insights[0]
        if not top_insight.category_stats.empty:
            stats = top_insight.category_stats.head(10)
            categories = stats["category"].astype(str).tolist()
            # NOTE(review): the column is named "retention_rate" but the hover
            # and axis label say "Target Rate" — confirm which semantic is
            # intended upstream.
            rates = (stats["retention_rate"] * 100).tolist()
            # Red when >10% below the overall rate, green when >10% above,
            # blue when within +/-10% of it.
            bar_colors = ["#e74c3c" if r < overall_rate * 100 * 0.9 else
                          "#2ecc71" if r > overall_rate * 100 * 1.1 else "#3498db" for r in rates]
            fig.add_trace(go.Bar(
                x=categories, y=rates, marker_color=bar_colors, showlegend=False,
                hovertemplate="Category: %{x}<br>Target Rate: %{y:.1f}%<extra></extra>"
            ), row=2, col=2)
            fig.add_hline(y=overall_rate * 100, line_dash="dash", line_color="gray",
                          annotation_text=f"Overall: {overall_rate*100:.1f}%", row=2, col=2)
        fig.update_xaxes(title_text="Cramér's V", row=1, col=1)
        fig.update_xaxes(title_text="Category Count", row=2, col=1)
        fig.update_xaxes(title_text="Category", row=2, col=2, tickangle=45)
        fig.update_yaxes(title_text="Feature", row=1, col=1)
        fig.update_yaxes(title_text="Feature", row=2, col=1)
        fig.update_yaxes(title_text="Target Rate %", row=2, col=2)
        fig.update_layout(
            title={"text": "Categorical Feature Analysis", "x": 0.5},
            template=self.theme, height=600, showlegend=True, autosize=True, barmode="group",
            legend={"orientation": "h", "yanchor": "top", "y": -0.1, "xanchor": "center", "x": 0.5},
            margin={"l": 120, "r": 60, "t": 60, "b": 100}
        )
        return fig