churnkit 0.75.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0
|
@@ -0,0 +1,2619 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
import plotly.express as px
|
|
6
|
+
import plotly.graph_objects as go
|
|
7
|
+
|
|
8
|
+
from customer_retention.core.compat import DataFrame, Series, ensure_pandas_series, to_pandas
|
|
9
|
+
|
|
10
|
+
from .number_formatter import NumberFormatter
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from customer_retention.stages.profiling.segment_analyzer import SegmentationResult
|
|
14
|
+
from customer_retention.stages.profiling.temporal_analyzer import TemporalAnalysis
|
|
15
|
+
from customer_retention.stages.temporal.cutoff_analyzer import CutoffAnalysis
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ChartBuilder:
|
|
19
|
+
DOW_NAMES = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
|
|
20
|
+
|
|
21
|
+
def __init__(self, theme: str = "plotly_white"):
|
|
22
|
+
self.theme = theme
|
|
23
|
+
self.colors = {
|
|
24
|
+
"primary": "#1f77b4",
|
|
25
|
+
"secondary": "#ff7f0e",
|
|
26
|
+
"success": "#2ca02c",
|
|
27
|
+
"warning": "#ffbb00",
|
|
28
|
+
"danger": "#d62728",
|
|
29
|
+
"info": "#17becf"
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
def _get_quality_colors(self, values: List[float], high: float = 80, mid: float = 60) -> List[str]:
|
|
33
|
+
return [
|
|
34
|
+
self.colors["success"] if v > high else self.colors["warning"] if v > mid else self.colors["danger"]
|
|
35
|
+
for v in values
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
def _get_iv_colors(self, iv_values: List[float]) -> List[str]:
|
|
39
|
+
return [
|
|
40
|
+
self.colors["danger"] if iv > 0.5 else
|
|
41
|
+
self.colors["success"] if iv > 0.3 else
|
|
42
|
+
self.colors["warning"] if iv > 0.1 else
|
|
43
|
+
self.colors["primary"]
|
|
44
|
+
for iv in iv_values
|
|
45
|
+
]
|
|
46
|
+
|
|
47
|
+
def _get_ks_colors(self, ks_values: List[float]) -> List[str]:
|
|
48
|
+
return [
|
|
49
|
+
self.colors["success"] if ks > 0.4 else
|
|
50
|
+
self.colors["warning"] if ks > 0.2 else
|
|
51
|
+
self.colors["primary"]
|
|
52
|
+
for ks in ks_values
|
|
53
|
+
]
|
|
54
|
+
|
|
55
|
+
def bar_chart(self, x: List[Any], y: List[Any], title: Optional[str] = None,
|
|
56
|
+
x_label: Optional[str] = None, y_label: Optional[str] = None,
|
|
57
|
+
horizontal: bool = False, color: Optional[str] = None) -> go.Figure:
|
|
58
|
+
marker_color = color or self.colors["primary"]
|
|
59
|
+
if horizontal:
|
|
60
|
+
fig = go.Figure(go.Bar(y=x, x=y, orientation="h", marker_color=marker_color))
|
|
61
|
+
else:
|
|
62
|
+
fig = go.Figure(go.Bar(x=x, y=y, marker_color=marker_color))
|
|
63
|
+
fig.update_layout(
|
|
64
|
+
title=title,
|
|
65
|
+
xaxis_title=x_label,
|
|
66
|
+
yaxis_title=y_label,
|
|
67
|
+
template=self.theme
|
|
68
|
+
)
|
|
69
|
+
return fig
|
|
70
|
+
|
|
71
|
+
def column_type_distribution(self, type_counts: Dict[str, int]) -> go.Figure:
|
|
72
|
+
if not type_counts:
|
|
73
|
+
return go.Figure()
|
|
74
|
+
fig = px.pie(
|
|
75
|
+
values=list(type_counts.values()),
|
|
76
|
+
names=list(type_counts.keys()),
|
|
77
|
+
title="Column Type Distribution",
|
|
78
|
+
hole=0.4
|
|
79
|
+
)
|
|
80
|
+
fig.update_layout(template=self.theme)
|
|
81
|
+
return fig
|
|
82
|
+
|
|
83
|
+
def data_quality_scorecard(self, quality_scores: Dict[str, float]) -> go.Figure:
|
|
84
|
+
columns = list(quality_scores.keys())
|
|
85
|
+
scores = list(quality_scores.values())
|
|
86
|
+
fig = go.Figure(go.Bar(y=columns, x=scores, orientation="h", marker_color=self._get_quality_colors(scores)))
|
|
87
|
+
fig.update_layout(
|
|
88
|
+
title="Data Quality Scores by Column",
|
|
89
|
+
xaxis_title="Quality Score (0-100)",
|
|
90
|
+
template=self.theme,
|
|
91
|
+
height=max(400, len(columns) * 25)
|
|
92
|
+
)
|
|
93
|
+
return fig
|
|
94
|
+
|
|
95
|
+
def missing_value_bars(self, null_percentages: Dict[str, float]) -> go.Figure:
|
|
96
|
+
columns = list(null_percentages.keys())
|
|
97
|
+
pcts = list(null_percentages.values())
|
|
98
|
+
colors = [self.colors["danger"] if p > 20 else self.colors["warning"] if p > 5 else self.colors["success"] for p in pcts]
|
|
99
|
+
fig = go.Figure(go.Bar(x=columns, y=pcts, marker_color=colors))
|
|
100
|
+
fig.update_layout(title="Missing Values by Column", yaxis_title="Missing %", template=self.theme)
|
|
101
|
+
return fig
|
|
102
|
+
|
|
103
|
+
def histogram_with_stats(self, series: Series, title: Optional[str] = None) -> go.Figure:
|
|
104
|
+
series = ensure_pandas_series(series)
|
|
105
|
+
clean = series.dropna()
|
|
106
|
+
mean_val = clean.mean()
|
|
107
|
+
median_val = clean.median()
|
|
108
|
+
fig = go.Figure()
|
|
109
|
+
fig.add_trace(go.Histogram(x=clean, nbinsx=30, name="Distribution"))
|
|
110
|
+
fig.add_vline(x=mean_val, line_dash="dash", line_color=self.colors["primary"], annotation_text=f"Mean: {mean_val:.2f}")
|
|
111
|
+
fig.add_vline(x=median_val, line_dash="dot", line_color=self.colors["secondary"], annotation_text=f"Median: {median_val:.2f}")
|
|
112
|
+
fig.update_layout(
|
|
113
|
+
title=title or f"Distribution of {series.name}",
|
|
114
|
+
xaxis_title=series.name,
|
|
115
|
+
yaxis_title="Count",
|
|
116
|
+
template=self.theme
|
|
117
|
+
)
|
|
118
|
+
return fig
|
|
119
|
+
|
|
120
|
+
def box_plot(self, series: Series, title: Optional[str] = None) -> go.Figure:
|
|
121
|
+
series = ensure_pandas_series(series)
|
|
122
|
+
fig = px.box(y=series.dropna(), title=title or f"Box Plot: {series.name}")
|
|
123
|
+
fig.update_layout(template=self.theme)
|
|
124
|
+
return fig
|
|
125
|
+
|
|
126
|
+
def outlier_visualization(self, series: Series, method: str = "iqr") -> go.Figure:
|
|
127
|
+
series = ensure_pandas_series(series)
|
|
128
|
+
clean = series.dropna().reset_index(drop=True)
|
|
129
|
+
q1, q3 = clean.quantile(0.25), clean.quantile(0.75)
|
|
130
|
+
iqr = q3 - q1
|
|
131
|
+
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
|
|
132
|
+
is_outlier = (clean < lower) | (clean > upper)
|
|
133
|
+
fig = go.Figure()
|
|
134
|
+
fig.add_trace(go.Scatter(x=clean[~is_outlier].index, y=clean[~is_outlier], mode="markers", name="Normal", marker_color=self.colors["primary"]))
|
|
135
|
+
fig.add_trace(go.Scatter(x=clean[is_outlier].index, y=clean[is_outlier], mode="markers", name="Outliers", marker_color=self.colors["danger"]))
|
|
136
|
+
fig.add_hline(y=upper, line_dash="dash", line_color="gray", annotation_text="Upper Bound")
|
|
137
|
+
fig.add_hline(y=lower, line_dash="dash", line_color="gray", annotation_text="Lower Bound")
|
|
138
|
+
fig.update_layout(title=f"Outlier Detection: {series.name}", template=self.theme)
|
|
139
|
+
return fig
|
|
140
|
+
|
|
141
|
+
def category_bar_chart(self, series: Series, top_n: int = 20) -> go.Figure:
|
|
142
|
+
series = ensure_pandas_series(series)
|
|
143
|
+
value_counts = series.value_counts().head(top_n)
|
|
144
|
+
fig = go.Figure(go.Bar(x=value_counts.index.astype(str), y=value_counts.values, marker_color=self.colors["primary"]))
|
|
145
|
+
fig.update_layout(
|
|
146
|
+
title=f"Top {top_n} Categories: {series.name}",
|
|
147
|
+
xaxis_title="Category",
|
|
148
|
+
yaxis_title="Count",
|
|
149
|
+
template=self.theme
|
|
150
|
+
)
|
|
151
|
+
return fig
|
|
152
|
+
|
|
153
|
+
def correlation_heatmap(self, df: DataFrame, method: str = "pearson") -> go.Figure:
|
|
154
|
+
df = to_pandas(df)
|
|
155
|
+
corr = df.corr(method=method)
|
|
156
|
+
fig = go.Figure(go.Heatmap(z=corr.values, x=corr.columns, y=corr.columns, colorscale="RdBu", zmid=0))
|
|
157
|
+
fig.update_layout(
|
|
158
|
+
title=f"Correlation Matrix ({method})",
|
|
159
|
+
template=self.theme,
|
|
160
|
+
height=max(400, len(corr.columns) * 25)
|
|
161
|
+
)
|
|
162
|
+
return fig
|
|
163
|
+
|
|
164
|
+
def target_correlation_bars(self, correlations: Dict[str, float], target_name: str) -> go.Figure:
|
|
165
|
+
cols = list(correlations.keys())
|
|
166
|
+
vals = list(correlations.values())
|
|
167
|
+
colors = [self.colors["success"] if v > 0 else self.colors["danger"] for v in vals]
|
|
168
|
+
fig = go.Figure(go.Bar(y=cols, x=vals, orientation="h", marker_color=colors))
|
|
169
|
+
fig.update_layout(
|
|
170
|
+
title=f"Correlation with Target: {target_name}",
|
|
171
|
+
xaxis_title="Correlation",
|
|
172
|
+
template=self.theme,
|
|
173
|
+
height=max(400, len(cols) * 25)
|
|
174
|
+
)
|
|
175
|
+
return fig
|
|
176
|
+
|
|
177
|
+
def roc_curve(self, fpr, tpr, auc_score: float) -> go.Figure:
|
|
178
|
+
fig = go.Figure()
|
|
179
|
+
fig.add_trace(go.Scatter(x=fpr, y=tpr, mode="lines", name=f"ROC (AUC={auc_score:.3f})"))
|
|
180
|
+
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode="lines", line_dash="dash", name="Random"))
|
|
181
|
+
fig.update_layout(
|
|
182
|
+
title="ROC Curve",
|
|
183
|
+
xaxis_title="False Positive Rate",
|
|
184
|
+
yaxis_title="True Positive Rate",
|
|
185
|
+
template=self.theme
|
|
186
|
+
)
|
|
187
|
+
return fig
|
|
188
|
+
|
|
189
|
+
def precision_recall_curve(
|
|
190
|
+
self,
|
|
191
|
+
precision,
|
|
192
|
+
recall,
|
|
193
|
+
pr_auc: float,
|
|
194
|
+
baseline: Optional[float] = None,
|
|
195
|
+
title: Optional[str] = None,
|
|
196
|
+
) -> go.Figure:
|
|
197
|
+
fig = go.Figure()
|
|
198
|
+
fig.add_trace(go.Scatter(
|
|
199
|
+
x=recall, y=precision,
|
|
200
|
+
mode="lines",
|
|
201
|
+
name=f"PR (AUC={pr_auc:.3f})",
|
|
202
|
+
line={"color": self.colors["primary"], "width": 2}
|
|
203
|
+
))
|
|
204
|
+
|
|
205
|
+
if baseline is not None:
|
|
206
|
+
fig.add_hline(
|
|
207
|
+
y=baseline,
|
|
208
|
+
line_dash="dash",
|
|
209
|
+
line_color="gray",
|
|
210
|
+
annotation_text=f"Baseline: {baseline:.2f}",
|
|
211
|
+
annotation_position="right"
|
|
212
|
+
)
|
|
213
|
+
|
|
214
|
+
fig.update_layout(
|
|
215
|
+
title=title or "Precision-Recall Curve",
|
|
216
|
+
xaxis_title="Recall",
|
|
217
|
+
yaxis_title="Precision",
|
|
218
|
+
xaxis_range=[0, 1],
|
|
219
|
+
yaxis_range=[0, 1.05],
|
|
220
|
+
template=self.theme
|
|
221
|
+
)
|
|
222
|
+
return fig
|
|
223
|
+
|
|
224
|
+
def model_comparison_grid(self, model_results: Dict[str, Dict[str, Any]], y_test: Any,
                          class_labels: Optional[List[str]] = None, title: Optional[str] = None) -> go.Figure:
    """Build a 3xN subplot grid comparing models side by side.

    Row 1: confusion matrices, row 2: ROC curves, row 3: precision-recall
    curves — one column per model. ``model_results`` maps a model name to a
    dict holding "y_pred" and "y_pred_proba" for that model.
    """
    from plotly.subplots import make_subplots
    model_names, n_models = list(model_results.keys()), len(model_results)
    class_labels = class_labels or ["0", "1"]
    # One title per subplot, row-major: truncated model name + the row's chart type.
    subplot_titles = [f"{name[:15]}<br>{row}" for row in ["Confusion Matrix", "ROC Curve", "Precision-Recall"] for name in model_names]
    fig = make_subplots(rows=3, cols=n_models, subplot_titles=subplot_titles, vertical_spacing=0.12, horizontal_spacing=0.08,
                        specs=[[{"type": "heatmap"} for _ in range(n_models)], [{"type": "xy"} for _ in range(n_models)], [{"type": "xy"} for _ in range(n_models)]])
    # Per-model line colors; wraps around if there are more models than colors.
    model_colors = [self.colors["primary"], self.colors["secondary"], self.colors["success"], self.colors["info"], self.colors["warning"]]
    # Positive-class rate, used as the no-skill reference for the PR row.
    baseline = np.mean(y_test)
    for i, model_name in enumerate(model_names):
        col, color = i + 1, model_colors[i % len(model_colors)]
        y_pred, y_pred_proba = model_results[model_name]["y_pred"], model_results[model_name]["y_pred_proba"]
        self._add_confusion_matrix_to_grid(fig, y_test, y_pred, class_labels, col)
        self._add_roc_curve_to_grid(fig, y_test, y_pred_proba, color, col, n_models)
        self._add_pr_curve_to_grid(fig, y_test, y_pred_proba, color, col, n_models, baseline)
    self._update_comparison_grid_axes(fig, n_models)
    fig.update_layout(title=title or "Model Comparison", height=300 * 3 + 100, width=350 * n_models + 50, template=self.theme, showlegend=False)
    return fig
|
|
243
|
+
|
|
244
|
+
def _add_confusion_matrix_to_grid(self, fig: go.Figure, y_test: Any, y_pred: Any, class_labels: List[str], col: int) -> None:
    """Add one model's confusion-matrix heatmap to row 1 of the comparison grid."""
    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(y_test, y_pred)
    # Row-normalize so each cell also shows the rate within its actual class.
    # NOTE(review): a class absent from y_test yields a zero row sum -> NaN
    # percentages here; confirm upstream guarantees every label appears.
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    # Cell text: raw count plus row percentage on a second line.
    cm_text = [[f"{cm[i][j]}<br>({cm_normalized[i][j]:.0%})" for j in range(len(class_labels))] for i in range(len(class_labels))]
    fig.add_trace(go.Heatmap(z=cm, x=class_labels, y=class_labels, colorscale="Blues", text=cm_text, texttemplate="%{text}",
                             textfont={"size": 11}, showscale=False, hovertemplate="Actual: %{y}<br>Predicted: %{x}<br>Count: %{z}<extra></extra>"), row=1, col=col)
|
|
251
|
+
|
|
252
|
+
def _add_roc_curve_to_grid(self, fig: go.Figure, y_test: Any, y_pred_proba: Any, color: str, col: int, n_models: int) -> None:
    """Add one model's ROC curve, random baseline, and AUC label to row 2."""
    from sklearn.metrics import roc_auc_score, roc_curve
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    auc = roc_auc_score(y_test, y_pred_proba)

    fig.add_trace(
        go.Scatter(
            x=fpr,
            y=tpr,
            mode="lines",
            line={"color": color, "width": 2},
            name=f"AUC={auc:.3f}",
            showlegend=False,
            hovertemplate="FPR: %{x:.2f}<br>TPR: %{y:.2f}<extra></extra>",
        ),
        row=2,
        col=col,
    )
    fig.add_trace(
        go.Scatter(
            x=[0, 1],
            y=[0, 1],
            mode="lines",
            line={"color": "gray", "width": 1, "dash": "dash"},
            showlegend=False,
            hoverinfo="skip",
        ),
        row=2,
        col=col,
    )

    # make_subplots numbers axes row-major, so this row-2 subplot's axis index
    # is always col + n_models. (The original's nested ternary evaluated to
    # exactly this value in every branch: col>1 gives col+n_models; col==1
    # gives n_models+1 or "x2" when n_models==1 — all equal to col+n_models.)
    axis_index = col + n_models
    fig.add_annotation(x=0.95, y=0.05, xref=f"x{axis_index}", yref=f"y{axis_index}",
                       text=f"AUC={auc:.3f}", showarrow=False,
                       font={"size": 11, "color": color},
                       bgcolor="rgba(255,255,255,0.8)", xanchor="right")
|
|
262
|
+
|
|
263
|
+
def _add_pr_curve_to_grid(self, fig: go.Figure, y_test: Any, y_pred_proba: Any, color: str, col: int, n_models: int, baseline: float) -> None:
    """Add one model's PR curve, no-skill baseline, and PR-AUC label to row 3."""
    from sklearn.metrics import average_precision_score, precision_recall_curve
    precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
    pr_auc = average_precision_score(y_test, y_pred_proba)

    fig.add_trace(
        go.Scatter(
            x=recall,
            y=precision,
            mode="lines",
            line={"color": color, "width": 2},
            name=f"PR-AUC={pr_auc:.3f}",
            showlegend=False,
            hovertemplate="Recall: %{x:.2f}<br>Precision: %{y:.2f}<extra></extra>",
        ),
        row=3,
        col=col,
    )
    fig.add_trace(
        go.Scatter(
            x=[0, 1],
            y=[baseline, baseline],
            mode="lines",
            line={"color": "gray", "width": 1, "dash": "dash"},
            showlegend=False,
            hoverinfo="skip",
        ),
        row=3,
        col=col,
    )

    # Row 3 axes come after two full rows of n_models axes each, so the index
    # is col + 2*n_models. That sum is always > 1, so the original's fallback
    # to the bare "x"/"y" axis could never trigger.
    axis_index = col + 2 * n_models
    fig.add_annotation(x=0.05, y=0.05, xref=f"x{axis_index}", yref=f"y{axis_index}",
                       text=f"PR-AUC={pr_auc:.3f}", showarrow=False,
                       font={"size": 11, "color": color},
                       bgcolor="rgba(255,255,255,0.8)", xanchor="left")
|
|
274
|
+
|
|
275
|
+
def _update_comparison_grid_axes(self, fig: go.Figure, n_models: int) -> None:
    """Label and bound every column's axes in the 3-row comparison grid."""
    for col in range(1, n_models + 1):
        # Row 1: confusion matrix axes.
        fig.update_xaxes(title_text="Predicted", row=1, col=col)
        fig.update_yaxes(title_text="Actual", row=1, col=col)
        # Row 2: ROC axes, clamped to the unit square (slight y headroom).
        fig.update_xaxes(title_text="FPR", row=2, col=col, range=[0, 1])
        fig.update_yaxes(title_text="TPR", row=2, col=col, range=[0, 1.02])
        # Row 3: precision-recall axes.
        fig.update_xaxes(title_text="Recall", row=3, col=col, range=[0, 1])
        fig.update_yaxes(title_text="Precision", row=3, col=col, range=[0, 1.05])
|
|
284
|
+
|
|
285
|
+
def confusion_matrix_heatmap(self, cm, labels: Optional[List[str]] = None) -> go.Figure:
    """Render a standalone confusion matrix as an annotated heatmap."""
    matrix = np.array(cm)
    # Default to numeric class labels when none are supplied.
    axis_labels = labels if labels is not None else [str(i) for i in range(len(matrix))]

    heat = go.Heatmap(
        z=matrix,
        x=axis_labels,
        y=axis_labels,
        colorscale="Blues",
        text=matrix,
        texttemplate="%{text}",
    )
    figure = go.Figure(heat)
    figure.update_layout(
        title="Confusion Matrix",
        xaxis_title="Predicted",
        yaxis_title="Actual",
        template=self.theme,
    )
    return figure
|
|
304
|
+
|
|
305
|
+
def feature_importance_plot(self, importance_df: DataFrame) -> go.Figure:
    """Horizontal bar chart of importances ('feature'/'importance' columns)."""
    importance_df = to_pandas(importance_df)
    bars = go.Bar(
        y=importance_df["feature"],
        x=importance_df["importance"],
        orientation="h",
        marker_color=self.colors["primary"],
    )
    figure = go.Figure(bars)
    figure.update_layout(
        title="Feature Importance",
        xaxis_title="Importance",
        template=self.theme,
        # Grow with the number of features so labels stay readable.
        height=max(400, len(importance_df) * 25),
    )
    return figure
|
|
320
|
+
|
|
321
|
+
def lift_curve(self, percentiles, lift_values) -> go.Figure:
    """Plot model lift per percentile against the lift=1 baseline."""
    figure = go.Figure()
    figure.add_trace(
        go.Scatter(x=percentiles, y=lift_values, mode="lines+markers", name="Model Lift")
    )
    # Lift of 1.0 means no improvement over random targeting.
    figure.add_hline(y=1, line_dash="dash", line_color="gray", annotation_text="Baseline")
    figure.update_layout(
        title="Lift Curve",
        xaxis_title="Percentile",
        yaxis_title="Lift",
        template=self.theme,
    )
    return figure
|
|
332
|
+
|
|
333
|
+
def time_series_plot(self, df: DataFrame, date_col: str, value_col: str) -> go.Figure:
    """Simple line chart of ``value_col`` over ``date_col``."""
    frame = to_pandas(df)
    figure = px.line(frame, x=date_col, y=value_col)
    figure.update_layout(title=f"{value_col} over Time", template=self.theme)
    return figure
|
|
338
|
+
|
|
339
|
+
def cohort_retention_heatmap(self, retention_matrix: DataFrame) -> go.Figure:
    """Heatmap of retention rates (rows = cohorts, columns = periods)."""
    retention_matrix = to_pandas(retention_matrix)
    heat = go.Heatmap(
        z=retention_matrix.values,
        x=retention_matrix.columns,
        y=retention_matrix.index,
        colorscale="Greens",
        # Cells render the rate as a percentage, rounded to 2 decimals first.
        text=np.round(retention_matrix.values, 2),
        texttemplate="%{text:.0%}",
    )
    figure = go.Figure(heat)
    figure.update_layout(
        title="Cohort Retention",
        xaxis_title="Months Since Start",
        yaxis_title="Cohort",
        template=self.theme,
    )
    return figure
|
|
356
|
+
|
|
357
|
+
def histogram(self, series: Series, title: Optional[str] = None, nbins: int = 30) -> go.Figure:
    """Histogram of a series; NaNs are dropped before binning."""
    series = ensure_pandas_series(series)
    figure = go.Figure(
        go.Histogram(x=series.dropna(), nbinsx=nbins, marker_color=self.colors["primary"])
    )
    figure.update_layout(
        title=title or f"Distribution of {series.name}",
        xaxis_title=series.name,
        yaxis_title="Count",
        template=self.theme,
    )
    return figure
|
|
367
|
+
|
|
368
|
+
def heatmap(self, z: Any, x_labels: List[str], y_labels: List[str],
            title: Optional[str] = None, colorscale: str = "RdBu") -> go.Figure:
    """Generic heatmap; diverging RdBu scales are centered on zero."""
    z_array = np.array(z) if not isinstance(z, np.ndarray) else z
    # Center the color scale at 0 only for the diverging default scale.
    midpoint = 0 if colorscale == "RdBu" else None
    figure = go.Figure(
        go.Heatmap(
            z=z_array,
            x=x_labels,
            y=y_labels,
            colorscale=colorscale,
            zmid=midpoint,
        )
    )
    figure.update_layout(
        title=title,
        template=self.theme,
        height=max(400, len(y_labels) * 25),
    )
    return figure
|
|
381
|
+
|
|
382
|
+
def scatter_matrix(
    self,
    df: DataFrame,
    title: Optional[str] = None,
    height: Optional[int] = None,
    width: Optional[int] = None,
    color_column: Optional[Series] = None,
    color_map: Optional[Dict[str, str]] = None,
) -> go.Figure:
    """Pairwise scatter matrix of df's columns, optionally colored by labels.

    Only the lower triangle is shown and the diagonal is hidden. When
    ``color_column`` is given, points are colored by its values using
    ``color_map`` (defaults to green/red Retained/Churned).
    """
    df = to_pandas(df)
    n_cols = len(df.columns)
    # Scale the default figure height with the number of dimensions.
    auto_height = max(500, n_cols * 150)

    if color_column is not None:
        plot_df = df.copy()
        # Temporary column so plotly express can color by the external series.
        plot_df["_color_"] = ensure_pandas_series(color_column).values
        default_colors = {"Retained": "#2ECC71", "Churned": "#E74C3C"}
        colors = color_map or default_colors
        fig = px.scatter_matrix(
            plot_df, dimensions=df.columns.tolist(), color="_color_",
            title=title, color_discrete_map=colors
        )
        fig.update_traces(marker=dict(opacity=0.6, size=5))
    else:
        fig = px.scatter_matrix(df, title=title)

    fig.update_layout(template=self.theme, height=height or auto_height, autosize=True)
    if width:
        fig.update_layout(width=width)
    fig.update_traces(diagonal_visible=False, showupperhalf=False)
    return fig
|
|
413
|
+
|
|
414
|
+
def multi_line_chart(self, data: List[Dict[str, Any]], x_key: str, y_key: str,
                     name_key: str, title: Optional[str] = None,
                     x_title: Optional[str] = None, y_title: Optional[str] = None) -> go.Figure:
    """Overlay one line per entry in ``data``, plus a dashed (0,0)-(1,1) diagonal.

    Each dict in ``data`` supplies its x values, y values, and display name
    under ``x_key`` / ``y_key`` / ``name_key``.
    NOTE(review): the "Random" diagonal is added unconditionally, which only
    makes sense for ROC-style curves on a [0, 1] domain — confirm all callers
    expect it before reusing this for general multi-line plots.
    """
    fig = go.Figure()
    for series in data:
        fig.add_trace(go.Scatter(
            x=series[x_key], y=series[y_key],
            mode="lines", name=series[name_key]
        ))
    fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode="lines", line_dash="dash",
                             line_color="gray", name="Random"))
    fig.update_layout(title=title, xaxis_title=x_title, yaxis_title=y_title, template=self.theme)
    return fig
|
|
427
|
+
|
|
428
|
+
def temporal_distribution(
    self,
    analysis: "TemporalAnalysis",
    title: Optional[str] = None,
    chart_type: str = "bar",
) -> go.Figure:
    """Bar (default) or line chart of record counts per period, with a mean line."""
    period_counts = analysis.period_counts
    if period_counts.empty:
        empty_fig = go.Figure()
        empty_fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
        return empty_fig

    periods = period_counts["period"].astype(str)
    counts = period_counts["count"]

    figure = go.Figure()
    if chart_type == "line":
        trace = go.Scatter(
            x=periods,
            y=counts,
            mode="lines+markers",
            line={"color": self.colors["primary"], "width": 2},
            marker={"size": 6},
            name="Record Count",
        )
    else:
        # Any value other than "line" falls back to bars.
        trace = go.Bar(
            x=periods,
            y=counts,
            marker_color=self.colors["primary"],
            name="Record Count",
        )
    figure.add_trace(trace)

    # Dashed reference at the mean period count.
    average = counts.mean()
    figure.add_hline(
        y=average,
        line_dash="dash",
        line_color=self.colors["secondary"],
        annotation_text=f"Avg: {average:.0f}",
        annotation_position="top right",
    )

    granularity_label = analysis.granularity.value.capitalize()
    figure.update_layout(
        title=title or f"Records by {granularity_label}",
        xaxis_title=granularity_label,
        yaxis_title="Count",
        template=self.theme,
        # Tilt labels when there are many periods to avoid overlap.
        xaxis_tickangle=-45 if len(periods) > 12 else 0,
    )
    return figure
|
|
478
|
+
|
|
479
|
+
def temporal_trend(
    self,
    analysis: "TemporalAnalysis",
    title: Optional[str] = None,
    show_trend: bool = True,
) -> go.Figure:
    """Line chart of counts per period with an optional linear trend overlay."""
    period_counts = analysis.period_counts
    if period_counts.empty:
        fig = go.Figure()
        fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
        return fig

    # Fit against ordinal positions; the period labels are display-only.
    x_values = list(range(len(period_counts)))
    x_labels = period_counts["period"].astype(str)
    y_values = period_counts["count"].values

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=x_labels, y=y_values,
        mode="lines+markers",
        line={"color": self.colors["primary"], "width": 2},
        marker={"size": 8},
        name="Actual"
    ))

    if show_trend and len(x_values) >= 2:
        # Degree-1 least-squares fit; slope sign drives direction label/color.
        z = np.polyfit(x_values, y_values, 1)
        trend_line = np.poly1d(z)(x_values)
        # Percent change of the fitted line end-to-end, guarded against /0.
        slope_pct = ((trend_line[-1] - trend_line[0]) / trend_line[0] * 100) if trend_line[0] != 0 else 0
        trend_direction = "increasing" if z[0] > 0 else "decreasing"
        trend_color = self.colors["success"] if z[0] > 0 else self.colors["danger"]

        fig.add_trace(go.Scatter(
            x=x_labels, y=trend_line,
            mode="lines",
            line={"color": trend_color, "width": 2, "dash": "dash"},
            name=f"Trend ({trend_direction}, {abs(slope_pct):.1f}%)"
        ))

    granularity_label = analysis.granularity.value.capitalize()
    default_title = f"Temporal Trend by {granularity_label}"
    fig.update_layout(
        title=title or default_title,
        xaxis_title=granularity_label,
        yaxis_title="Count",
        template=self.theme,
        # Tilt labels when there are many periods to avoid overlap.
        xaxis_tickangle=-45 if len(x_labels) > 12 else 0,
        showlegend=True
    )
    return fig
|
|
529
|
+
|
|
530
|
+
def temporal_heatmap(
    self,
    dates: Series,
    title: Optional[str] = None,
) -> go.Figure:
    """Bar chart of record counts by day of week (weekend bars highlighted).

    Despite the name, this renders a bar chart rather than a heatmap.
    Unparseable dates are silently dropped.
    """
    import pandas as pd
    parsed = pd.to_datetime(ensure_pandas_series(dates), errors="coerce").dropna()

    if len(parsed) == 0:
        empty_fig = go.Figure()
        empty_fig.add_annotation(text="No valid dates", x=0.5, y=0.5, showarrow=False)
        return empty_fig

    # Ensure all seven weekdays appear even when absent from the data.
    counts = parsed.dt.dayofweek.value_counts().reindex(range(7), fill_value=0)
    # Mon-Fri (0-4) in the info color; Sat/Sun in the warning color.
    bar_colors = [
        self.colors["info"] if day < 5 else self.colors["warning"] for day in range(7)
    ]

    figure = go.Figure(go.Bar(x=self.DOW_NAMES, y=counts.values, marker_color=bar_colors))
    figure.update_layout(
        title=title or "Records by Day of Week",
        xaxis_title="Day of Week",
        yaxis_title="Count",
        template=self.theme,
    )
    return figure
|
|
559
|
+
|
|
560
|
+
def year_month_heatmap(
    self,
    pivot_df: "DataFrame",
    title: Optional[str] = None,
) -> go.Figure:
    """Heatmap of counts with years on rows and months on columns."""
    pivot_df = to_pandas(pivot_df)
    if pivot_df.empty:
        empty_fig = go.Figure()
        empty_fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
        return empty_fig

    heat = go.Heatmap(
        z=pivot_df.values,
        x=pivot_df.columns.tolist(),
        y=pivot_df.index.astype(str).tolist(),
        colorscale="Blues",
        text=pivot_df.values,
        texttemplate="%{text:,}",
        textfont={"size": 10},
        hovertemplate="Year: %{y}<br>Month: %{x}<br>Count: %{z:,}<extra></extra>",
    )
    figure = go.Figure(heat)
    figure.update_layout(
        title=title or "Records by Year and Month",
        xaxis_title="Month",
        yaxis_title="Year",
        template=self.theme,
        # Give each year row fixed space plus room for the title.
        height=max(300, len(pivot_df) * 40 + 100),
    )
    return figure
|
|
590
|
+
|
|
591
|
+
def cumulative_growth_chart(
    self,
    cumulative_series: Series,
    title: Optional[str] = None,
) -> go.Figure:
    """Area-filled line chart of a cumulative count series over its index."""
    cumulative_series = ensure_pandas_series(cumulative_series)
    if len(cumulative_series) == 0:
        empty_fig = go.Figure()
        empty_fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
        return empty_fig

    figure = go.Figure()
    figure.add_trace(go.Scatter(
        # Stringify index entries (e.g. pandas Periods) for the x axis.
        x=[str(period) for period in cumulative_series.index],
        y=cumulative_series.values,
        mode="lines+markers",
        fill="tozeroy",
        line={"color": self.colors["primary"], "width": 2},
        marker={"size": 6},
        name="Cumulative Count",
    ))

    figure.update_layout(
        title=title or "Cumulative Records Over Time",
        xaxis_title="Period",
        yaxis_title="Cumulative Count",
        template=self.theme,
        xaxis_tickangle=-45,
    )
    return figure
|
|
622
|
+
|
|
623
|
+
def year_over_year_lines(
    self,
    pivot_df: "DataFrame",
    title: Optional[str] = None,
) -> go.Figure:
    """One line per year (rows of ``pivot_df``) across months (columns)."""
    pivot_df = to_pandas(pivot_df)
    if pivot_df.empty:
        empty_fig = go.Figure()
        empty_fig.add_annotation(text="No data available", x=0.5, y=0.5, showarrow=False)
        return empty_fig

    palette = px.colors.qualitative.Set1
    figure = go.Figure()

    for idx, year in enumerate(pivot_df.index):
        figure.add_trace(go.Scatter(
            x=pivot_df.columns.tolist(),
            y=pivot_df.loc[year].values,
            mode="lines+markers",
            name=str(year),
            # Cycle colors when there are more years than palette entries.
            line={"color": palette[idx % len(palette)], "width": 2},
            marker={"size": 8},
        ))

    figure.update_layout(
        title=title or "Year-over-Year Comparison",
        xaxis_title="Month",
        yaxis_title="Count",
        template=self.theme,
        showlegend=True,
        legend={"title": "Year"},
    )
    return figure
|
|
657
|
+
|
|
658
|
+
def growth_summary_indicators(
    self,
    growth_data: Dict[str, Any],
    title: Optional[str] = None,
) -> go.Figure:
    """Create growth summary with key indicators using compact number formatting.

    ``growth_data`` must provide "has_data", "overall_growth_pct",
    "avg_monthly_growth", "trend_slope", and "trend_direction".
    Returns an annotation-only card (both axes hidden).
    """
    if not growth_data.get("has_data"):
        fig = go.Figure()
        fig.add_annotation(text="Insufficient data", x=0.5, y=0.5, showarrow=False)
        return fig

    formatter = NumberFormatter()
    fig = go.Figure()

    # Define indicator positions: (x_center, label, value, unit suffix).
    indicators = [
        (0.15, "Overall Growth", growth_data["overall_growth_pct"], "%"),
        (0.5, "Avg Monthly", growth_data["avg_monthly_growth"], "%/mo"),
        (0.85, f"Trend: {growth_data['trend_direction'].upper()}", growth_data["trend_slope"], "/mo"),
    ]

    for x_pos, label, value, suffix in indicators:
        # Green for growth, red for decline; sign rendered explicitly since
        # the value is formatted from its absolute magnitude.
        color = self.colors["success"] if value >= 0 else self.colors["danger"]
        formatted_value = formatter.compact(abs(value))
        sign = "+" if value >= 0 else "-"
        display_text = f"{sign}{formatted_value}{suffix}"

        # Value annotation
        fig.add_annotation(
            x=x_pos, y=0.55,
            text=display_text,
            font={"size": 36, "color": color, "family": "Arial Black"},
            showarrow=False,
            xref="paper", yref="paper"
        )
        # Label annotation
        fig.add_annotation(
            x=x_pos, y=0.15,
            text=label,
            font={"size": 14, "color": "#666666"},
            showarrow=False,
            xref="paper", yref="paper"
        )

    fig.update_layout(
        title={"text": title or "Growth Summary", "font": {"size": 16}},
        template=self.theme,
        height=180,
        margin={"t": 60, "b": 20, "l": 20, "r": 20},
        xaxis={"visible": False},
        yaxis={"visible": False}
    )
    return fig
|
|
711
|
+
|
|
712
|
+
def segment_overview(
    self,
    result: "SegmentationResult",
    title: Optional[str] = None,
) -> go.Figure:
    """Create overview of segments showing size and target rate.

    Renders a pie of segment size percentages and, when any profile carries
    a target rate, a second panel with the target rate per segment.
    """
    from plotly.subplots import make_subplots

    profiles = result.profiles
    if not profiles:
        fig = go.Figure()
        fig.add_annotation(text="No segments found", x=0.5, y=0.5, showarrow=False)
        return fig

    segment_names = [f"Segment {p.segment_id}" for p in profiles]
    sizes = [p.size_pct for p in profiles]
    target_rates = [p.target_rate for p in profiles]
    # Only add the bar panel if at least one segment has a known target rate.
    has_target = any(tr is not None for tr in target_rates)

    fig = make_subplots(
        rows=1, cols=2 if has_target else 1,
        specs=[[{"type": "pie"}, {"type": "bar"}]] if has_target else [[{"type": "pie"}]],
        subplot_titles=["Segment Sizes", "Target Rate by Segment"] if has_target else ["Segment Sizes"],
    )

    # NOTE(review): slicing the palette caps it at len(Set2) colors; with more
    # segments the extras fall back to plotly defaults — confirm acceptable.
    colors = px.colors.qualitative.Set2[:len(profiles)]
    fig.add_trace(
        go.Pie(
            labels=segment_names,
            values=sizes,
            marker_colors=colors,
            textinfo="label+percent",
            hovertemplate="<b>%{label}</b><br>Size: %{value:.1f}%<extra></extra>",
        ),
        row=1, col=1
    )

    if has_target:
        # Unknown rates plotted as 0 to keep bars aligned with segment names.
        target_rates_clean = [tr if tr is not None else 0 for tr in target_rates]
        fig.add_trace(
            go.Bar(
                x=segment_names,
                y=[tr * 100 for tr in target_rates_clean],
                marker_color=colors,
                text=[f"{tr*100:.1f}%" for tr in target_rates_clean],
                textposition="outside",
                hovertemplate="<b>%{x}</b><br>Target Rate: %{y:.1f}%<extra></extra>",
            ),
            row=1, col=2
        )
        # Headroom above the tallest bar so the outside text labels fit.
        max_rate = max(target_rates_clean) * 100
        y_max = max_rate * 1.3 if max_rate > 0 else 10
        fig.update_yaxes(title_text="Target Rate (%)", row=1, col=2, range=[0, y_max])

    fig.update_layout(
        title=title or f"Segment Overview ({result.n_segments} segments)",
        template=self.theme,
        height=400,
        showlegend=False,
    )
    return fig
|
|
773
|
+
|
|
774
|
+
def segment_feature_comparison(
    self,
    result: "SegmentationResult",
    features: Optional[List[str]] = None,
    title: Optional[str] = None,
) -> go.Figure:
    """Compare feature distributions across segments using grouped bars.

    Args:
        result: Segmentation output whose profiles expose per-feature stats
            under ``defining_features`` (each entry a dict with a "mean" key).
        features: Explicit features to plot; defaults to the first 8
            (alphabetically) seen across all profiles.
        title: Optional figure title.

    Returns:
        Grouped bar chart: one bar group per feature, one color per segment.
    """
    profiles = result.profiles
    if not profiles:
        fig = go.Figure()
        fig.add_annotation(text="No segments found", x=0.5, y=0.5, showarrow=False)
        return fig

    # Union of features seen in any profile.
    candidate_features = set()
    for p in profiles:
        candidate_features.update(p.defining_features.keys())

    if features:
        selected = [f for f in features if f in candidate_features]
    else:
        selected = sorted(candidate_features)[:8]

    if not selected:
        fig = go.Figure()
        fig.add_annotation(text="No features to compare", x=0.5, y=0.5, showarrow=False)
        return fig

    # FIX: cycle through the palette instead of slicing it — the original
    # sliced Set2 to len(profiles) and then indexed colors[i], which raised
    # IndexError whenever there were more segments than palette entries (8).
    palette = px.colors.qualitative.Set2
    fig = go.Figure()

    for i, profile in enumerate(profiles):
        means = []
        for feat in selected:
            feat_data = profile.defining_features.get(feat, {})
            # Missing features default to 0 so every segment has a full group.
            means.append(feat_data.get("mean", 0))

        fig.add_trace(go.Bar(
            name=f"Segment {profile.segment_id}",
            x=list(selected),
            y=means,
            marker_color=palette[i % len(palette)],
        ))

    fig.update_layout(
        title=title or "Feature Comparison Across Segments",
        xaxis_title="Feature",
        yaxis_title="Mean Value",
        barmode="group",
        template=self.theme,
        height=400,
        legend={"title": "Segment"},
    )
    return fig
|
|
827
|
+
|
|
828
|
+
def segment_recommendation_card(
    self,
    result: "SegmentationResult",
    title: Optional[str] = None,
) -> go.Figure:
    """Display segmentation recommendation with rationale.

    Builds an annotation-only card: a colored headline for the recommendation,
    a confidence line, key metrics, and up to four rationale bullets.
    """
    # Map recommendation codes to traffic-light colors and display labels.
    recommendation_colors = {
        "single_model": self.colors["success"],
        "consider_segmentation": self.colors["warning"],
        "strong_segmentation": self.colors["danger"],
    }
    recommendation_labels = {
        "single_model": "Single Model Recommended",
        "consider_segmentation": "Consider Segmentation",
        "strong_segmentation": "Segmentation Strongly Recommended",
    }

    # Unknown codes fall back to the info color and the raw code as the label.
    rec_color = recommendation_colors.get(result.recommendation, self.colors["info"])
    rec_label = recommendation_labels.get(result.recommendation, result.recommendation)

    fig = go.Figure()

    # Recommendation header
    fig.add_annotation(
        x=0.5, y=0.85,
        text=rec_label,
        font={"size": 24, "color": rec_color, "family": "Arial Black"},
        showarrow=False,
        xref="paper", yref="paper"
    )

    # Confidence indicator
    fig.add_annotation(
        x=0.5, y=0.65,
        text=f"Confidence: {result.confidence*100:.0f}%",
        font={"size": 16, "color": "#666666"},
        showarrow=False,
        xref="paper", yref="paper"
    )

    # Key metrics; the target-variance term is omitted when unavailable.
    metrics_text = (
        f"Segments: {result.n_segments} | "
        f"Quality: {result.quality_score:.2f} | "
        f"Target Variance: {result.target_variance_ratio:.2f}"
        if result.target_variance_ratio is not None
        else f"Segments: {result.n_segments} | Quality: {result.quality_score:.2f}"
    )
    fig.add_annotation(
        x=0.5, y=0.48,
        text=metrics_text,
        font={"size": 14, "color": "#888888"},
        showarrow=False,
        xref="paper", yref="paper"
    )

    # Rationale (first four bullets only)
    rationale_text = "<br>".join(f"• {r}" for r in result.rationale[:4])
    fig.add_annotation(
        x=0.5, y=0.2,
        text=rationale_text,
        font={"size": 12, "color": "#666666"},
        showarrow=False,
        xref="paper", yref="paper",
        align="center"
    )

    fig.update_layout(
        title=title or "Segmentation Recommendation",
        template=self.theme,
        height=280,
        margin={"t": 50, "b": 20, "l": 20, "r": 20},
        xaxis={"visible": False, "range": [0, 1]},
        yaxis={"visible": False, "range": [0, 1]},
    )
    return fig
|
|
904
|
+
|
|
905
|
+
# =========================================================================
|
|
906
|
+
# Advanced Time Series Visualizations
|
|
907
|
+
# =========================================================================
|
|
908
|
+
|
|
909
|
+
def sparkline(
    self,
    values: List[float],
    title: Optional[str] = None,
    show_endpoints: bool = True,
    show_min_max: bool = True,
    height: int = 60,
    width: int = 200,
) -> go.Figure:
    """Create a compact sparkline for inline time series display.

    Sparklines are small, word-sized graphics that show trends at a glance.
    Ideal for dashboards and tables where space is limited.
    """
    positions = list(range(len(values)))

    figure = go.Figure()
    figure.add_trace(go.Scatter(
        x=positions,
        y=values,
        mode="lines",
        line={"color": self.colors["primary"], "width": 1.5},
        hoverinfo="y",
    ))

    has_span = len(values) >= 2
    if show_endpoints and has_span:
        # Mark the first and last points of the series.
        figure.add_trace(go.Scatter(
            x=[0, len(values) - 1],
            y=[values[0], values[-1]],
            mode="markers",
            marker={"color": self.colors["primary"], "size": 6},
            hoverinfo="y",
        ))

    if show_min_max and has_span:
        min_idx = int(np.argmin(values))
        max_idx = int(np.argmax(values))
        # Red dot for the minimum, green dot for the maximum.
        for idx, color_key, prefix in (
            (min_idx, "danger", "Min"),
            (max_idx, "success", "Max"),
        ):
            figure.add_trace(go.Scatter(
                x=[idx],
                y=[values[idx]],
                mode="markers",
                marker={"color": self.colors[color_key], "size": 5},
                hovertemplate=f"{prefix}: {values[idx]:.2f}<extra></extra>",
            ))

    figure.update_layout(
        title={"text": title, "font": {"size": 10}} if title else None,
        height=height,
        width=width,
        margin={"t": 20 if title else 5, "b": 5, "l": 5, "r": 5},
        xaxis={"visible": False},
        yaxis={"visible": False},
        showlegend=False,
        template=self.theme,
    )
    return figure
|
|
968
|
+
|
|
969
|
+
def sparkline_grid(
    self,
    data: Dict[str, List[float]],
    columns: int = 4,
    sparkline_height: int = 60,
    sparkline_width: int = 180,
) -> go.Figure:
    """Create a grid of sparklines for multiple time series comparison.

    Args:
        data: Mapping of series name -> list of values, laid out in insertion
            order, row-major across ``columns`` columns.
        columns: Number of sparklines per row.
        sparkline_height: Height budget per row, in pixels.
        sparkline_width: Width budget per column, in pixels.

    Returns:
        A subplot figure with one minimal line chart per series; each series'
        last point is marked green (rose overall) or red (fell overall).
    """
    from plotly.subplots import make_subplots

    names = list(data.keys())
    n_rows = (len(names) + columns - 1) // columns  # ceiling division

    fig = make_subplots(
        rows=n_rows, cols=columns,
        subplot_titles=names,
        vertical_spacing=0.15,
        horizontal_spacing=0.08,
    )

    for i, (name, values) in enumerate(data.items()):
        row, col = (i // columns) + 1, (i % columns) + 1
        x = list(range(len(values)))

        fig.add_trace(
            go.Scatter(x=x, y=values, mode="lines",
                       line={"color": self.colors["primary"], "width": 1.5},
                       showlegend=False),
            row=row, col=col
        )

        if len(values) >= 2:
            # Endpoint marker colored by the overall direction of the series.
            trend = values[-1] - values[0]
            color = self.colors["success"] if trend >= 0 else self.colors["danger"]
            fig.add_trace(
                go.Scatter(x=[len(values) - 1], y=[values[-1]], mode="markers",
                           marker={"color": color, "size": 6}, showlegend=False),
                row=row, col=col
            )

    fig.update_xaxes(visible=False)
    fig.update_yaxes(visible=False)
    fig.update_layout(
        height=n_rows * sparkline_height + 50,
        # FIX: sparkline_width was previously accepted but never used; size
        # the figure from it the same way height is sized from sparkline_height.
        width=columns * sparkline_width + 50,
        template=self.theme,
        margin={"t": 40, "b": 20},
    )
    return fig
|
|
1017
|
+
|
|
1018
|
+
def calendar_heatmap(
    self,
    dates: Series,
    values: Optional[Series] = None,
    title: Optional[str] = None,
    colorscale: str = "Blues",
) -> go.Figure:
    """Create a calendar heatmap showing patterns by day-of-week and week-of-year.

    Similar to GitHub contribution graphs. Shows temporal patterns at a glance.
    If values not provided, shows count of occurrences per day.

    Args:
        dates: Date-like values; unparseable entries become NaT and are dropped.
        values: Optional numeric series aligned with dates. When given, each
            cell shows the daily sum; otherwise the daily occurrence count.
        title: Chart title.
        colorscale: Plotly colorscale name.
    """
    import pandas as pd
    dates = ensure_pandas_series(dates)
    parsed = pd.to_datetime(dates, errors="coerce")

    # Reduce to one aggregate value per calendar day.
    if values is not None:
        values = ensure_pandas_series(values)
        df_cal = pd.DataFrame({"date": parsed, "value": values}).dropna()
        daily = df_cal.groupby(df_cal["date"].dt.date)["value"].sum()
    else:
        daily = parsed.dropna().dt.date.value_counts().sort_index()

    # Nothing parseable: return an annotated empty figure instead of failing.
    if len(daily) == 0:
        fig = go.Figure()
        fig.add_annotation(text="No valid dates", x=0.5, y=0.5, showarrow=False)
        return fig

    df_daily = pd.DataFrame({"date": pd.to_datetime(daily.index), "value": daily.values})
    df_daily["week"] = df_daily["date"].dt.isocalendar().week
    df_daily["year"] = df_daily["date"].dt.year
    df_daily["dow"] = df_daily["date"].dt.dayofweek
    # Zero-padded "YYYY-Wnn" keys sort lexicographically in calendar order.
    df_daily["year_week"] = df_daily["year"].astype(str) + "-W" + df_daily["week"].astype(str).str.zfill(2)

    # Rows = day of week (0..6), columns = week; summed per cell.
    pivot = df_daily.pivot_table(index="dow", columns="year_week", values="value", aggfunc="sum")

    fig = go.Figure(go.Heatmap(
        z=pivot.values,
        x=pivot.columns.tolist(),
        y=[self.DOW_NAMES[i] for i in pivot.index],
        colorscale=colorscale,
        hovertemplate="Week: %{x}<br>Day: %{y}<br>Value: %{z:,.0f}<extra></extra>",
    ))

    fig.update_layout(
        title=title or "Calendar Heatmap",
        xaxis_title="Week",
        yaxis_title="Day of Week",
        template=self.theme,
        height=250,
        # dtick=4: label every 4th week to keep the axis readable.
        xaxis={"tickangle": -45, "dtick": 4},
    )
    return fig
|
|
1071
|
+
|
|
1072
|
+
def monthly_calendar_heatmap(
    self,
    dates: Series,
    values: Optional[Series] = None,
    title: Optional[str] = None,
) -> go.Figure:
    """Create a month x day-of-week heatmap for pattern discovery.

    Args:
        dates: Date-like values; unparseable entries are coerced and dropped.
        values: Optional numeric series aligned with dates. When given, cells
            show the mean value per (day-of-week, month); otherwise counts.
        title: Chart title.
    """
    import pandas as pd
    dates = ensure_pandas_series(dates)
    parsed = pd.to_datetime(dates, errors="coerce").dropna()

    if values is not None:
        values = ensure_pandas_series(values)
        # DataFrame construction aligns on index, so rows removed from
        # `parsed` also drop their matching values before the pivot.
        df_cal = pd.DataFrame({"date": parsed, "value": values}).dropna()
        df_cal["month"] = df_cal["date"].dt.month
        df_cal["dow"] = df_cal["date"].dt.dayofweek
        pivot = df_cal.pivot_table(index="dow", columns="month", values="value", aggfunc="mean")
    else:
        df_cal = pd.DataFrame({"date": parsed})
        df_cal["month"] = df_cal["date"].dt.month
        df_cal["dow"] = df_cal["date"].dt.dayofweek
        # Occurrence count per (day-of-week, month) cell.
        pivot = df_cal.groupby(["dow", "month"]).size().unstack(fill_value=0)

    month_labels = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
                    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

    fig = go.Figure(go.Heatmap(
        z=pivot.values,
        x=[month_labels[i-1] for i in pivot.columns],  # month numbers are 1-based
        y=[self.DOW_NAMES[i] for i in pivot.index],
        colorscale="YlOrRd",
        hovertemplate="Month: %{x}<br>Day: %{y}<br>Value: %{z:,.1f}<extra></extra>",
    ))

    fig.update_layout(
        title=title or "Activity by Month and Day of Week",
        template=self.theme,
        height=280,
    )
    return fig
|
|
1112
|
+
|
|
1113
|
+
def time_series_with_anomalies(
    self,
    dates: Series,
    values: Series,
    window: int = 7,
    n_std: float = 2.0,
    title: Optional[str] = None,
) -> go.Figure:
    """Create time series plot with anomaly detection bands.

    Uses rolling mean ± n_std * rolling_std to define normal bounds.
    Points outside bounds are highlighted as anomalies.

    Args:
        dates: Date-like values; unparseable entries are coerced to NaT
            and dropped (consistent with the calendar heatmaps).
        values: Numeric series aligned with dates.
        window: Rolling window size (observations) for the baseline stats.
        n_std: Band half-width in rolling standard deviations.
        title: Optional chart title.
    """
    import pandas as pd
    dates = ensure_pandas_series(dates)
    values = ensure_pandas_series(values)

    # errors="coerce" matches the other date-based charts in this class:
    # bad dates become NaT and fall out in dropna() instead of raising.
    df = pd.DataFrame({"date": pd.to_datetime(dates, errors="coerce"), "value": values}).dropna()
    df = df.sort_values("date")

    # Guard: the anomaly-percentage division below raised ZeroDivisionError
    # on empty input before this check.
    if df.empty:
        fig = go.Figure()
        fig.add_annotation(text="No valid data", x=0.5, y=0.5, showarrow=False)
        fig.update_layout(title=title or "Time Series with Anomalies",
                          template=self.theme, height=400)
        return fig

    # min_periods=1 keeps the band defined at the series edges.
    df["rolling_mean"] = df["value"].rolling(window=window, center=True, min_periods=1).mean()
    df["rolling_std"] = df["value"].rolling(window=window, center=True, min_periods=1).std()
    df["upper"] = df["rolling_mean"] + n_std * df["rolling_std"]
    df["lower"] = df["rolling_mean"] - n_std * df["rolling_std"]
    df["is_anomaly"] = (df["value"] > df["upper"]) | (df["value"] < df["lower"])

    anomaly_count = df["is_anomaly"].sum()
    anomaly_pct = anomaly_count / len(df) * 100

    fig = go.Figure()

    # Confidence band: upper bound forward, lower bound reversed, so the
    # concatenated path closes into a fillable polygon.
    fig.add_trace(go.Scatter(
        x=pd.concat([df["date"], df["date"][::-1]]),
        y=pd.concat([df["upper"], df["lower"][::-1]]),
        fill="toself",
        fillcolor="rgba(31, 119, 180, 0.2)",
        line={"color": "rgba(255,255,255,0)"},
        name=f"Normal Range (±{n_std}σ)",
        hoverinfo="skip",
    ))

    # Rolling mean
    fig.add_trace(go.Scatter(
        x=df["date"], y=df["rolling_mean"],
        mode="lines",
        line={"color": self.colors["info"], "width": 1, "dash": "dash"},
        name="Rolling Mean",
    ))

    # Normal points
    normal = df[~df["is_anomaly"]]
    fig.add_trace(go.Scatter(
        x=normal["date"], y=normal["value"],
        mode="lines+markers",
        line={"color": self.colors["primary"], "width": 1.5},
        marker={"size": 4},
        name="Normal",
    ))

    # Anomaly points (only add the trace when there is something to show).
    anomalies = df[df["is_anomaly"]]
    if len(anomalies) > 0:
        fig.add_trace(go.Scatter(
            x=anomalies["date"], y=anomalies["value"],
            mode="markers",
            marker={"color": self.colors["danger"], "size": 10, "symbol": "x"},
            name=f"Anomalies ({anomaly_count})",
        ))

    fig.update_layout(
        title=title or f"Time Series with Anomalies ({anomaly_pct:.1f}% anomalous)",
        xaxis_title="Date",
        yaxis_title="Value",
        template=self.theme,
        height=400,
        legend={"orientation": "h", "y": -0.15},
    )
    return fig
|
|
1192
|
+
|
|
1193
|
+
def waterfall_chart(
    self,
    categories: List[str],
    values: List[float],
    title: Optional[str] = None,
    initial_label: str = "Start",
    final_label: str = "End",
) -> go.Figure:
    """Create a waterfall chart showing cumulative impact.

    Shows how sequential changes contribute to a final result.
    Useful for explaining score breakdowns or cumulative effects.

    Args:
        categories: Label for each incremental step.
        values: Signed contribution of each step (same length as categories).
        title: Chart title.
        initial_label: Label of the leading absolute bar (starts at 0).
        final_label: Label of the trailing total bar.
    """
    # One absolute start bar, one relative bar per step, one computed total.
    measures = ["absolute"] + ["relative"] * len(values) + ["total"]
    x_labels = [initial_label] + categories + [final_label]

    initial_value = 0
    cumulative = initial_value
    y_values = [initial_value]
    text_values = [f"{initial_value:,.0f}"]

    for v in values:
        y_values.append(v)
        cumulative += v
        # Show an explicit "+" on non-negative deltas.
        sign = "+" if v >= 0 else ""
        text_values.append(f"{sign}{v:,.0f}")

    y_values.append(cumulative)
    text_values.append(f"{cumulative:,.0f}")

    # Bar colors are driven entirely by the increasing/decreasing/totals
    # marker styles below; the old per-bar `colors` list was dead code
    # (built but never passed to the trace) and has been removed.
    fig = go.Figure(go.Waterfall(
        x=x_labels,
        y=y_values,
        measure=measures,
        text=text_values,
        textposition="outside",
        connector={"line": {"color": "gray", "width": 1, "dash": "dot"}},
        increasing={"marker": {"color": self.colors["success"]}},
        decreasing={"marker": {"color": self.colors["danger"]}},
        totals={"marker": {"color": self.colors["primary"]}},
    ))

    fig.update_layout(
        title=title or "Waterfall Chart",
        template=self.theme,
        height=400,
        showlegend=False,
    )
    return fig
|
|
1247
|
+
|
|
1248
|
+
def quality_waterfall(
    self,
    check_results: List[Dict[str, Any]],
    max_score: int = 100,
    title: Optional[str] = None,
) -> go.Figure:
    """Create a waterfall chart specifically for quality score breakdown.

    Shows how each check contributes to or detracts from the total score.
    A failed check subtracts its weight's share of max_score; passed checks
    contribute 0.

    Args:
        check_results: List of dicts with 'name', 'passed', 'weight' keys
        max_score: Maximum possible score (default 100)
        title: Chart title
    """
    # Hoist the loop-invariant total weight (previously recomputed inside
    # the loop for every failed check) and guard the division for the
    # degenerate case of an empty list or all-zero weights.
    total_weight = sum(c["weight"] for c in check_results)

    categories = []
    values = []
    for check in check_results:
        categories.append(check["name"])
        if check["passed"] or total_weight == 0:
            values.append(0)  # No penalty
        else:
            values.append(-check["weight"] * (max_score / total_weight))

    return self.waterfall_chart(
        categories=categories,
        values=values,
        title=title or "Quality Score Breakdown",
        initial_label="Max Score",
        final_label="Final Score",
    )
|
|
1281
|
+
|
|
1282
|
+
def velocity_acceleration_chart(
    self,
    data: Dict[str, Dict[str, List[float]]],
    title: Optional[str] = None,
) -> go.Figure:
    """Create side-by-side Value/Velocity/Acceleration chart for cohort comparison.

    Args:
        data: Dict with structure {column: {"retained": [...], "churned": [...], "velocity_retained": [...], ...}}
        title: Chart title
    """
    from plotly.subplots import make_subplots

    columns = list(data.keys())
    n_cols = len(columns)

    # make_subplots assigns subplot_titles row-major (row 1 left-to-right,
    # then row 2, ...), so the titles must be interleaved per column:
    # [colA-Value, colA-Velocity, colA-Accel, colB-Value, ...]. The previous
    # grouped ordering ([all Values] + [all Velocities] + [all Accels])
    # mislabelled every subplot whenever more than one column was plotted.
    subplot_titles = []
    for col in columns:
        subplot_titles.extend([
            f"{col[:12]} - Value",
            f"{col[:12]} - Velocity",
            f"{col[:12]} - Accel.",
        ])

    fig = make_subplots(
        rows=n_cols, cols=3,
        subplot_titles=subplot_titles,
        vertical_spacing=0.08,
        horizontal_spacing=0.08,
    )

    for i, col in enumerate(columns):
        row = i + 1
        col_data = data[col]

        # Column 1: raw value series per cohort. Legend entries only once.
        if "retained" in col_data:
            fig.add_trace(go.Scatter(
                y=col_data["retained"], mode="lines",
                line={"color": self.colors["success"], "width": 1.5},
                name="Retained", showlegend=(i == 0), legendgroup="retained"
            ), row=row, col=1)
        if "churned" in col_data:
            fig.add_trace(go.Scatter(
                y=col_data["churned"], mode="lines",
                line={"color": self.colors["danger"], "width": 1.5},
                name="Churned", showlegend=(i == 0), legendgroup="churned"
            ), row=row, col=1)

        # Column 2: velocity series, with a zero reference line.
        if "velocity_retained" in col_data:
            fig.add_trace(go.Scatter(
                y=col_data["velocity_retained"], mode="lines",
                line={"color": self.colors["success"], "width": 1.5},
                showlegend=False, legendgroup="retained"
            ), row=row, col=2)
        if "velocity_churned" in col_data:
            fig.add_trace(go.Scatter(
                y=col_data["velocity_churned"], mode="lines",
                line={"color": self.colors["danger"], "width": 1.5},
                showlegend=False, legendgroup="churned"
            ), row=row, col=2)
        fig.add_hline(y=0, line_dash="dot", line_color="gray", row=row, col=2)

        # Column 3: acceleration series, with a zero reference line.
        if "accel_retained" in col_data:
            fig.add_trace(go.Scatter(
                y=col_data["accel_retained"], mode="lines",
                line={"color": self.colors["success"], "width": 1.5},
                showlegend=False, legendgroup="retained"
            ), row=row, col=3)
        if "accel_churned" in col_data:
            fig.add_trace(go.Scatter(
                y=col_data["accel_churned"], mode="lines",
                line={"color": self.colors["danger"], "width": 1.5},
                showlegend=False, legendgroup="churned"
            ), row=row, col=3)
        fig.add_hline(y=0, line_dash="dot", line_color="gray", row=row, col=3)

    # Sparkline style: suppress tick labels everywhere.
    fig.update_xaxes(showticklabels=False)
    fig.update_yaxes(showticklabels=False)
    fig.update_layout(
        height=150 * n_cols + 80,
        title=title or "Value → Velocity → Acceleration",
        template=self.theme,
        legend={"orientation": "h", "y": 1.02, "x": 0.5, "xanchor": "center"},
        margin={"t": 100},
    )
    return fig
|
|
1365
|
+
|
|
1366
|
+
def _create_effect_heatmap_trace(self, metric_data: Dict, variables: List[str], windows: List[str], show_colorbar: bool) -> go.Heatmap:
    """Build a diverging effect-size heatmap trace (variables x windows).

    Missing (variable, window) cells default to 0; the color scale is
    centred on 0 and clipped to [-1, 1].
    """
    matrix = []
    cell_labels = []
    for var in variables:
        per_window = [metric_data.get(var, {}).get(w, 0) for w in windows]
        matrix.append(per_window)
        cell_labels.append([f"{cell:.2f}" for cell in per_window])
    return go.Heatmap(
        z=matrix,
        x=windows,
        y=[name[:15] for name in variables],
        colorscale="RdBu_r",
        zmid=0, zmin=-1, zmax=1,
        text=cell_labels,
        texttemplate="%{text}",
        textfont={"size": 10},
        showscale=show_colorbar,
        colorbar={"title": "Cohen's d"} if show_colorbar else None,
    )
|
|
1375
|
+
|
|
1376
|
+
def velocity_signal_heatmap(self, data: Dict[str, Dict[str, Dict[str, float]]], title: Optional[str] = None) -> go.Figure:
    """Render stacked velocity (top) and acceleration (bottom) effect-size heatmaps.

    Args:
        data: {"velocity": {var: {window: d}}, "acceleration": {var: {window: d}}};
            either metric may be absent.
        title: Chart title.
    """
    from plotly.subplots import make_subplots
    vel_data, accel_data = data.get("velocity", {}), data.get("acceleration", {})
    if not vel_data and not accel_data:
        # No signal at all: empty, titled figure.
        fig = go.Figure()
        fig.update_layout(title=title or "No data", template=self.theme)
        return fig
    variables = list(vel_data.keys()) or list(accel_data.keys())
    # Derive window labels from whichever metric has data. Previously they
    # came only from velocity, so an acceleration-only payload produced an
    # empty window axis (blank heatmaps).
    window_source = vel_data or accel_data
    windows = list(next(iter(window_source.values())).keys())
    fig = make_subplots(
        rows=2, cols=1, subplot_titles=["Velocity Effect Size (d)", "Acceleration Effect Size (d)"],
        vertical_spacing=0.15
    )
    for row_idx, metric_data in enumerate([vel_data, accel_data], start=1):
        # Only the bottom heatmap carries the shared colorbar.
        fig.add_trace(
            self._create_effect_heatmap_trace(metric_data, variables, windows, row_idx == 2),
            row=row_idx, col=1
        )
    fig.update_layout(
        title=title or "Velocity & Acceleration Signal Strength",
        height=max(400, len(variables) * 80 + 200), template=self.theme
    )
    return fig
|
|
1399
|
+
|
|
1400
|
+
def cohort_velocity_sparklines(self, results: List[Any], feature_name: str, title: Optional[str] = None) -> go.Figure:
    """Render a 6 x n_windows sparkline grid: velocity (rows 1-3) and
    acceleration (rows 4-6) per cohort, one column per time window.

    Args:
        results: Per-window result objects exposing retained/churned/overall
            velocity and acceleration series (and optionally period_label /
            window_days attributes).
        feature_name: Used (bold) in the default figure title.
        title: Optional title override.
    """
    from plotly.subplots import make_subplots
    if not results:
        # Nothing to plot: return an empty, titled figure rather than failing.
        fig = go.Figure()
        fig.update_layout(title=title or f"{feature_name} - No data", template=self.theme)
        return fig
    n_windows = len(results)
    # Prefer an explicit period label; fall back to "<window_days>d".
    col_titles = [getattr(r, "period_label", f"{r.window_days}d") for r in results]
    row_titles = ["Retained", "Churned", "Overall", "Retained", "Churned", "Overall"]
    fig = make_subplots(
        rows=6, cols=n_windows, row_titles=row_titles, column_titles=col_titles,
        vertical_spacing=0.06, horizontal_spacing=0.03,
        row_heights=[1, 1, 1, 1, 1, 1]
    )
    # (line color, translucent fill) per cohort.
    styles = {
        "retained": (self.colors["success"], "rgba(44, 160, 44, 0.2)"),
        "churned": (self.colors["danger"], "rgba(214, 39, 40, 0.2)"),
        "overall": (self.colors["info"], "rgba(23, 190, 207, 0.2)"),
    }
    for col_idx, r in enumerate(results, start=1):
        # Rows 1-3: velocity; rows 4-6: acceleration.
        self._add_velocity_sparkline(fig, r.retained_velocity, styles["retained"], 1, col_idx)
        self._add_velocity_sparkline(fig, r.churned_velocity, styles["churned"], 2, col_idx)
        self._add_velocity_sparkline(fig, r.overall_velocity, styles["overall"], 3, col_idx)
        self._add_velocity_sparkline(fig, r.retained_accel, styles["retained"], 4, col_idx)
        self._add_velocity_sparkline(fig, r.churned_accel, styles["churned"], 5, col_idx)
        self._add_velocity_sparkline(fig, r.overall_accel, styles["overall"], 6, col_idx)
    # Sparkline style: strip all axis chrome.
    fig.update_xaxes(showticklabels=False, showgrid=False)
    fig.update_yaxes(showticklabels=False, showgrid=False)
    fig.update_layout(
        title=title or f"<b>{feature_name}</b>",
        height=520, template=self.theme,
        margin={"t": 60, "b": 20, "l": 80, "r": 70}
    )
    # Rotated section labels for the velocity (top) and acceleration
    # (bottom) halves of the grid.
    fig.add_annotation(
        text="<b>Velocity</b>", textangle=-90, xref="paper", yref="paper",
        x=-0.06, y=0.77, showarrow=False, font={"size": 12}
    )
    fig.add_annotation(
        text="<b>Acceleration</b>", textangle=-90, xref="paper", yref="paper",
        x=-0.06, y=0.23, showarrow=False, font={"size": 12}
    )
    return fig
|
|
1442
|
+
|
|
1443
|
+
def _add_velocity_sparkline(
    self, fig: go.Figure, data: List[float], style: tuple, row: int, col: int
) -> None:
    """Append one filled sparkline trace to a subplot cell; no-op on empty data.

    Args:
        fig: Target subplot figure.
        data: Series to draw.
        style: (line color, fill color) pair.
        row: 1-based subplot row.
        col: 1-based subplot column.
    """
    if not data:
        return
    line_color, fill_color = style
    trace = go.Scatter(
        y=data,
        mode="lines",
        line={"color": line_color, "width": 1.5},
        fill="tozeroy",
        fillcolor=fill_color,
        showlegend=False,
    )
    fig.add_trace(trace, row=row, col=col)
|
|
1453
|
+
|
|
1454
|
+
def lag_correlation_heatmap(self, data: Dict[str, List[float]], max_lag: int = 14, title: Optional[str] = None) -> go.Figure:
    """Plot per-variable autocorrelation values as a heatmap over lags 1..max_lag.

    Args:
        data: {column: [corr at lag 1, corr at lag 2, ...]}; series longer
            than max_lag are truncated.
        max_lag: Number of lags to display.
        title: Chart title.
    """
    columns = list(data.keys())
    z_values = [data[col][:max_lag] for col in columns]
    lag_labels = [f"Lag {i}" for i in range(1, max_lag + 1)]

    fig = go.Figure(go.Heatmap(
        z=z_values,
        x=lag_labels,
        y=[col[:15] for col in columns],  # truncate long names for the axis
        colorscale="RdBu_r",
        zmid=0,  # diverging scale centred on zero correlation
        text=[[f"{v:.2f}" for v in row] for row in z_values],
        texttemplate="%{text}",
        textfont={"size": 9},
        colorbar={"title": "Correlation"},
    ))

    fig.update_layout(
        title=title or "Autocorrelation by Lag",
        xaxis_title="Lag (periods)",
        yaxis_title="Variable",
        template=self.theme,
        height=50 * len(columns) + 150,  # scale height with variable count
    )
    return fig
|
|
1479
|
+
|
|
1480
|
+
def predictive_power_chart(
    self,
    iv_values: Dict[str, float],
    ks_values: Dict[str, float],
    title: Optional[str] = None,
) -> go.Figure:
    """Create side-by-side IV and KS statistic bar charts.

    Args:
        iv_values: Dict with {column: iv_value}
        ks_values: Dict with {column: ks_value}; columns missing here
            default to 0.
        title: Chart title
    """
    from plotly.subplots import make_subplots

    # Sort by IV (descending) so the strongest variables appear first.
    sorted_cols = sorted(iv_values.keys(), key=lambda x: iv_values[x], reverse=True)
    ivs = [iv_values[c] for c in sorted_cols]
    kss = [ks_values.get(c, 0) for c in sorted_cols]
    col_labels = [c[:15] for c in sorted_cols]

    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=("Information Value (IV)", "KS Statistic"),
    )

    # IV panel, with reference threshold lines at 0.3 (green) and 0.1 (orange).
    fig.add_trace(go.Bar(
        x=col_labels, y=ivs, marker_color=self._get_iv_colors(ivs), name="IV"
    ), row=1, col=1)
    fig.add_hline(y=0.3, line_dash="dash", line_color="green", row=1, col=1)
    fig.add_hline(y=0.1, line_dash="dash", line_color="orange", row=1, col=1)

    # KS panel, with reference threshold lines at 0.4 (green) and 0.2 (orange).
    fig.add_trace(go.Bar(
        x=col_labels, y=kss, marker_color=self._get_ks_colors(kss), name="KS"
    ), row=1, col=2)
    fig.add_hline(y=0.4, line_dash="dash", line_color="green", row=1, col=2)
    fig.add_hline(y=0.2, line_dash="dash", line_color="orange", row=1, col=2)

    fig.update_layout(
        title=title or "Variable Predictive Power",
        template=self.theme,
        height=400,
        showlegend=False,
    )
    fig.update_xaxes(tickangle=45)
    return fig
|
|
1526
|
+
|
|
1527
|
+
def momentum_comparison_chart(
    self,
    data: Dict[str, Dict[str, float]],
    title: Optional[str] = None,
    window_label: Optional[str] = None,
) -> go.Figure:
    """Dispatch to the appropriate momentum chart builder.

    Inspects the first feature's payload: if it carries plain
    "retained"/"churned" keys, the single grouped chart is built;
    otherwise the multi-window variant is used.
    """
    feature_names = list(data)
    sample = data[feature_names[0]] if feature_names else {}
    if "retained" in sample or "churned" in sample:
        return self._create_simple_momentum_chart(data, feature_names, title, window_label)
    return self._create_multi_window_momentum_chart(data, feature_names, title, window_label)
|
|
1540
|
+
|
|
1541
|
+
def _create_simple_momentum_chart(
    self, data: Dict, columns: List[str], title: Optional[str], window_label: Optional[str]
) -> go.Figure:
    """Grouped Retained-vs-Churned momentum bars for a single window.

    Missing cohort values default to 1 (the no-change baseline).

    Args:
        data: {column: {"retained": ratio, "churned": ratio}}.
        columns: Column order for the x axis.
        title: Optional title override.
        window_label: Appended to the default title when given.
    """
    col_labels = [c[:15] for c in columns]  # truncate names for the x axis
    fig = go.Figure()
    fig.add_trace(go.Bar(
        name="Retained", x=col_labels,
        y=[data[c].get("retained", 1) for c in columns],
        marker_color=self.colors["success"],
    ))
    fig.add_trace(go.Bar(
        name="Churned", x=col_labels,
        y=[data[c].get("churned", 1) for c in columns],
        marker_color=self.colors["danger"],
    ))
    # Momentum of 1.0 means no change; mark it as the reference line.
    fig.add_hline(y=1.0, line_dash="dash", line_color="gray",
                  annotation_text="baseline", annotation_position="right")
    chart_title = title or f"Momentum Comparison{f' ({window_label})' if window_label else ''}"
    fig.update_layout(
        title=chart_title, template=self.theme, height=450, barmode="group",
        legend={"orientation": "h", "y": -0.15, "x": 0.5, "xanchor": "center"},
        xaxis_title="Feature", yaxis_title="Momentum (>1 = increasing, <1 = decreasing)",
        margin={"b": 100},
    )
    return fig
|
|
1566
|
+
|
|
1567
|
+
def _create_multi_window_momentum_chart(
    self, data: Dict, columns: List[str], title: Optional[str], window_label: Optional[str]
) -> go.Figure:
    """Two-panel momentum chart: 7/30-day ratios (left), 30/90-day ratios (right).

    Reads the keys "retained_7_30"/"churned_7_30" and
    "retained_30_90"/"churned_30_90" per column; _add_momentum_cohort_bars
    falls back to plain "retained"/"churned" when a windowed key is missing.
    """
    from plotly.subplots import make_subplots

    col_labels = [c[:15] for c in columns]
    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=(window_label or "Short/Medium", "Medium/Long"),
    )
    self._add_momentum_cohort_bars(
        fig, col_labels, columns, data,
        retained_key="retained_7_30", churned_key="churned_7_30",
        col=1, show_legend=True,
    )
    # Legend entries were already emitted by the first panel.
    self._add_momentum_cohort_bars(
        fig, col_labels, columns, data,
        retained_key="retained_30_90", churned_key="churned_30_90",
        col=2, show_legend=False,
    )
    fig.update_layout(
        title=title or "Momentum by Retention Status",
        template=self.theme, height=450, barmode="group",
        legend={"orientation": "h", "y": -0.15, "x": 0.5, "xanchor": "center"},
        margin={"b": 100},
    )
    return fig
|
|
1594
|
+
|
|
1595
|
+
def _add_momentum_cohort_bars(
    self, fig: go.Figure, col_labels: List[str], columns: List[str],
    data: Dict, retained_key: str, churned_key: str, col: int, show_legend: bool
) -> None:
    """Add a Retained/Churned bar pair plus the y=1 baseline to one subplot column.

    Each bar height is looked up under the windowed key first, then the
    plain cohort key, then defaults to 1 (no change).
    """
    cohort_specs = [
        ("Retained", retained_key, "retained", self.colors["success"]),
        ("Churned", churned_key, "churned", self.colors["danger"]),
    ]
    for label, primary_key, fallback_key, bar_color in cohort_specs:
        heights = [data[c].get(primary_key, data[c].get(fallback_key, 1)) for c in columns]
        fig.add_trace(go.Bar(
            name=label,
            x=col_labels,
            y=heights,
            marker_color=bar_color,
            showlegend=show_legend,
        ), row=1, col=col)
    fig.add_hline(y=1.0, line_dash="dash", line_color="gray", row=1, col=col)
|
|
1610
|
+
|
|
1611
|
+
def cohort_sparklines(
    self,
    data: Dict[str, Dict[str, List[float]]],
    feature_name: str,
    period_effects: Optional[Dict[str, float]] = None,
) -> go.Figure:
    """Create 3x3 sparkline grid: cohorts (rows) × time periods (cols) for one feature.

    Args:
        data: {"retained"|"churned"|"overall": {"weekly"|"monthly"|"yearly": [...]}};
            missing cohorts or periods simply leave their cell empty.
        feature_name: Shown (bold) as the figure title.
        period_effects: Optional per-period effect sizes appended to the
            column titles.
    """
    from plotly.subplots import make_subplots

    cohorts = ["retained", "churned", "overall"]
    periods = ["weekly", "monthly", "yearly"]
    row_titles = ["Retained", "Churned", "Overall"]
    col_titles = self._build_period_titles(periods, period_effects)

    fig = make_subplots(
        rows=3, cols=3,
        row_titles=row_titles,
        column_titles=col_titles,
        vertical_spacing=0.08,
        horizontal_spacing=0.06,
    )

    # (line color, translucent fill) per cohort row.
    styles = {
        "retained": (self.colors["success"], "rgba(44, 160, 44, 0.2)"),
        "churned": (self.colors["danger"], "rgba(214, 39, 40, 0.2)"),
        "overall": (self.colors["info"], "rgba(23, 190, 207, 0.2)"),
    }

    for row_idx, cohort in enumerate(cohorts):
        if cohort not in data:
            continue
        color, fill = styles[cohort]
        for col_idx, period in enumerate(periods):
            if period in data[cohort]:
                fig.add_trace(go.Scatter(
                    y=data[cohort][period], mode="lines",
                    line={"color": color, "width": 1.5},
                    fill="tozeroy", fillcolor=fill, showlegend=False,
                ), row=row_idx + 1, col=col_idx + 1)

    # Sparkline style: hide all axis chrome.
    fig.update_xaxes(showticklabels=False, showgrid=False)
    fig.update_yaxes(showticklabels=False, showgrid=False)
    fig.update_layout(
        title=f"<b>{feature_name}</b>",
        height=280,
        template=self.theme,
        margin={"t": 50, "b": 20, "l": 70, "r": 20},
    )
    return fig
|
|
1660
|
+
|
|
1661
|
+
def _build_period_titles(self, periods: List[str], effects: Optional[Dict[str, float]]) -> List[str]:
|
|
1662
|
+
labels = {"weekly": "Weekly", "monthly": "Monthly", "yearly": "Yearly"}
|
|
1663
|
+
if not effects:
|
|
1664
|
+
return [labels[p] for p in periods]
|
|
1665
|
+
return [f"{labels[p]} (d={effects.get(p, 0):.2f})" for p in periods]
|
|
1666
|
+
|
|
1667
|
+
def analyze_cohort_trends(
    self,
    data: Dict[str, Dict[str, List[float]]],
    feature_name: str,
) -> Dict[str, Any]:
    """Analyze separation between retained and churned trends across time periods.

    Returns a summary dict with per-period stats, the strongest period,
    the pooled effect size, and generated recommendation/action text.
    """
    period_stats = {
        period: self._analyze_period(data, period)
        for period in ("weekly", "monthly", "yearly")
        if self._has_cohort_period_data(data, period)
    }
    top_period = self._find_best_period(period_stats)
    return {
        "feature": feature_name,
        "periods": period_stats,
        "best_period": top_period,
        "overall_effect_size": self._compute_overall_effect_size(data),
        "recommendation": self._generate_trend_recommendation(feature_name, period_stats, top_period),
        "actions": self._generate_actions(feature_name, period_stats, top_period),
    }
|
|
1691
|
+
|
|
1692
|
+
def _compute_overall_effect_size(self, data: Dict[str, Dict[str, List[float]]]) -> float:
|
|
1693
|
+
if "retained" not in data or "churned" not in data:
|
|
1694
|
+
return 0.0
|
|
1695
|
+
all_retained = [v for period_data in data["retained"].values() for v in period_data]
|
|
1696
|
+
all_churned = [v for period_data in data["churned"].values() for v in period_data]
|
|
1697
|
+
if len(all_retained) < 2 or len(all_churned) < 2:
|
|
1698
|
+
return 0.0
|
|
1699
|
+
return self._compute_cohens_d(np.array(all_retained), np.array(all_churned))
|
|
1700
|
+
|
|
1701
|
+
def _has_cohort_period_data(self, data: Dict, period: str) -> bool:
|
|
1702
|
+
return ("retained" in data and period in data["retained"] and
|
|
1703
|
+
"churned" in data and period in data["churned"])
|
|
1704
|
+
|
|
1705
|
+
@staticmethod
|
|
1706
|
+
def _classify_slope(slope: float) -> str:
|
|
1707
|
+
if slope > 0.01:
|
|
1708
|
+
return "up"
|
|
1709
|
+
return "down" if slope < -0.01 else "flat"
|
|
1710
|
+
|
|
1711
|
+
def _compute_period_trends(self, retained: np.ndarray, churned: np.ndarray) -> Dict[str, Any]:
|
|
1712
|
+
ret_trend = self._compute_trend_slope(retained)
|
|
1713
|
+
churn_trend = self._compute_trend_slope(churned)
|
|
1714
|
+
return {
|
|
1715
|
+
"retained_trend": self._classify_slope(ret_trend),
|
|
1716
|
+
"churned_trend": self._classify_slope(churn_trend),
|
|
1717
|
+
"opposite_trends": (ret_trend > 0 and churn_trend < 0) or (ret_trend < 0 and churn_trend > 0),
|
|
1718
|
+
}
|
|
1719
|
+
|
|
1720
|
+
@staticmethod
|
|
1721
|
+
def _compute_period_variance(retained: np.ndarray, churned: np.ndarray) -> Dict[str, Any]:
|
|
1722
|
+
ret_var, churn_var = float(np.var(retained)), float(np.var(churned))
|
|
1723
|
+
variance_ratio = ret_var / churn_var if churn_var > 0.001 else (10.0 if ret_var > 0.001 else 1.0)
|
|
1724
|
+
return {
|
|
1725
|
+
"variance_ratio": float(variance_ratio),
|
|
1726
|
+
"high_variance": bool(ret_var > 1.0 or churn_var > 1.0),
|
|
1727
|
+
}
|
|
1728
|
+
|
|
1729
|
+
def _analyze_period(self, data: Dict, period: str) -> Dict[str, Any]:
|
|
1730
|
+
retained = np.array(data["retained"][period])
|
|
1731
|
+
churned = np.array(data["churned"][period])
|
|
1732
|
+
result = {
|
|
1733
|
+
"divergence": self._compute_divergence(retained, churned),
|
|
1734
|
+
"effect_size": self._compute_cohens_d(retained, churned),
|
|
1735
|
+
"seasonality_detected": self._detect_seasonality(retained) or self._detect_seasonality(churned),
|
|
1736
|
+
}
|
|
1737
|
+
result.update(self._compute_period_trends(retained, churned))
|
|
1738
|
+
result.update(self._compute_period_variance(retained, churned))
|
|
1739
|
+
return result
|
|
1740
|
+
|
|
1741
|
+
def _compute_trend_slope(self, values: np.ndarray) -> float:
|
|
1742
|
+
if len(values) < 2:
|
|
1743
|
+
return 0.0
|
|
1744
|
+
x = np.arange(len(values))
|
|
1745
|
+
return float(np.polyfit(x, values, 1)[0])
|
|
1746
|
+
|
|
1747
|
+
def _compute_divergence(self, retained: np.ndarray, churned: np.ndarray) -> float:
|
|
1748
|
+
if len(retained) == 0 or len(churned) == 0:
|
|
1749
|
+
return 0.0
|
|
1750
|
+
combined_std = max(np.std(np.concatenate([retained, churned])), 0.001)
|
|
1751
|
+
return float(abs(np.mean(retained) - np.mean(churned)) / combined_std)
|
|
1752
|
+
|
|
1753
|
+
def _compute_cohens_d(self, retained: np.ndarray, churned: np.ndarray) -> float:
|
|
1754
|
+
if len(retained) < 2 or len(churned) < 2:
|
|
1755
|
+
return 0.0
|
|
1756
|
+
pooled_std = np.sqrt((np.var(retained) + np.var(churned)) / 2)
|
|
1757
|
+
if pooled_std < 0.001:
|
|
1758
|
+
return 0.0
|
|
1759
|
+
return float((np.mean(retained) - np.mean(churned)) / pooled_std)
|
|
1760
|
+
|
|
1761
|
+
def _find_best_period(self, periods: Dict[str, Dict]) -> Optional[str]:
|
|
1762
|
+
if not periods:
|
|
1763
|
+
return None
|
|
1764
|
+
return max(periods.keys(), key=lambda p: abs(periods[p].get("divergence", 0)))
|
|
1765
|
+
|
|
1766
|
+
def _generate_trend_recommendation(self, feature: str, periods: Dict, best: Optional[str]) -> str:
|
|
1767
|
+
if not best or best not in periods:
|
|
1768
|
+
return f"Insufficient data for {feature} trend analysis"
|
|
1769
|
+
|
|
1770
|
+
analysis = periods[best]
|
|
1771
|
+
div, eff = analysis["divergence"], abs(analysis["effect_size"])
|
|
1772
|
+
opposite = analysis["opposite_trends"]
|
|
1773
|
+
|
|
1774
|
+
if div > 1.5 or eff > 0.8:
|
|
1775
|
+
strength = "Strong"
|
|
1776
|
+
action = "high-priority feature for churn prediction"
|
|
1777
|
+
elif div > 0.8 or eff > 0.5:
|
|
1778
|
+
strength = "Moderate"
|
|
1779
|
+
action = "useful discriminator between cohorts"
|
|
1780
|
+
elif div > 0.3 or eff > 0.2:
|
|
1781
|
+
strength = "Weak"
|
|
1782
|
+
action = "consider combining with other features"
|
|
1783
|
+
else:
|
|
1784
|
+
return f"{feature}: No significant separation between retained and churned"
|
|
1785
|
+
|
|
1786
|
+
trend_note = " with opposite trend directions" if opposite else ""
|
|
1787
|
+
period_label = {"weekly": "Weekly", "monthly": "Monthly", "yearly": "Yearly"}[best]
|
|
1788
|
+
return f"{feature}: {strength} separation (d={eff:.2f}) at {period_label} scale{trend_note} - {action}"
|
|
1789
|
+
|
|
1790
|
+
def _detect_seasonality(self, values: np.ndarray) -> bool:
|
|
1791
|
+
if len(values) < 6:
|
|
1792
|
+
return False
|
|
1793
|
+
detrended = values - np.linspace(values[0], values[-1], len(values))
|
|
1794
|
+
autocorr = np.correlate(detrended, detrended, mode='full')
|
|
1795
|
+
autocorr = autocorr[len(autocorr) // 2:]
|
|
1796
|
+
if len(autocorr) < 3 or autocorr[0] < 0.001:
|
|
1797
|
+
return False
|
|
1798
|
+
normalized = autocorr / autocorr[0]
|
|
1799
|
+
peaks = [i for i in range(2, len(normalized) - 1)
|
|
1800
|
+
if normalized[i] > normalized[i-1] and normalized[i] > normalized[i+1]]
|
|
1801
|
+
return any(normalized[p] > 0.3 for p in peaks[:3]) if peaks else False
|
|
1802
|
+
|
|
1803
|
+
def _generate_actions(self, feature: str, periods: Dict, best: Optional[str]) -> List[Dict[str, Any]]:
|
|
1804
|
+
actions = []
|
|
1805
|
+
if not periods:
|
|
1806
|
+
return actions
|
|
1807
|
+
|
|
1808
|
+
any_seasonality = any(p.get("seasonality_detected") for p in periods.values())
|
|
1809
|
+
any_high_variance = any(p.get("high_variance") for p in periods.values())
|
|
1810
|
+
|
|
1811
|
+
if best and periods.get(best, {}).get("opposite_trends"):
|
|
1812
|
+
actions.append({
|
|
1813
|
+
"action_type": "add_trend_feature",
|
|
1814
|
+
"feature": feature,
|
|
1815
|
+
"reason": f"Opposite trends detected at {best} scale",
|
|
1816
|
+
"params": {"period": best, "method": "slope"},
|
|
1817
|
+
})
|
|
1818
|
+
|
|
1819
|
+
if any_seasonality:
|
|
1820
|
+
period_with_season = next((k for k, v in periods.items() if v.get("seasonality_detected")), None)
|
|
1821
|
+
actions.append({
|
|
1822
|
+
"action_type": "add_time_indicator",
|
|
1823
|
+
"feature": feature,
|
|
1824
|
+
"reason": f"Seasonality detected at {period_with_season} scale",
|
|
1825
|
+
"params": {"period": period_with_season, "indicators": ["cyclical_encoding"]},
|
|
1826
|
+
})
|
|
1827
|
+
|
|
1828
|
+
if any_high_variance:
|
|
1829
|
+
max_var_period = max(periods.keys(), key=lambda k: periods[k].get("variance_ratio", 1.0))
|
|
1830
|
+
var_ratio = periods[max_var_period].get("variance_ratio", 1.0)
|
|
1831
|
+
if var_ratio > 2.0:
|
|
1832
|
+
actions.append({
|
|
1833
|
+
"action_type": "robust_scale",
|
|
1834
|
+
"feature": feature,
|
|
1835
|
+
"reason": f"High variance ratio ({var_ratio:.1f}x) between cohorts",
|
|
1836
|
+
"params": {"method": "robust_scaler"},
|
|
1837
|
+
})
|
|
1838
|
+
elif any_high_variance:
|
|
1839
|
+
actions.append({
|
|
1840
|
+
"action_type": "normalize",
|
|
1841
|
+
"feature": feature,
|
|
1842
|
+
"reason": "High variance in temporal trends",
|
|
1843
|
+
"params": {"method": "standard_scaler"},
|
|
1844
|
+
})
|
|
1845
|
+
|
|
1846
|
+
return actions
|
|
1847
|
+
|
|
1848
|
+
    def descriptive_stats_tiles(
        self,
        df: DataFrame,
        findings: Any,
        max_columns: int = 12,
        columns_per_row: int = 4,
    ) -> go.Figure:
        """Create a grid of mini chart tiles showing descriptive statistics for each column.

        Each tile shows a type-appropriate visualization:
        - Numeric: histogram with mean/median markers and key stats
        - Categorical: top categories bar chart with cardinality
        - Binary: horizontal bars with class balance
        - Datetime: date range indicator
        - Identifier: uniqueness gauge

        Args:
            df: DataFrame to visualize
            findings: ExplorationFindings object with column metadata
            max_columns: Maximum number of columns to display
            columns_per_row: Number of tiles per row

        Returns:
            A Plotly figure with one subplot tile per displayed column.
        """
        from plotly.subplots import make_subplots

        df = to_pandas(df)
        formatter = NumberFormatter()

        # Exclude temporal metadata columns from visualization
        temporal_metadata_cols = {"feature_timestamp", "label_timestamp", "label_available_flag"}
        available_cols = {k: v for k, v in findings.columns.items() if k not in temporal_metadata_cols}

        # Select columns to display (prioritize by type)
        type_priority = ['target', 'binary', 'numeric_continuous', 'numeric_discrete',
                         'categorical_nominal', 'categorical_ordinal', 'datetime', 'identifier']
        sorted_cols = []
        for col_type in type_priority:
            for name, col in available_cols.items():
                if col.inferred_type.value == col_type and name not in sorted_cols:
                    sorted_cols.append(name)
        # Columns whose inferred type is outside the priority list go last.
        for name in available_cols.keys():
            if name not in sorted_cols:
                sorted_cols.append(name)
        display_cols = sorted_cols[:max_columns]

        # Grid dimensions: ceil-divide the tiles into rows of `columns_per_row`.
        n_cols = min(columns_per_row, len(display_cols))
        n_rows = (len(display_cols) + n_cols - 1) // n_cols

        fig = make_subplots(
            rows=n_rows, cols=n_cols,
            subplot_titles=[f"<b>{c[:20]}</b>" for c in display_cols],
            vertical_spacing=0.12,
            horizontal_spacing=0.08,
            specs=[[{"type": "xy"} for _ in range(n_cols)] for _ in range(n_rows)]
        )

        for i, col_name in enumerate(display_cols):
            # Row-major placement of tiles (plotly subplot indices are 1-based).
            row, col = (i // n_cols) + 1, (i % n_cols) + 1
            col_finding = findings.columns.get(col_name)
            col_type = col_finding.inferred_type.value if col_finding else "unknown"
            series = df[col_name] if col_name in df.columns else None

            # Column known to findings but absent from the frame: leave tile empty.
            if series is None:
                continue

            self._add_column_tile(fig, series, col_finding, col_type, row, col, formatter, n_cols)

        fig.update_layout(
            height=250 * n_rows,
            template=self.theme,
            showlegend=False,
            margin={"t": 40, "b": 20, "l": 40, "r": 20},
        )

        return fig
|
|
1922
|
+
|
|
1923
|
+
    def dataset_at_a_glance(
        self,
        df: DataFrame,
        findings: Any,
        source_path: str = "",
        granularity: str = "entity",
        max_columns: int = 12,
        columns_per_row: int = 4,
    ) -> go.Figure:
        """Create a unified dataset overview with key metrics and column distribution tiles.

        Combines dataset-level stats (rows, columns, format, granularity) with
        small multiples of column distributions for a complete first look.

        Args:
            df: DataFrame to visualize
            findings: ExplorationFindings object with column metadata
            source_path: Path to data source (for format detection)
            granularity: Dataset granularity ("entity" or "event")
            max_columns: Maximum number of column tiles to display
            columns_per_row: Number of tiles per row

        Returns:
            A Plotly figure: one header row of summary metrics above a grid
            of per-column distribution tiles.
        """
        from pathlib import Path

        from plotly.subplots import make_subplots

        df = to_pandas(df)
        formatter = NumberFormatter()

        memory_mb = df.memory_usage(deep=True).sum() / 1024**2

        # Detect format from path
        path = Path(source_path) if source_path else Path("data.csv")
        fmt = path.suffix.lstrip('.').upper() or "CSV"
        # NOTE(review): this branch looks unreachable -- the `or "CSV"` above
        # already replaces an empty suffix; consider removing.
        if fmt == "":
            fmt = "CSV"

        # Exclude temporal metadata columns from visualization
        temporal_metadata_cols = {"feature_timestamp", "label_timestamp", "label_available_flag"}
        available_cols = {k: v for k, v in findings.columns.items() if k not in temporal_metadata_cols}

        # Select columns to display (prioritize by type)
        type_priority = ['target', 'binary', 'numeric_continuous', 'numeric_discrete',
                         'categorical_nominal', 'categorical_ordinal', 'datetime', 'identifier']
        sorted_cols = []
        for col_type in type_priority:
            for name, col in available_cols.items():
                if col.inferred_type.value == col_type and name not in sorted_cols:
                    sorted_cols.append(name)
        # Columns whose inferred type is outside the priority list go last.
        for name in available_cols.keys():
            if name not in sorted_cols:
                sorted_cols.append(name)
        display_cols = sorted_cols[:max_columns]

        # Grid dimensions for the tile area (header row is added separately).
        n_cols = min(columns_per_row, len(display_cols))
        n_tile_rows = (len(display_cols) + n_cols - 1) // n_cols

        # Build specs: 1 header row + tile rows
        header_specs = [{"type": "indicator"} for _ in range(n_cols)]
        tile_specs = [[{"type": "xy"} for _ in range(n_cols)] for _ in range(n_tile_rows)]

        # Subplot titles: empty for header, column names for tiles
        titles = [""] * n_cols + [f"<b>{c[:18]}</b>" for c in display_cols]

        fig = make_subplots(
            rows=1 + n_tile_rows,
            cols=n_cols,
            row_heights=[0.15] + [0.85 / n_tile_rows] * n_tile_rows,
            specs=[header_specs] + tile_specs,
            subplot_titles=titles,
            vertical_spacing=0.08,
            horizontal_spacing=0.06,
        )

        # Header row: Order is Rows, Columns, Structure, Format, Memory
        # Use annotations for all to ensure consistent appearance
        structure_label = "Event" if granularity.lower() == "event" else "Entity"
        memory_str = f"{memory_mb:.1f} MB"

        # Calculate header column positions for paper coordinates
        # (must mirror the `horizontal_spacing` passed to make_subplots above).
        h_spacing = 0.06
        col_width = (1.0 - h_spacing * (n_cols - 1)) / n_cols

        def get_header_x(col_idx: int) -> float:
            """Get x center position for header column (1-indexed)."""
            return (col_idx - 1) * (col_width + h_spacing) + col_width / 2

        # Header data: (label, value)
        header_items = [
            ("Rows", f"{findings.row_count:,}"),
            ("Columns", str(findings.column_count)),
            ("Structure", structure_label),
            ("Format", fmt),
            ("Memory", memory_str),
        ]

        # Add placeholder indicators (needed for subplot structure);
        # the value is rendered invisible so only the annotations show.
        for i in range(min(n_cols, len(header_items))):
            fig.add_trace(go.Indicator(
                mode="number", value=0,
                number={"font": {"size": 1, "color": "rgba(0,0,0,0)"}}
            ), row=1, col=i+1)

        # Add labels (small, gray, top) and values (large, blue, below) as annotations
        label_y = 0.96
        value_y = 0.92

        for i, (label, value) in enumerate(header_items[:n_cols]):
            x_pos = get_header_x(i + 1)

            # Label
            fig.add_annotation(
                x=x_pos, y=label_y,
                xref="paper", yref="paper",
                text=label, showarrow=False,
                font={"size": 12, "color": "#666"},
                xanchor="center", yanchor="middle"
            )

            # Value
            fig.add_annotation(
                x=x_pos, y=value_y,
                xref="paper", yref="paper",
                text=value, showarrow=False,
                font={"size": 28, "color": self.colors["primary"]},
                xanchor="center", yanchor="middle"
            )

        # Column tiles (starting from row 2)
        for i, col_name in enumerate(display_cols):
            tile_row = (i // n_cols) + 2  # +2 because row 1 is header
            tile_col = (i % n_cols) + 1
            col_finding = findings.columns.get(col_name)
            col_type = col_finding.inferred_type.value if col_finding else "unknown"
            series = df[col_name] if col_name in df.columns else None

            # Column known to findings but absent from the frame: skip its tile.
            if series is None:
                continue

            self._add_column_tile(fig, series, col_finding, col_type, tile_row, tile_col, formatter, n_cols)

        fig.update_layout(
            height=120 + 220 * n_tile_rows,
            template=self.theme,
            showlegend=False,
            margin={"t": 30, "b": 20, "l": 40, "r": 20},
        )

        return fig
|
|
2072
|
+
|
|
2073
|
+
def _add_column_tile(
|
|
2074
|
+
self,
|
|
2075
|
+
fig: go.Figure,
|
|
2076
|
+
series: Series,
|
|
2077
|
+
col_finding: Any,
|
|
2078
|
+
col_type: str,
|
|
2079
|
+
row: int,
|
|
2080
|
+
col: int,
|
|
2081
|
+
formatter: "NumberFormatter",
|
|
2082
|
+
n_cols: int = 4,
|
|
2083
|
+
) -> None:
|
|
2084
|
+
"""Add a single column tile to the subplot grid."""
|
|
2085
|
+
series = ensure_pandas_series(series)
|
|
2086
|
+
metrics = col_finding.universal_metrics if col_finding else {}
|
|
2087
|
+
type_metrics = col_finding.type_metrics if col_finding else {}
|
|
2088
|
+
|
|
2089
|
+
if col_type in ('numeric_continuous', 'numeric_discrete'):
|
|
2090
|
+
self._add_numeric_tile(fig, series, metrics, type_metrics, row, col, n_cols, formatter)
|
|
2091
|
+
elif col_type in ('categorical_nominal', 'categorical_ordinal', 'categorical_cyclical'):
|
|
2092
|
+
self._add_categorical_tile(fig, series, metrics, row, col, n_cols, formatter)
|
|
2093
|
+
elif col_type == 'binary':
|
|
2094
|
+
self._add_binary_tile(fig, series, metrics, row, col, n_cols, formatter)
|
|
2095
|
+
elif col_type in ('datetime', 'date'):
|
|
2096
|
+
self._add_datetime_tile(fig, series, metrics, row, col, n_cols)
|
|
2097
|
+
elif col_type == 'identifier':
|
|
2098
|
+
self._add_identifier_tile(fig, series, metrics, row, col, n_cols, formatter)
|
|
2099
|
+
elif col_type == 'target':
|
|
2100
|
+
self._add_target_tile(fig, series, metrics, row, col, n_cols, formatter)
|
|
2101
|
+
else:
|
|
2102
|
+
self._add_generic_tile(fig, series, metrics, row, col, n_cols, formatter)
|
|
2103
|
+
|
|
2104
|
+
def _get_axis_ref(self, row: int, col: int, n_cols: int, axis: str = "x") -> str:
|
|
2105
|
+
"""Get the correct axis reference for subplot annotations."""
|
|
2106
|
+
# Calculate linear index (0-based)
|
|
2107
|
+
idx = (row - 1) * n_cols + col
|
|
2108
|
+
# First subplot uses 'x'/'y', others use 'x2', 'x3', etc.
|
|
2109
|
+
if idx == 1:
|
|
2110
|
+
return axis
|
|
2111
|
+
return f"{axis}{idx}"
|
|
2112
|
+
|
|
2113
|
+
def _add_numeric_tile(
|
|
2114
|
+
self, fig: go.Figure, series: Series, metrics: Dict, type_metrics: Dict,
|
|
2115
|
+
row: int, col: int, n_cols: int, formatter: "NumberFormatter"
|
|
2116
|
+
) -> None:
|
|
2117
|
+
"""Add numeric column tile with histogram and stats."""
|
|
2118
|
+
clean = series.dropna()
|
|
2119
|
+
if len(clean) == 0:
|
|
2120
|
+
return
|
|
2121
|
+
|
|
2122
|
+
mean_val = type_metrics.get('mean', clean.mean())
|
|
2123
|
+
median_val = type_metrics.get('median', clean.median())
|
|
2124
|
+
std_val = type_metrics.get('std', clean.std())
|
|
2125
|
+
null_pct = metrics.get('null_percentage', 0)
|
|
2126
|
+
|
|
2127
|
+
fig.add_trace(go.Histogram(
|
|
2128
|
+
x=clean, nbinsx=20,
|
|
2129
|
+
marker_color=self.colors["primary"],
|
|
2130
|
+
opacity=0.7,
|
|
2131
|
+
hovertemplate="Range: %{x}<br>Count: %{y}<extra></extra>"
|
|
2132
|
+
), row=row, col=col)
|
|
2133
|
+
|
|
2134
|
+
xaxis_ref = self._get_axis_ref(row, col, n_cols, 'x')
|
|
2135
|
+
yaxis_ref = self._get_axis_ref(row, col, n_cols, 'y')
|
|
2136
|
+
fig.add_shape(type="line", x0=mean_val, x1=mean_val, y0=0, y1=1,
|
|
2137
|
+
xref=xaxis_ref, yref=f"{yaxis_ref} domain",
|
|
2138
|
+
line={"color": self.colors["secondary"], "width": 2, "dash": "dash"})
|
|
2139
|
+
fig.add_shape(type="line", x0=median_val, x1=median_val, y0=0, y1=1,
|
|
2140
|
+
xref=xaxis_ref, yref=f"{yaxis_ref} domain",
|
|
2141
|
+
line={"color": self.colors["success"], "width": 2, "dash": "dot"})
|
|
2142
|
+
|
|
2143
|
+
stats_text = (f"μ={formatter.compact(mean_val)} | "
|
|
2144
|
+
f"σ={formatter.compact(std_val)}" +
|
|
2145
|
+
(f"<br>null={null_pct:.0f}%" if null_pct > 0 else ""))
|
|
2146
|
+
fig.add_annotation(
|
|
2147
|
+
x=0.98, y=0.98, xref=f"{xaxis_ref} domain", yref=f"{yaxis_ref} domain",
|
|
2148
|
+
text=stats_text, showarrow=False,
|
|
2149
|
+
font={"size": 9, "color": "#666"},
|
|
2150
|
+
bgcolor="rgba(255,255,255,0.8)",
|
|
2151
|
+
xanchor="right", yanchor="top"
|
|
2152
|
+
)
|
|
2153
|
+
|
|
2154
|
+
def _add_categorical_tile(
|
|
2155
|
+
self, fig: go.Figure, series: Series, metrics: Dict,
|
|
2156
|
+
row: int, col: int, n_cols: int, formatter: "NumberFormatter"
|
|
2157
|
+
) -> None:
|
|
2158
|
+
"""Add categorical column tile with top categories bar."""
|
|
2159
|
+
value_counts = series.value_counts().head(5)
|
|
2160
|
+
|
|
2161
|
+
# Gradient colors to show rank
|
|
2162
|
+
colors = [self.colors["info"]] + [self.colors["primary"]] * (len(value_counts) - 1)
|
|
2163
|
+
|
|
2164
|
+
fig.add_trace(go.Bar(
|
|
2165
|
+
x=value_counts.values,
|
|
2166
|
+
y=[str(v)[:10] for v in value_counts.index],
|
|
2167
|
+
orientation='h',
|
|
2168
|
+
marker_color=colors[:len(value_counts)],
|
|
2169
|
+
hovertemplate="%{y}: %{x:,}<extra></extra>"
|
|
2170
|
+
), row=row, col=col)
|
|
2171
|
+
|
|
2172
|
+
def _add_binary_tile(
|
|
2173
|
+
self, fig: go.Figure, series: Series, metrics: Dict,
|
|
2174
|
+
row: int, col: int, n_cols: int, formatter: "NumberFormatter"
|
|
2175
|
+
) -> None:
|
|
2176
|
+
"""Add binary column tile with horizontal bars showing labels clearly."""
|
|
2177
|
+
value_counts = series.value_counts()
|
|
2178
|
+
if len(value_counts) == 0:
|
|
2179
|
+
return
|
|
2180
|
+
|
|
2181
|
+
labels = [str(v) for v in value_counts.index]
|
|
2182
|
+
values = value_counts.values.tolist()
|
|
2183
|
+
total = sum(values)
|
|
2184
|
+
percentages = [v/total*100 for v in values]
|
|
2185
|
+
|
|
2186
|
+
balance_ratio = max(values) / min(values) if min(values) > 0 else float('inf')
|
|
2187
|
+
balance_color = (self.colors["success"] if balance_ratio < 3
|
|
2188
|
+
else self.colors["warning"] if balance_ratio < 10
|
|
2189
|
+
else self.colors["danger"])
|
|
2190
|
+
|
|
2191
|
+
# Horizontal bars with labels on y-axis
|
|
2192
|
+
colors = [self.colors["primary"], self.colors["secondary"]]
|
|
2193
|
+
fig.add_trace(go.Bar(
|
|
2194
|
+
y=labels[:2],
|
|
2195
|
+
x=percentages[:2],
|
|
2196
|
+
orientation='h',
|
|
2197
|
+
marker_color=colors[:len(labels)],
|
|
2198
|
+
text=[f"{p:.0f}%" for p in percentages[:2]],
|
|
2199
|
+
textposition="inside",
|
|
2200
|
+
textfont={"size": 11, "color": "white"},
|
|
2201
|
+
hovertemplate="%{y}: %{x:.1f}%<extra></extra>",
|
|
2202
|
+
showlegend=False
|
|
2203
|
+
), row=row, col=col)
|
|
2204
|
+
|
|
2205
|
+
ratio_text = f"{balance_ratio:.1f}:1"
|
|
2206
|
+
xref = f"{self._get_axis_ref(row, col, n_cols, 'x')} domain"
|
|
2207
|
+
yref = f"{self._get_axis_ref(row, col, n_cols, 'y')} domain"
|
|
2208
|
+
fig.add_annotation(
|
|
2209
|
+
x=0.98, y=0.98, xref=xref, yref=yref,
|
|
2210
|
+
text=ratio_text, showarrow=False,
|
|
2211
|
+
font={"size": 10, "color": balance_color, "family": "Arial Black"},
|
|
2212
|
+
xanchor="right", yanchor="top"
|
|
2213
|
+
)
|
|
2214
|
+
|
|
2215
|
+
def _add_datetime_tile(
|
|
2216
|
+
self, fig: go.Figure, series: Series, metrics: Dict,
|
|
2217
|
+
row: int, col: int, n_cols: int
|
|
2218
|
+
) -> None:
|
|
2219
|
+
"""Add datetime column tile with date range visualization."""
|
|
2220
|
+
import warnings
|
|
2221
|
+
|
|
2222
|
+
import pandas as pd
|
|
2223
|
+
with warnings.catch_warnings():
|
|
2224
|
+
warnings.simplefilter("ignore")
|
|
2225
|
+
dates = pd.to_datetime(series, errors='coerce').dropna()
|
|
2226
|
+
if len(dates) == 0:
|
|
2227
|
+
return
|
|
2228
|
+
|
|
2229
|
+
# Monthly distribution as area chart for cleaner look
|
|
2230
|
+
counts = dates.dt.to_period('M').value_counts().sort_index()
|
|
2231
|
+
x_labels = [str(p) for p in counts.index]
|
|
2232
|
+
fig.add_trace(go.Scatter(
|
|
2233
|
+
x=x_labels,
|
|
2234
|
+
y=counts.values,
|
|
2235
|
+
mode='lines',
|
|
2236
|
+
fill='tozeroy',
|
|
2237
|
+
line={"color": self.colors["info"]},
|
|
2238
|
+
fillcolor="rgba(23, 190, 207, 0.3)",
|
|
2239
|
+
hovertemplate="%{x}: %{y:,}<extra></extra>"
|
|
2240
|
+
), row=row, col=col)
|
|
2241
|
+
|
|
2242
|
+
# Force categorical x-axis to prevent Plotly from interpreting as dates
|
|
2243
|
+
xaxis_name = f"xaxis{(row - 1) * n_cols + col}" if (row - 1) * n_cols + col > 1 else "xaxis"
|
|
2244
|
+
fig.update_layout(**{xaxis_name: {"type": "category", "tickangle": -45}})
|
|
2245
|
+
|
|
2246
|
+
def _add_identifier_tile(
|
|
2247
|
+
self, fig: go.Figure, series: Series, metrics: Dict,
|
|
2248
|
+
row: int, col: int, n_cols: int, formatter: "NumberFormatter"
|
|
2249
|
+
) -> None:
|
|
2250
|
+
"""Add identifier column tile with uniqueness gauge."""
|
|
2251
|
+
total = len(series)
|
|
2252
|
+
unique = metrics.get('distinct_count', series.nunique())
|
|
2253
|
+
unique_pct = (unique / total * 100) if total > 0 else 0
|
|
2254
|
+
|
|
2255
|
+
gauge_color = (self.colors["success"] if unique_pct >= 99
|
|
2256
|
+
else self.colors["warning"] if unique_pct >= 95
|
|
2257
|
+
else self.colors["danger"])
|
|
2258
|
+
|
|
2259
|
+
# Progress bar style for uniqueness
|
|
2260
|
+
fig.add_trace(go.Bar(
|
|
2261
|
+
x=[unique_pct], y=[""],
|
|
2262
|
+
orientation='h',
|
|
2263
|
+
marker_color=gauge_color,
|
|
2264
|
+
text=f"{unique_pct:.1f}% unique",
|
|
2265
|
+
textposition="inside",
|
|
2266
|
+
textfont={"color": "white", "size": 11},
|
|
2267
|
+
hovertemplate=f"Unique: {unique:,} / {total:,}<extra></extra>",
|
|
2268
|
+
showlegend=False
|
|
2269
|
+
), row=row, col=col)
|
|
2270
|
+
|
|
2271
|
+
fig.add_trace(go.Bar(
|
|
2272
|
+
x=[100 - unique_pct], y=[""],
|
|
2273
|
+
orientation='h',
|
|
2274
|
+
marker_color="#ecf0f1",
|
|
2275
|
+
hoverinfo="skip",
|
|
2276
|
+
showlegend=False
|
|
2277
|
+
), row=row, col=col)
|
|
2278
|
+
|
|
2279
|
+
def _add_target_tile(
|
|
2280
|
+
self, fig: go.Figure, series: Series, metrics: Dict,
|
|
2281
|
+
row: int, col: int, n_cols: int, formatter: "NumberFormatter"
|
|
2282
|
+
) -> None:
|
|
2283
|
+
"""Add target column tile with horizontal bars showing class distribution."""
|
|
2284
|
+
value_counts = series.value_counts()
|
|
2285
|
+
total = len(series)
|
|
2286
|
+
|
|
2287
|
+
colors_list = [self.colors["success"], self.colors["danger"]] + \
|
|
2288
|
+
[self.colors["warning"], self.colors["info"]]
|
|
2289
|
+
|
|
2290
|
+
labels = [str(v) for v in value_counts.head(4).index]
|
|
2291
|
+
percentages = [(c / total * 100) for c in value_counts.head(4).values]
|
|
2292
|
+
|
|
2293
|
+
# Horizontal bars with labels on y-axis
|
|
2294
|
+
fig.add_trace(go.Bar(
|
|
2295
|
+
y=labels,
|
|
2296
|
+
x=percentages,
|
|
2297
|
+
orientation='h',
|
|
2298
|
+
marker_color=colors_list[:len(labels)],
|
|
2299
|
+
text=[f"{p:.0f}%" for p in percentages],
|
|
2300
|
+
textposition="inside",
|
|
2301
|
+
textfont={"size": 11, "color": "white"},
|
|
2302
|
+
hovertemplate="%{y}: %{x:.1f}%<extra></extra>",
|
|
2303
|
+
showlegend=False
|
|
2304
|
+
), row=row, col=col)
|
|
2305
|
+
|
|
2306
|
+
xref = f"{self._get_axis_ref(row, col, n_cols, 'x')} domain"
|
|
2307
|
+
yref = f"{self._get_axis_ref(row, col, n_cols, 'y')} domain"
|
|
2308
|
+
if len(value_counts) == 2:
|
|
2309
|
+
ratio = value_counts.max() / value_counts.min() if value_counts.min() > 0 else float('inf')
|
|
2310
|
+
ratio_color = (self.colors["success"] if ratio < 3
|
|
2311
|
+
else self.colors["warning"] if ratio < 10
|
|
2312
|
+
else self.colors["danger"])
|
|
2313
|
+
fig.add_annotation(
|
|
2314
|
+
x=0.98, y=0.98, xref=xref, yref=yref,
|
|
2315
|
+
text=f"{ratio:.1f}:1",
|
|
2316
|
+
showarrow=False, font={"size": 10, "color": ratio_color, "family": "Arial Black"},
|
|
2317
|
+
xanchor="right", yanchor="top"
|
|
2318
|
+
)
|
|
2319
|
+
|
|
2320
|
+
def _add_generic_tile(
|
|
2321
|
+
self, fig: go.Figure, series: Series, metrics: Dict,
|
|
2322
|
+
row: int, col: int, n_cols: int, formatter: "NumberFormatter"
|
|
2323
|
+
) -> None:
|
|
2324
|
+
"""Add generic tile for unknown column types."""
|
|
2325
|
+
value_counts = series.value_counts().head(5)
|
|
2326
|
+
|
|
2327
|
+
fig.add_trace(go.Bar(
|
|
2328
|
+
x=value_counts.values,
|
|
2329
|
+
y=[str(v)[:10] for v in value_counts.index],
|
|
2330
|
+
orientation='h',
|
|
2331
|
+
marker_color=self.colors["primary"],
|
|
2332
|
+
hovertemplate="%{y}: %{x:,}<extra></extra>"
|
|
2333
|
+
), row=row, col=col)
|
|
2334
|
+
|
|
2335
|
+
def cutoff_selection_chart(
    self, cutoff_analysis: "CutoffAnalysis", suggested_cutoff: Optional[datetime] = None,
    current_cutoff: Optional[datetime] = None, title: str = "Point-in-Time Cutoff Selection"
) -> go.Figure:
    """Plot the train/score percentage split as a function of cutoff date.

    The chart shows, for every candidate cutoff date, what percentage of the
    data would fall into the training set (green fill, below the line) versus
    the scoring set (yellow fill, above the line), with optional vertical
    markers for a suggested ("Selected") cutoff and the registry's current one.

    Args:
        cutoff_analysis: analysis object exposing ``to_dataframe()`` (columns
            ``date``, ``train_pct``, ``score_pct``),
            ``get_percentage_milestones(step=...)`` and
            ``get_split_at_date(...)``.
        suggested_cutoff: optional cutoff to highlight with a dashed line.
        current_cutoff: optional registry cutoff to highlight with a dotted line.
        title: chart title text.

    Returns:
        A plotly Figure; if the analysis has no rows, a figure containing only
        a "No temporal data available" annotation is returned.
    """
    df = cutoff_analysis.to_dataframe()
    if len(df) == 0:
        return go.Figure().add_annotation(text="No temporal data available", showarrow=False)

    # Get data date range to check if cutoffs are within bounds
    min_date = df["date"].min()
    max_date = df["date"].max()

    fig = go.Figure()

    # Add 100% baseline first (invisible, for fill reference)
    fig.add_trace(go.Scatter(
        x=df["date"], y=[100] * len(df), name="_baseline",
        mode="lines", line={"color": "rgba(0,0,0,0)", "width": 0},
        showlegend=False, hoverinfo="skip"
    ))

    # Score area fills from 100% down to train_pct line
    fig.add_trace(go.Scatter(
        x=df["date"], y=df["train_pct"], name="Score Set %",
        mode="lines", line={"color": self.colors["warning"], "width": 2},
        fill="tonexty", fillcolor="rgba(255, 193, 7, 0.3)",
        hovertemplate="Cutoff: %{x|%Y-%m-%d}<br>Score: %{customdata:.1f}%<extra></extra>",
        customdata=df["score_pct"], showlegend=True
    ))

    # Train area fills from train_pct down to 0
    fig.add_trace(go.Scatter(
        x=df["date"], y=df["train_pct"], name="Training Set %",
        mode="lines", line={"color": self.colors["success"], "width": 2},
        fill="tozeroy", fillcolor="rgba(40, 167, 69, 0.3)",
        hovertemplate="Cutoff: %{x|%Y-%m-%d}<br>Train: %{y:.1f}%<extra></extra>",
        showlegend=True
    ))

    # Reference markers at every 5% of training data for quick reading.
    milestones = cutoff_analysis.get_percentage_milestones(step=5)
    if milestones:
        milestone_dates = [m["date"] for m in milestones]
        milestone_pcts = [m["train_pct"] for m in milestones]
        fig.add_trace(go.Scatter(
            x=milestone_dates, y=milestone_pcts, name="Train % Reference",
            mode="markers+text", marker={"size": 8, "color": self.colors["success"], "symbol": "circle"},
            text=[f"{int(p)}%" for p in milestone_pcts], textposition="top center",
            textfont={"size": 8, "color": self.colors["success"]},
            hovertemplate="Date: %{x|%Y-%m-%d}<br>Train: %{y:.0f}%<extra></extra>",
            showlegend=False
        ))

    # Add cutoff lines - only if within data range
    if suggested_cutoff:
        split = cutoff_analysis.get_split_at_date(suggested_cutoff)
        # Check if suggested cutoff is within data range
        if min_date <= suggested_cutoff <= max_date:
            fig.add_vline(
                x=suggested_cutoff, line={"color": self.colors["info"], "dash": "dash", "width": 2}
            )
            # Add text annotation label on chart for selected cutoff
            fig.add_annotation(
                x=suggested_cutoff, y=1.02, xref="x", yref="paper",
                text=f"Selected: {suggested_cutoff.strftime('%Y-%m-%d')}",
                showarrow=False, font={"size": 9, "color": self.colors["info"]},
                xanchor="center", yanchor="bottom"
            )
            # Add legend entry with visible line sample (dummy trace: vlines
            # don't get legend entries on their own)
            fig.add_trace(go.Scatter(
                x=[None], y=[None], mode="lines",
                line={"color": self.colors["info"], "dash": "dash", "width": 2},
                name=f"Selected: {suggested_cutoff.strftime('%Y-%m-%d')} ({split['train_pct']:.0f}% train)",
                showlegend=True
            ))

    if current_cutoff:
        split = cutoff_analysis.get_split_at_date(current_cutoff)
        # Check if registry cutoff is within data range
        cutoff_in_range = min_date <= current_cutoff <= max_date
        # Determine if registry and selected cutoffs are at the same position
        same_as_selected = suggested_cutoff and current_cutoff == suggested_cutoff
        if cutoff_in_range:
            fig.add_vline(
                x=current_cutoff, line={"color": self.colors["danger"], "dash": "dot", "width": 2}
            )
            # Add text annotation label on chart for registry cutoff
            # Offset vertically if same as selected to avoid overlap
            annotation_y = 1.08 if same_as_selected else 1.02
            fig.add_annotation(
                x=current_cutoff, y=annotation_y, xref="x", yref="paper",
                text=f"Registry: {current_cutoff.strftime('%Y-%m-%d')}",
                showarrow=False, font={"size": 9, "color": self.colors["danger"]},
                xanchor="center", yanchor="bottom"
            )
            legend_label = f"Registry: {current_cutoff.strftime('%Y-%m-%d')} ({split['train_pct']:.0f}% train)"
        else:
            # Registry cutoff is outside data range
            legend_label = f"Registry: {current_cutoff.strftime('%Y-%m-%d')} (outside data range)"
        # Add legend entry
        fig.add_trace(go.Scatter(
            x=[None], y=[None], mode="lines",
            line={"color": self.colors["danger"], "dash": "dot", "width": 2},
            name=legend_label,
            showlegend=True
        ))

    fig.update_layout(
        # FIX: honor the caller-supplied `title` parameter; it was previously
        # accepted but ignored in favor of a hard-coded string.
        title={"text": title, "x": 0.5, "xanchor": "center"},
        width=800, height=300, autosize=False, template=self.theme, showlegend=True,
        legend={
            "orientation": "h", "yanchor": "top", "y": -0.15,
            "xanchor": "center", "x": 0.5, "bgcolor": "rgba(255,255,255,0.8)",
            "font": {"size": 9}
        },
        margin={"t": 40, "b": 60, "l": 55, "r": 55},
        yaxis={"title": "Percentage", "range": [0, 100]},
        xaxis={"title": ""},
    )

    return fig
|
|
2455
|
+
|
|
2456
|
+
def recency_analysis_panel(
    self, retained_recency: np.ndarray, churned_recency: np.ndarray,
    bucket_stats: list, retained_median: float, churned_median: float,
    cap_value: Optional[float] = None
) -> go.Figure:
    """Build a 2x2 panel comparing recency (days since last event) of retained
    vs churned entities.

    Layout:
      (1,1) histogram of retained recency with its median line,
      (2,1) histogram of churned recency with its median line,
      (1,2) entity counts per recency bucket with the target rate overlaid on
            a secondary y-axis,
      (2,2) KDE density overlay of both groups with a separation score.

    Args:
        retained_recency: recency values in days for retained entities.
        churned_recency: recency values in days for churned entities.
        bucket_stats: per-bucket stats; each item exposes ``bucket_label``,
            ``entity_count`` and ``target_rate`` (0-1 scale).
        retained_median: median recency of the retained group, in days.
        churned_median: median recency of the churned group, in days.
        cap_value: optional x-axis cap; defaults to the overall max recency.

    Returns:
        The assembled plotly Figure.
    """
    from plotly.subplots import make_subplots
    from scipy.stats import gaussian_kde
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=["Retained Distribution", "Target Rate by Recency",
                        "Churned Distribution", "Density Comparison"],
        row_heights=[0.5, 0.5], column_widths=[0.5, 0.5],
        horizontal_spacing=0.08, vertical_spacing=0.15,
        # Only the top-right subplot gets a secondary y-axis (count vs rate).
        specs=[[{}, {"secondary_y": True}], [{}, {}]]
    )
    # Green = retained, red = churned; semi-transparent so overlaps stay legible.
    color_retained, color_churned = "rgba(46,204,113,0.7)", "rgba(231,76,60,0.7)"
    # NOTE(review): `or` treats cap_value == 0 the same as None and falls back
    # to the data max — presumably intended (a zero cap would empty the axis).
    cap = cap_value or max(np.max(retained_recency), np.max(churned_recency))
    x_range = [0, cap * 1.05]  # 5% headroom so edge bars are not clipped
    # Top-left: retained recency histogram + median marker.
    fig.add_trace(go.Histogram(
        x=retained_recency, nbinsx=30, marker_color=color_retained, showlegend=False,
        hovertemplate="Days: %{x}<br>Count: %{y}<extra></extra>"
    ), row=1, col=1)
    fig.add_vline(x=retained_median, line_dash="solid", line_color="green",
                  annotation_text=f"Med: {retained_median:.0f}d", row=1, col=1)
    # Bottom-left: churned recency histogram + median marker.
    fig.add_trace(go.Histogram(
        x=churned_recency, nbinsx=30, marker_color=color_churned, showlegend=False,
        hovertemplate="Days: %{x}<br>Count: %{y}<extra></extra>"
    ), row=2, col=1)
    fig.add_vline(x=churned_median, line_dash="solid", line_color="red",
                  annotation_text=f"Med: {churned_median:.0f}d", row=2, col=1)
    # Top-right: bucket counts (bars) with target rate (line) on secondary axis.
    if bucket_stats:
        labels = [b.bucket_label for b in bucket_stats]
        counts = [b.entity_count for b in bucket_stats]
        rates = [b.target_rate * 100 for b in bucket_stats]  # convert to percent
        fig.add_trace(go.Bar(
            x=labels, y=counts, name="Entity Count", marker_color="lightsteelblue", opacity=0.7,
            hovertemplate="Bucket: %{x}<br>Count: %{y}<extra></extra>"
        ), row=1, col=2)
        fig.add_trace(go.Scatter(
            x=labels, y=rates, mode="lines+markers", name="Target Rate %",
            line={"color": "red", "width": 3}, marker={"size": 8},
            hovertemplate="Bucket: %{x}<br>Rate: %{y:.1f}%<extra></extra>"
        ), row=1, col=2, secondary_y=True)
    # Bottom-right: KDE overlay; only drawn when both groups have enough
    # points for a stable density estimate.
    x_density = np.linspace(0, cap, 200)
    if len(retained_recency) > 5 and len(churned_recency) > 5:
        kde_retained = gaussian_kde(retained_recency, bw_method=0.3)
        kde_churned = gaussian_kde(churned_recency, bw_method=0.3)
        fig.add_trace(go.Scatter(
            x=x_density, y=kde_retained(x_density), mode="lines", name="Retained",
            line={"color": "green", "width": 2}, fill="tozeroy", fillcolor="rgba(46,204,113,0.3)",
            hovertemplate="Days: %{x:.0f}<br>Density: %{y:.4f}<extra></extra>"
        ), row=2, col=2)
        fig.add_trace(go.Scatter(
            x=x_density, y=kde_churned(x_density), mode="lines", name="Churned",
            line={"color": "red", "width": 2}, fill="tozeroy", fillcolor="rgba(231,76,60,0.3)",
            hovertemplate="Days: %{x:.0f}<br>Density: %{y:.4f}<extra></extra>"
        ), row=2, col=2)
        fig.add_vline(x=retained_median, line_dash="dash", line_color="green", line_width=1, row=2, col=2)
        fig.add_vline(x=churned_median, line_dash="dash", line_color="red", line_width=1, row=2, col=2)
        # Annotate how separable the two distributions are (1 - overlap).
        # "x4 domain"/"y4 domain" pins the label to the 4th subplot (row 2, col 2).
        separation = self._compute_distribution_separation(kde_retained, kde_churned, x_density)
        fig.add_annotation(x=0.95, y=0.95, xref="x4 domain", yref="y4 domain",
                           text=f"Separation: {separation:.0%}", showarrow=False,
                           font={"size": 11}, bgcolor="rgba(255,255,255,0.8)", xanchor="right")
    # Share the same x-range across all recency-axis subplots for comparability.
    fig.update_xaxes(range=x_range, row=1, col=1)
    fig.update_xaxes(range=x_range, row=2, col=1)
    fig.update_xaxes(range=x_range, row=2, col=2)
    fig.update_xaxes(title_text="Days Since Last Event", row=2, col=1)
    fig.update_xaxes(title_text="Recency Bucket", row=1, col=2)
    fig.update_xaxes(title_text="Days Since Last Event", row=2, col=2)
    fig.update_yaxes(title_text="Count", row=1, col=1)
    fig.update_yaxes(title_text="Count", row=2, col=1)
    fig.update_yaxes(title_text="Entity Count", row=1, col=2)
    fig.update_yaxes(title_text="Target Rate %", row=1, col=2, secondary_y=True)
    fig.update_yaxes(title_text="Density", row=2, col=2)
    fig.update_layout(
        title={"text": "Recency Analysis: Distribution Comparison & Target Rate", "x": 0.5},
        template=self.theme, height=550, showlegend=True, autosize=True,
        legend={"orientation": "h", "yanchor": "top", "y": -0.08, "xanchor": "center", "x": 0.5},
        margin={"l": 60, "r": 60, "t": 50, "b": 80}
    )
    return fig
|
|
2537
|
+
|
|
2538
|
+
def _compute_distribution_separation(self, kde1, kde2, x_values: np.ndarray) -> float:
|
|
2539
|
+
y1, y2 = kde1(x_values), kde2(x_values)
|
|
2540
|
+
overlap = np.trapezoid(np.minimum(y1, y2), x_values)
|
|
2541
|
+
return 1.0 - overlap
|
|
2542
|
+
|
|
2543
|
+
def categorical_analysis_panel(
    self, insights: list, overall_rate: float, max_features: int = 6
) -> go.Figure:
    """Build a 2x2 dashboard summarizing categorical feature/target associations.

    Args:
        insights: objects exposing ``feature_name``, ``cramers_v``,
            ``effect_strength``, ``high_risk_categories``,
            ``low_risk_categories`` and ``category_stats``.
        overall_rate: baseline target rate (0-1 scale) used as reference line.
        max_features: cap on how many features (strongest first) are shown.

    Returns:
        The assembled plotly Figure (or a placeholder figure when there are
        no insights to display).
    """
    from plotly.subplots import make_subplots

    # Nothing to analyze: return a placeholder figure with a centered message.
    if not insights:
        placeholder = go.Figure()
        placeholder.add_annotation(text="No categorical features to analyze", showarrow=False,
                                   xref="paper", yref="paper", x=0.5, y=0.5, font={"size": 16})
        return placeholder

    # Strongest associations first, truncated to the display budget.
    insights = sorted(insights, key=lambda item: item.cramers_v, reverse=True)[:max_features]

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=["Feature Association Strength (Cramér's V)", "Effect Strength Distribution",
                        "High/Low Risk Category Counts", "Top Feature: Category Target Rates"],
        row_heights=[0.5, 0.5], column_widths=[0.5, 0.5],
        horizontal_spacing=0.12, vertical_spacing=0.18
    )

    features = [item.feature_name for item in insights]
    cramers_values = [item.cramers_v for item in insights]

    # Top-Left: Strength gradient (red=strong, orange=moderate, light blue=weak)
    def pick_strength_color(v):
        if v >= 0.3:
            return "#c0392b"
        if v >= 0.1:
            return "#e67e22"
        return "#85c1e9"

    fig.add_trace(go.Bar(
        y=features, x=cramers_values, orientation="h",
        marker_color=[pick_strength_color(v) for v in cramers_values],
        hovertemplate="Feature: %{y}<br>Cramér's V: %{x:.3f}<extra></extra>", showlegend=False
    ), row=1, col=1)
    fig.add_vline(x=0.3, line_dash="dash", line_color="#c0392b", annotation_text="Strong",
                  annotation_position="top right", row=1, col=1)
    fig.add_vline(x=0.1, line_dash="dash", line_color="#e67e22", annotation_text="Moderate",
                  annotation_position="top left", row=1, col=1)

    # Top-Right: Count distribution (purple palette - distinct from strength colors)
    effect_counts = {"strong": 0, "moderate": 0, "weak": 0, "negligible": 0}
    for item in insights:
        effect_counts[item.effect_strength] = effect_counts.get(item.effect_strength, 0) + 1
    # Purple gradient for counts (darker = more significant category)
    fig.add_trace(go.Bar(
        x=list(effect_counts.keys()), y=list(effect_counts.values()),
        marker_color=["#6c3483", "#8e44ad", "#a569bd", "#d2b4de"], showlegend=False,
        hovertemplate="Effect: %{x}<br>Count: %{y}<extra></extra>"
    ), row=1, col=2)

    # Bottom-Left: how many high- vs low-risk categories each feature carries.
    fig.add_trace(go.Bar(
        y=features, x=[len(item.high_risk_categories) for item in insights],
        orientation="h", name="High Risk Categories",
        marker_color="rgba(231,76,60,0.7)", hovertemplate="%{y}: %{x} high-risk<extra></extra>"
    ), row=2, col=1)
    fig.add_trace(go.Bar(
        y=features, x=[len(item.low_risk_categories) for item in insights],
        orientation="h", name="Low Risk Categories",
        marker_color="rgba(46,204,113,0.7)", hovertemplate="%{y}: %{x} low-risk<extra></extra>"
    ), row=2, col=1)

    # Bottom-Right: per-category rates for the single strongest feature.
    top_insight = insights[0]
    if not top_insight.category_stats.empty:
        stats = top_insight.category_stats.head(10)
        categories = stats["category"].astype(str).tolist()
        rates = (stats["retention_rate"] * 100).tolist()
        # Red below ~90% of baseline, green above ~110%, blue otherwise.
        bar_colors = []
        for r in rates:
            if r < overall_rate * 100 * 0.9:
                bar_colors.append("#e74c3c")
            elif r > overall_rate * 100 * 1.1:
                bar_colors.append("#2ecc71")
            else:
                bar_colors.append("#3498db")
        fig.add_trace(go.Bar(
            x=categories, y=rates, marker_color=bar_colors, showlegend=False,
            hovertemplate="Category: %{x}<br>Target Rate: %{y:.1f}%<extra></extra>"
        ), row=2, col=2)
        fig.add_hline(y=overall_rate * 100, line_dash="dash", line_color="gray",
                      annotation_text=f"Overall: {overall_rate*100:.1f}%", row=2, col=2)

    fig.update_xaxes(title_text="Cramér's V", row=1, col=1)
    fig.update_xaxes(title_text="Category Count", row=2, col=1)
    fig.update_xaxes(title_text="Category", row=2, col=2, tickangle=45)
    fig.update_yaxes(title_text="Feature", row=1, col=1)
    fig.update_yaxes(title_text="Feature", row=2, col=1)
    fig.update_yaxes(title_text="Target Rate %", row=2, col=2)
    fig.update_layout(
        title={"text": "Categorical Feature Analysis", "x": 0.5},
        template=self.theme, height=600, showlegend=True, autosize=True, barmode="group",
        legend={"orientation": "h", "yanchor": "top", "y": -0.1, "xanchor": "center", "x": 0.5},
        margin={"l": 120, "r": 60, "t": 60, "b": 100}
    )
    return fig
|