churnkit 0.75.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0
|
@@ -0,0 +1,520 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any, Dict, Optional
|
|
4
|
+
|
|
5
|
+
from customer_retention.core.compat import pd
|
|
6
|
+
from customer_retention.core.config import ColumnType
|
|
7
|
+
|
|
8
|
+
from .profile_result import ProfileResult
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ReportGenerator:
|
|
12
|
+
"""Generate profiling reports in multiple formats (JSON, HTML, Markdown)."""
|
|
13
|
+
|
|
14
|
+
def __init__(self, profile: Optional[ProfileResult] = None):
|
|
15
|
+
self.profile = profile
|
|
16
|
+
|
|
17
|
+
def to_json(self, indent: int = 2) -> str:
|
|
18
|
+
"""Generate JSON report from profile result."""
|
|
19
|
+
if self.profile is None:
|
|
20
|
+
raise ValueError("No profile set. Provide profile in constructor or set profile attribute.")
|
|
21
|
+
|
|
22
|
+
# Convert profile to dict using Pydantic's model_dump
|
|
23
|
+
report_dict = self.profile.model_dump()
|
|
24
|
+
|
|
25
|
+
return json.dumps(report_dict, indent=indent, default=str)
|
|
26
|
+
|
|
27
|
+
def save_json(self, filepath: str):
|
|
28
|
+
"""Save JSON report to file."""
|
|
29
|
+
json_report = self.to_json()
|
|
30
|
+
|
|
31
|
+
with open(filepath, 'w') as f:
|
|
32
|
+
f.write(json_report)
|
|
33
|
+
|
|
34
|
+
def to_html(self) -> str:
|
|
35
|
+
"""Generate HTML report from profile result."""
|
|
36
|
+
if self.profile is None:
|
|
37
|
+
raise ValueError("No profile set.")
|
|
38
|
+
|
|
39
|
+
summary = self.generate_executive_summary()
|
|
40
|
+
|
|
41
|
+
html = self._generate_html_template(summary)
|
|
42
|
+
|
|
43
|
+
return html
|
|
44
|
+
|
|
45
|
+
def save_html(self, filepath: str):
|
|
46
|
+
"""Save HTML report to file."""
|
|
47
|
+
html_report = self.to_html()
|
|
48
|
+
|
|
49
|
+
with open(filepath, 'w') as f:
|
|
50
|
+
f.write(html_report)
|
|
51
|
+
|
|
52
|
+
def to_markdown(self) -> str:
|
|
53
|
+
"""Generate Markdown report from profile result."""
|
|
54
|
+
if self.profile is None:
|
|
55
|
+
raise ValueError("No profile set.")
|
|
56
|
+
|
|
57
|
+
summary = self.generate_executive_summary()
|
|
58
|
+
|
|
59
|
+
md = self._generate_markdown_template(summary)
|
|
60
|
+
|
|
61
|
+
return md
|
|
62
|
+
|
|
63
|
+
def save_markdown(self, filepath: str):
|
|
64
|
+
"""Save Markdown report to file."""
|
|
65
|
+
md_report = self.to_markdown()
|
|
66
|
+
|
|
67
|
+
with open(filepath, 'w') as f:
|
|
68
|
+
f.write(md_report)
|
|
69
|
+
|
|
70
|
+
def save_all_formats(self, directory: str, base_filename: str):
|
|
71
|
+
"""Save reports in all formats to a directory."""
|
|
72
|
+
dir_path = Path(directory)
|
|
73
|
+
dir_path.mkdir(parents=True, exist_ok=True)
|
|
74
|
+
|
|
75
|
+
self.save_json(str(dir_path / f"{base_filename}.json"))
|
|
76
|
+
self.save_html(str(dir_path / f"{base_filename}.html"))
|
|
77
|
+
self.save_markdown(str(dir_path / f"{base_filename}.md"))
|
|
78
|
+
|
|
79
|
+
def generate_executive_summary(self) -> Dict[str, Any]:
|
|
80
|
+
"""Generate executive summary of profiling results."""
|
|
81
|
+
if self.profile is None:
|
|
82
|
+
raise ValueError("No profile set.")
|
|
83
|
+
|
|
84
|
+
# Basic dataset info
|
|
85
|
+
summary = {
|
|
86
|
+
"dataset_name": self.profile.dataset_name,
|
|
87
|
+
"total_rows": self.profile.total_rows,
|
|
88
|
+
"total_columns": self.profile.total_columns,
|
|
89
|
+
"profiling_timestamp": self.profile.profiling_timestamp,
|
|
90
|
+
"profiling_duration_seconds": self.profile.profiling_duration_seconds,
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
# Column type breakdown
|
|
94
|
+
type_counts = {}
|
|
95
|
+
for col_profile in self.profile.column_profiles.values():
|
|
96
|
+
col_type = col_profile.configured_type.value
|
|
97
|
+
type_counts[col_type] = type_counts.get(col_type, 0) + 1
|
|
98
|
+
|
|
99
|
+
summary["column_types"] = type_counts
|
|
100
|
+
|
|
101
|
+
# Missing data summary
|
|
102
|
+
total_missing = 0
|
|
103
|
+
columns_with_missing = 0
|
|
104
|
+
|
|
105
|
+
for col_profile in self.profile.column_profiles.values():
|
|
106
|
+
if col_profile.universal_metrics.null_count > 0:
|
|
107
|
+
columns_with_missing += 1
|
|
108
|
+
total_missing += col_profile.universal_metrics.null_count
|
|
109
|
+
|
|
110
|
+
total_cells = self.profile.total_rows * self.profile.total_columns
|
|
111
|
+
missing_percentage = (total_missing / total_cells * 100) if total_cells > 0 else 0
|
|
112
|
+
|
|
113
|
+
summary["total_missing_cells"] = total_missing
|
|
114
|
+
summary["columns_with_missing"] = columns_with_missing
|
|
115
|
+
summary["missing_percentage"] = round(missing_percentage, 2)
|
|
116
|
+
|
|
117
|
+
# Quality score calculation (0-100)
|
|
118
|
+
quality_score = self._calculate_quality_score()
|
|
119
|
+
summary["quality_score"] = quality_score
|
|
120
|
+
|
|
121
|
+
# Memory usage estimate
|
|
122
|
+
total_memory = sum(
|
|
123
|
+
col_profile.universal_metrics.memory_size_bytes
|
|
124
|
+
for col_profile in self.profile.column_profiles.values()
|
|
125
|
+
if hasattr(col_profile.universal_metrics, 'memory_size_bytes') and
|
|
126
|
+
col_profile.universal_metrics.memory_size_bytes is not None
|
|
127
|
+
)
|
|
128
|
+
summary["estimated_memory_mb"] = round(total_memory / (1024 * 1024), 2) if total_memory > 0 else 0.0
|
|
129
|
+
|
|
130
|
+
return summary
|
|
131
|
+
|
|
132
|
+
def _calculate_quality_score(self) -> int:
|
|
133
|
+
"""Calculate overall data quality score (0-100)."""
|
|
134
|
+
if not self.profile or not self.profile.column_profiles:
|
|
135
|
+
return 0
|
|
136
|
+
|
|
137
|
+
penalties = 0
|
|
138
|
+
max_penalties = 100
|
|
139
|
+
|
|
140
|
+
for col_profile in self.profile.column_profiles.values():
|
|
141
|
+
metrics = col_profile.universal_metrics
|
|
142
|
+
|
|
143
|
+
# Penalize missing values
|
|
144
|
+
if metrics.null_percentage > 50:
|
|
145
|
+
penalties += 20
|
|
146
|
+
elif metrics.null_percentage > 20:
|
|
147
|
+
penalties += 10
|
|
148
|
+
elif metrics.null_percentage > 5:
|
|
149
|
+
penalties += 5
|
|
150
|
+
|
|
151
|
+
# Penalize constant columns
|
|
152
|
+
if metrics.distinct_count == 1:
|
|
153
|
+
penalties += 15
|
|
154
|
+
|
|
155
|
+
# Penalize very high cardinality (possible identifiers)
|
|
156
|
+
if metrics.distinct_percentage > 95 and col_profile.configured_type not in [
|
|
157
|
+
ColumnType.IDENTIFIER, ColumnType.TEXT
|
|
158
|
+
]:
|
|
159
|
+
penalties += 5
|
|
160
|
+
|
|
161
|
+
# Cap penalties at max
|
|
162
|
+
penalties = min(penalties, max_penalties)
|
|
163
|
+
|
|
164
|
+
return max(0, 100 - penalties)
|
|
165
|
+
|
|
166
|
+
def calculate_correlations(self, df: pd.DataFrame) -> Optional[Dict[str, Any]]:
|
|
167
|
+
"""Calculate correlation matrix for numeric columns."""
|
|
168
|
+
if self.profile is None:
|
|
169
|
+
return None
|
|
170
|
+
|
|
171
|
+
# Get numeric columns from profile
|
|
172
|
+
numeric_columns = [
|
|
173
|
+
col_name for col_name, col_profile in self.profile.column_profiles.items()
|
|
174
|
+
if col_profile.configured_type in [
|
|
175
|
+
ColumnType.NUMERIC_CONTINUOUS,
|
|
176
|
+
ColumnType.NUMERIC_DISCRETE
|
|
177
|
+
]
|
|
178
|
+
]
|
|
179
|
+
|
|
180
|
+
if len(numeric_columns) < 2:
|
|
181
|
+
return None
|
|
182
|
+
|
|
183
|
+
# Filter dataframe to numeric columns that exist
|
|
184
|
+
numeric_cols_in_df = [col for col in numeric_columns if col in df.columns]
|
|
185
|
+
|
|
186
|
+
if len(numeric_cols_in_df) < 2:
|
|
187
|
+
return None
|
|
188
|
+
|
|
189
|
+
# Calculate correlations
|
|
190
|
+
corr_matrix = df[numeric_cols_in_df].corr()
|
|
191
|
+
|
|
192
|
+
# Convert to dictionary
|
|
193
|
+
correlations = {
|
|
194
|
+
"matrix": corr_matrix.to_dict(),
|
|
195
|
+
"high_correlations": []
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
# Find high correlations (>0.8 or <-0.8)
|
|
199
|
+
for i, col1 in enumerate(numeric_cols_in_df):
|
|
200
|
+
for col2 in numeric_cols_in_df[i + 1:]:
|
|
201
|
+
corr_value = corr_matrix.loc[col1, col2]
|
|
202
|
+
if abs(corr_value) > 0.8:
|
|
203
|
+
correlations["high_correlations"].append({
|
|
204
|
+
"column1": col1,
|
|
205
|
+
"column2": col2,
|
|
206
|
+
"correlation": round(corr_value, 3)
|
|
207
|
+
})
|
|
208
|
+
|
|
209
|
+
return correlations
|
|
210
|
+
|
|
211
|
+
def _generate_html_template(self, summary: Dict[str, Any]) -> str:
    """Generate the complete HTML profiling report.

    Text values (dataset name, timestamp, column-type labels) are
    HTML-escaped before being inserted so that markup characters in them
    cannot alter the page.

    Args:
        summary: Summary dict; the keys read here are
            ``profiling_timestamp``, ``total_rows``, ``total_columns``,
            ``missing_percentage``, ``quality_score`` and ``column_types``
            (a mapping of type label to count).

    Returns:
        The HTML document as a single string, including one section per
        entry of ``self.profile.column_profiles``.
    """
    # Imported locally so the block does not depend on a file-level import.
    from html import escape

    dataset_name = escape(str(self.profile.dataset_name))
    generated_on = escape(str(summary['profiling_timestamp']))

    parts = [f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Profiling Report - {dataset_name}</title>
    <style>
        body {{
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Arial, sans-serif;
            line-height: 1.6;
            max-width: 1200px;
            margin: 0 auto;
            padding: 20px;
            background-color: #f5f5f5;
        }}
        .header {{
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 30px;
            border-radius: 10px;
            margin-bottom: 30px;
        }}
        .summary {{
            background: white;
            padding: 25px;
            border-radius: 10px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
            margin-bottom: 30px;
        }}
        .summary h2 {{
            margin-top: 0;
            color: #333;
        }}
        .metric {{
            display: inline-block;
            margin: 10px 20px 10px 0;
        }}
        .metric-label {{
            font-size: 0.9em;
            color: #666;
        }}
        .metric-value {{
            font-size: 1.5em;
            font-weight: bold;
            color: #667eea;
        }}
        .quality-score {{
            font-size: 3em;
            font-weight: bold;
            color: {self._get_quality_color(summary['quality_score'])};
            text-align: center;
        }}
        .column-section {{
            background: white;
            padding: 20px;
            border-radius: 10px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
            margin-bottom: 20px;
        }}
        .column-header {{
            font-size: 1.3em;
            color: #333;
            border-bottom: 2px solid #667eea;
            padding-bottom: 10px;
            margin-bottom: 15px;
        }}
        .column-type {{
            display: inline-block;
            background: #667eea;
            color: white;
            padding: 3px 10px;
            border-radius: 5px;
            font-size: 0.8em;
            margin-left: 10px;
        }}
        table {{
            width: 100%;
            border-collapse: collapse;
            margin: 15px 0;
        }}
        th, td {{
            text-align: left;
            padding: 10px;
            border-bottom: 1px solid #ddd;
        }}
        th {{
            background-color: #f8f9fa;
            font-weight: 600;
        }}
        .progress-bar {{
            width: 100%;
            height: 20px;
            background: #e9ecef;
            border-radius: 10px;
            overflow: hidden;
        }}
        .progress-fill {{
            height: 100%;
            background: #667eea;
            transition: width 0.3s ease;
        }}
    </style>
</head>
<body>
    <div class="header">
        <h1>Data Profiling Report</h1>
        <p>{dataset_name}</p>
        <p style="font-size: 0.9em; opacity: 0.9;">Generated on {generated_on}</p>
    </div>

    <div class="summary">
        <h2>Executive Summary</h2>
        <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 20px;">
            <div class="metric">
                <div class="metric-label">Total Rows</div>
                <div class="metric-value">{summary['total_rows']:,}</div>
            </div>
            <div class="metric">
                <div class="metric-label">Total Columns</div>
                <div class="metric-value">{summary['total_columns']}</div>
            </div>
            <div class="metric">
                <div class="metric-label">Missing Data</div>
                <div class="metric-value">{summary['missing_percentage']}%</div>
            </div>
            <div class="metric">
                <div class="metric-label">Quality Score</div>
                <div class="quality-score">{summary['quality_score']}</div>
            </div>
        </div>

        <h3>Column Types</h3>
        <table>
            <tr>
                <th>Type</th>
                <th>Count</th>
            </tr>
"""]

    # One table row per column type.
    for col_type, count in summary['column_types'].items():
        parts.append(f"""            <tr>
                <td>{escape(str(col_type))}</td>
                <td>{count}</td>
            </tr>
""")

    parts.append("""        </table>
    </div>

    <h2>Column Details</h2>
""")

    # One detail section per profiled column.
    for col_name, col_profile in self.profile.column_profiles.items():
        parts.append(self._generate_column_section_html(col_name, col_profile))

    parts.append("""
</body>
</html>""")

    return "".join(parts)
|
|
374
|
+
|
|
375
|
+
def _generate_column_section_html(self, col_name: str, col_profile) -> str:
    """Generate the HTML section for a single column.

    The column name, the configured type label and the category values are
    HTML-escaped, since they originate from the profiled data and may
    contain markup characters.

    Args:
        col_name: Name of the column, shown as the section header.
        col_profile: Profile object for the column. This method reads
            ``universal_metrics``, ``configured_type.value`` and, when
            set, ``numeric_metrics`` or ``categorical_metrics``.

    Returns:
        An HTML fragment with the metrics table for the column.
    """
    # Imported locally so the block does not depend on a file-level import.
    from html import escape

    metrics = col_profile.universal_metrics

    parts = [f"""
    <div class="column-section">
        <div class="column-header">
            {escape(str(col_name))}
            <span class="column-type">{escape(str(col_profile.configured_type.value))}</span>
        </div>

        <table>
            <tr>
                <th>Metric</th>
                <th>Value</th>
            </tr>
            <tr>
                <td>Total Count</td>
                <td>{metrics.total_count:,}</td>
            </tr>
            <tr>
                <td>Missing Values</td>
                <td>{metrics.null_count:,} ({metrics.null_percentage:.1f}%)</td>
            </tr>
            <tr>
                <td>Unique Values</td>
                <td>{metrics.distinct_count:,} ({metrics.distinct_percentage:.1f}%)</td>
            </tr>
"""]

    # Type-specific rows: numeric metrics take precedence over categorical.
    if col_profile.numeric_metrics:
        nm = col_profile.numeric_metrics
        parts.append(f"""            <tr>
                <td>Mean</td>
                <td>{nm.mean:.2f}</td>
            </tr>
            <tr>
                <td>Std Dev</td>
                <td>{nm.std:.2f}</td>
            </tr>
            <tr>
                <td>Min / Max</td>
                <td>{nm.min_value:.2f} / {nm.max_value:.2f}</td>
            </tr>
""")

    elif col_profile.categorical_metrics:
        cm = col_profile.categorical_metrics
        # Only the first five categories are listed.
        top_cats = ', '.join(
            f"{escape(str(cat))}({count})" for cat, count in cm.top_categories[:5]
        )
        parts.append(f"""            <tr>
                <td>Cardinality</td>
                <td>{cm.cardinality}</td>
            </tr>
            <tr>
                <td>Top Categories</td>
                <td>{top_cats}</td>
            </tr>
""")

    parts.append("""        </table>
    </div>
""")

    return "".join(parts)
|
|
440
|
+
|
|
441
|
+
def _generate_markdown_template(self, summary: Dict[str, Any]) -> str:
    """Build the Markdown profiling report.

    The report consists of a header with the executive summary, a table of
    column-type counts taken from ``summary['column_types']`` and one
    section per entry of ``self.profile.column_profiles``.

    Args:
        summary: Summary dict; the keys read here are
            ``profiling_timestamp``, ``profiling_duration_seconds``,
            ``total_rows``, ``total_columns``, ``missing_percentage``,
            ``quality_score`` and ``column_types``.

    Returns:
        The Markdown document as a single string.
    """
    profile = self.profile

    # Header, executive summary and the head of the column-types table.
    header = f"""# Data Profiling Report

## Dataset: {profile.dataset_name}

**Generated:** {summary['profiling_timestamp']}
**Duration:** {summary['profiling_duration_seconds']:.2f} seconds

---

## Executive Summary

| Metric | Value |
|--------|-------|
| Total Rows | {summary['total_rows']:,} |
| Total Columns | {summary['total_columns']} |
| Missing Data | {summary['missing_percentage']}% |
| Quality Score | **{summary['quality_score']}/100** |

### Column Types

| Type | Count |
|------|-------|
"""

    type_rows = "".join(
        f"| {col_type} | {count} |\n"
        for col_type, count in summary['column_types'].items()
    )

    # Per-column detail sections, in profile order.
    column_sections = "".join(
        self._generate_column_section_markdown(col_name, col_profile)
        for col_name, col_profile in profile.column_profiles.items()
    )

    return header + type_rows + "\n---\n\n## Column Details\n\n" + column_sections
|
|
477
|
+
|
|
478
|
+
def _generate_column_section_markdown(self, col_name: str, col_profile) -> str:
    """Generate the Markdown section for a single column.

    Category values are sanitised before being placed in the table: a
    literal ``|`` is escaped and line breaks are replaced by spaces, since
    either would otherwise break the Markdown table row.

    Args:
        col_name: Name of the column, used as the section heading.
        col_profile: Profile object for the column. This method reads
            ``universal_metrics``, ``configured_type.value`` and, when
            set, ``numeric_metrics`` or ``categorical_metrics``.

    Returns:
        A Markdown fragment: heading, metrics table and a trailing blank
        line.
    """
    def _table_cell(value) -> str:
        """Make *value* safe for use inside a Markdown table cell."""
        text = str(value).replace("|", "\\|")
        return text.replace("\r", " ").replace("\n", " ")

    metrics = col_profile.universal_metrics

    md = f"""### {col_name} `({col_profile.configured_type.value})`

| Metric | Value |
|--------|-------|
| Total Count | {metrics.total_count:,} |
| Missing Values | {metrics.null_count:,} ({metrics.null_percentage:.1f}%) |
| Unique Values | {metrics.distinct_count:,} ({metrics.distinct_percentage:.1f}%) |
"""

    # Type-specific rows: numeric metrics take precedence over categorical.
    if col_profile.numeric_metrics:
        nm = col_profile.numeric_metrics
        md += f"""| Mean | {nm.mean:.2f} |
| Std Dev | {nm.std:.2f} |
| Min / Max | {nm.min_value:.2f} / {nm.max_value:.2f} |
| Median | {nm.median:.2f} |
"""

    elif col_profile.categorical_metrics:
        cm = col_profile.categorical_metrics
        # Only the first five categories are listed.
        top_cats = ', '.join(
            f"{_table_cell(cat)}({count})" for cat, count in cm.top_categories[:5]
        )
        md += f"""| Cardinality | {cm.cardinality} |
| Top Categories | {top_cats} |
"""

    md += "\n"

    return md
|
|
510
|
+
|
|
511
|
+
def _get_quality_color(self, score: int) -> str:
    """Map a quality score to the hex colour used to display it.

    Args:
        score: Quality score to classify.

    Returns:
        Green for scores of 90 and above, yellow from 70, orange from 50,
        and red for anything lower.
    """
    # (lower bound, colour) pairs, checked from the highest band down.
    bands = (
        (90, "#28a745"),  # Green
        (70, "#ffc107"),  # Yellow
        (50, "#fd7e14"),  # Orange
    )
    for lower_bound, colour in bands:
        if score >= lower_bound:
            return colour
    return "#dc3545"  # Red
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
from dataclasses import asdict, dataclass
|
|
2
|
+
from typing import Any, Dict, Optional
|
|
3
|
+
|
|
4
|
+
from customer_retention.core.compat import pd
|
|
5
|
+
from customer_retention.core.config import DataSourceConfig
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class SCDResult:
    """Container for the outcome of an SCD analysis of one column."""

    # Name of the analysed column.
    column_name: str
    # Whether any change was detected for the column.
    changes_detected: bool
    # Number of entities for which a change was seen.
    entities_with_change: int
    # Share of entities with a change, as a percentage.
    change_percentage: float
    # Largest number of changes seen for a single entity.
    max_changes: int
    # Average number of changes per entity.
    avg_changes_per_entity: float
    # Textual SCD type recommendation.
    scd_type_recommendation: str

    def to_dict(self) -> dict:
        """Return the fields of this result as a plain dictionary."""
        as_mapping = asdict(self)
        return as_mapping
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class SCDAnalyzer:
    """Analyzes Slowly Changing Dimension patterns in data."""

    def __init__(self, entity_key: Optional[str] = None):
        """
        Initialize SCD Analyzer.

        Args:
            entity_key: Column name that identifies unique entities (e.g., customer_id)
        """
        self.entity_key = entity_key

    def analyze(self, df: pd.DataFrame, columns: Optional[list] = None) -> Dict[str, Dict[str, Any]]:
        """
        Analyze SCD patterns in dataframe.

        Args:
            df: DataFrame with multi-row per entity data
            columns: List of columns to analyze (if None, analyze all except entity_key)

        Returns:
            Dictionary mapping column names to SCD metrics

        Raises:
            ValueError: If ``entity_key`` is not set or is not a column of ``df``.
        """
        if self.entity_key is None:
            raise ValueError("entity_key must be set to analyze SCD patterns")

        if self.entity_key not in df.columns:
            raise ValueError(f"entity_key '{self.entity_key}' not found in dataframe")

        # Determine columns to analyze
        if columns is None:
            columns = [col for col in df.columns if col != self.entity_key]

        return {column: self._analyze_column(df, column) for column in columns}

    def _analyze_column(self, df: pd.DataFrame, column: str) -> Dict[str, Any]:
        """Analyze SCD pattern for a single column.

        Returns a dict with the keys ``changes_detected``,
        ``entities_with_change``, ``total_entities``, ``change_percentage``,
        ``max_changes``, ``avg_changes_per_entity`` and
        ``scd_type_recommendation``.
        """
        # Distinct (non-null) values per entity.
        entity_changes = df.groupby(self.entity_key)[column].nunique()

        # Entities with more than 1 value = changes detected
        entities_with_change = (entity_changes > 1).sum()
        total_entities = len(entity_changes)

        change_percentage = (entities_with_change / total_entities * 100) if total_entities > 0 else 0.0

        # Max changes for any entity. Clamped at 0: when every entity has
        # only null values the distinct count is 0, and "distinct - 1"
        # would otherwise report a negative number of changes.
        max_changes = max(0, int(entity_changes.max() - 1)) if total_entities > 0 else 0

        # Average changes per entity (only for entities with changes)
        avg_changes = float(entity_changes[entity_changes > 1].mean() - 1) if entities_with_change > 0 else 0.0

        metrics = {
            "changes_detected": bool(entities_with_change > 0),  # Convert numpy bool to Python bool
            "entities_with_change": int(entities_with_change),
            "total_entities": int(total_entities),
            "change_percentage": round(change_percentage, 2),
            "max_changes": max_changes,
            "avg_changes_per_entity": round(avg_changes, 2),
        }

        # Add SCD type recommendation
        metrics["scd_type_recommendation"] = self.recommend_scd_type(metrics)

        return metrics

    def recommend_scd_type(self, metrics: Dict[str, Any]) -> str:
        """
        Recommend SCD type based on change patterns.

        Args:
            metrics: Metrics dict; reads ``changes_detected``,
                ``change_percentage`` and, if present,
                ``avg_changes_per_entity`` (defaults to 0).

        Returns:
            String describing recommended SCD type
        """
        if not metrics["changes_detected"]:
            return "Type 0 (Static - Never changes)"

        change_pct = metrics["change_percentage"]
        avg_changes = metrics.get("avg_changes_per_entity", 0)

        # Type 1: Rare changes, only current value matters
        if change_pct < 10 and avg_changes < 2:
            return "Type 1 (Overwrite - Rare changes, history not important)"

        # Type 2: Frequent changes, history matters
        elif change_pct >= 30 or avg_changes >= 3:
            return "Type 2 (Track History - Frequent changes, full history needed)"

        # Type 3: Moderate changes, only previous value matters
        elif change_pct < 30 and avg_changes < 3:
            return "Type 3 (Keep Previous - Only previous value matters)"

        # Default: only reachable when the values above are not comparable
        # (e.g. NaN), because the two preceding conditions are complementary
        # for ordinary numbers.
        return "Type 2 (Track History - Moderate to frequent changes)"

    def analyze_with_config(self, df: pd.DataFrame, config: "DataSourceConfig") -> Dict[str, Dict[str, Any]]:
        """
        Analyze SCD patterns using configuration.

        Note that this sets ``self.entity_key`` to ``config.primary_key``.

        Args:
            df: DataFrame to analyze
            config: DataSourceConfig with entity key information

        Returns:
            Dictionary of SCD metrics per column
        """
        # Use primary key as entity key
        self.entity_key = config.primary_key

        # Analyze all columns except primary key
        columns = [col.name for col in config.columns if col.name != config.primary_key]

        return self.analyze(df, columns)

    def to_dataframe(self, results: Dict[str, Dict[str, Any]]) -> pd.DataFrame:
        """Convert SCD analysis results to a summary DataFrame.

        Each entry of ``results`` becomes one row: a ``column`` field with
        the column name, followed by that column's metrics.
        """
        rows = [{"column": column_name, **metrics} for column_name, metrics in results.items()]
        return pd.DataFrame(rows)
|