churnkit 0.75.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0
|
@@ -0,0 +1,1632 @@
|
|
|
1
|
+
from typing import Any, Optional
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel
|
|
4
|
+
|
|
5
|
+
from customer_retention.core.compat import is_datetime64_any_dtype, pd
|
|
6
|
+
from customer_retention.core.components.enums import Severity
|
|
7
|
+
from customer_retention.core.config import ColumnType
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class QualityCheckResult(BaseModel):
    """Outcome of one data-quality check evaluated against one column."""

    # Stable identifier of the check (e.g. "FQ001", "CAT001", "TG005").
    check_id: str
    # Human-readable name of the check.
    check_name: str
    # Column the check was evaluated against.
    column_name: str
    # True when the column satisfies the check.
    passed: bool
    # Severity assigned to this particular result; checks may override
    # their default severity per call (see QualityCheck.create_result).
    severity: Severity
    # Human-readable summary of the outcome.
    message: str
    # Supporting metrics (counts, percentages, ...) backing the verdict.
    # NOTE: the mutable {} default is safe here because pydantic copies
    # field defaults per instance (unlike plain class attributes).
    details: dict[str, Any] = {}
    # Optional remediation advice, populated mostly on failures.
    recommendation: Optional[str] = None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class QualityCheck:
    """Base class for column-level quality checks.

    Subclasses configure an identity (id, name, default severity) at
    construction time and call ``create_result`` to emit uniformly
    stamped ``QualityCheckResult`` records.
    """

    def __init__(self, check_id: str, check_name: str, severity: Severity):
        self.check_id = check_id
        self.check_name = check_name
        # Default severity; individual results may override it.
        self.severity = severity

    def create_result(self, column_name: str, passed: bool, message: str,
                      details: dict = None, recommendation: str = None,
                      severity: Optional[Severity] = None) -> QualityCheckResult:
        """Assemble a result record stamped with this check's identity.

        ``severity`` falls back to the check's default when not supplied;
        ``details`` falls back to an empty dict.
        """
        payload = {
            "check_id": self.check_id,
            "check_name": self.check_name,
            "column_name": column_name,
            "passed": passed,
            "severity": severity or self.severity,
            "message": message,
            "details": details or {},
            "recommendation": recommendation,
        }
        return QualityCheckResult(**payload)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class MissingValueCheck(QualityCheck):
    """Grades a column's missing-value share into four tiers (FQ001).

    Critical (>95%) and high (>70%) tiers fail the check; medium (>20%)
    passes but is reported with MEDIUM severity; anything lower passes
    with the default severity.
    """

    def __init__(self):
        super().__init__("FQ001", "Column has >95% missing", Severity.CRITICAL)
        # Tier boundaries, expressed as percentages of null values.
        self.threshold_critical = 95.0
        self.threshold_high = 70.0
        self.threshold_medium = 20.0

    def run(self, column_name: str, universal_metrics: Any) -> QualityCheckResult:
        """Evaluate the null percentage of one column and tier the result."""
        null_pct = universal_metrics.null_percentage
        # Same supporting metrics are attached regardless of tier.
        stats = {"null_percentage": null_pct, "null_count": universal_metrics.null_count}

        if null_pct > self.threshold_critical:
            return self.create_result(
                column_name, False,
                f"Critical: {null_pct}% missing values (>{self.threshold_critical}%)",
                stats,
                "Consider imputation strategy or feature removal if not informative",
                Severity.CRITICAL,
            )
        if null_pct > self.threshold_high:
            return self.create_result(
                column_name, False,
                f"High: {null_pct}% missing values (>{self.threshold_high}%)",
                stats,
                "Review imputation strategy or investigate data collection issues",
                Severity.HIGH,
            )
        if null_pct > self.threshold_medium:
            # Note: this tier still passes, but carries MEDIUM severity.
            return self.create_result(
                column_name, True,
                f"Medium: {null_pct}% missing values (>{self.threshold_medium}%)",
                stats,
                "Monitor missingness pattern and consider simple imputation",
                Severity.MEDIUM,
            )
        return self.create_result(
            column_name, True,
            f"Acceptable missing values: {null_pct}%",
            stats,
        )
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class HighCardinalityCheck(QualityCheck):
    """Warns when a categorical column is nearly all-unique (CAT001)."""

    def __init__(self):
        super().__init__("CAT001", "High Cardinality Categorical", Severity.MEDIUM)
        # Fraction of distinct-to-total values above which the column fails.
        self.threshold_ratio = 0.95

    def run(self, column_name: str, categorical_metrics: Any) -> Optional[QualityCheckResult]:
        """Check the cardinality ratio; returns None when metrics are absent."""
        if categorical_metrics is None:
            return None

        ratio = categorical_metrics.cardinality_ratio
        info = {"cardinality": categorical_metrics.cardinality, "cardinality_ratio": ratio}

        if ratio > self.threshold_ratio:
            return self.create_result(
                column_name, False,
                f"Very high cardinality ratio: {ratio:.2%}",
                info,
                f"Consider using {categorical_metrics.encoding_recommendation} encoding or treating as text",
            )
        return self.create_result(
            column_name, True,
            f"Acceptable cardinality ratio: {ratio:.2%}",
            info,
        )
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class LowCardinalityCheck(QualityCheck):
    """Flags numeric columns with suspiciously few distinct values (NUM001)."""

    def __init__(self):
        super().__init__("NUM001", "Low Cardinality Numeric", Severity.LOW)
        # Minimum distinct values expected of a genuinely numeric feature.
        self.threshold = 10

    def run(self, column_name: str, universal_metrics: Any, column_type: ColumnType) -> Optional[QualityCheckResult]:
        """Only applies to numeric columns; returns None for other types."""
        numeric_types = (ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE)
        if column_type not in numeric_types:
            return None

        n_unique = universal_metrics.distinct_count
        if n_unique < self.threshold:
            return self.create_result(
                column_name, False,
                f"Low cardinality for numeric: {n_unique} unique values",
                {"distinct_count": n_unique},
                "Consider treating as categorical or ordinal feature",
            )
        return self.create_result(
            column_name, True,
            f"Acceptable cardinality for numeric: {n_unique} unique values",
            {"distinct_count": n_unique},
        )
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
class ConstantFeatureCheck(QualityCheck):
    """Detects columns that carry exactly one distinct value (FQ003)."""

    def __init__(self):
        super().__init__("FQ003", "Column is constant", Severity.CRITICAL)
        # NOTE(review): not referenced by run(); presumably kept for
        # interface parity with other checks — confirm before removing.
        self.threshold_ratio = 1.0

    def run(self, column_name: str, universal_metrics: Any, column_type: Optional[ColumnType] = None) -> QualityCheckResult:
        """Empty columns pass trivially; a single distinct value fails hard."""
        if universal_metrics.total_count == 0:
            return self.create_result(column_name, True, "Empty column", {})

        n_distinct = universal_metrics.distinct_count
        if n_distinct != 1:
            return self.create_result(
                column_name, True,
                f"Column has {n_distinct} distinct values",
                {"distinct_count": n_distinct},
            )

        constant = universal_metrics.most_common_value
        return self.create_result(
            column_name, False,
            f"Column is constant: only 1 distinct value ({constant})",
            {"distinct_count": 1, "constant_value": constant},
            "CRITICAL: Remove constant column - provides no information for modeling",
        )
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
class ImbalancedTargetCheck(QualityCheck):
    """Grades target class imbalance into severe/moderate/acceptable (CAT002)."""

    def __init__(self):
        super().__init__("CAT002", "Imbalanced Target Variable", Severity.HIGH)
        # Majority:minority ratio boundaries for the two failing tiers.
        self.threshold_severe = 20.0
        self.threshold_moderate = 5.0

    def run(self, column_name: str, target_metrics: Any) -> Optional[QualityCheckResult]:
        """Evaluate imbalance ratio; returns None when metrics are absent."""
        if target_metrics is None:
            return None

        ratio = target_metrics.imbalance_ratio
        minority_pct = target_metrics.minority_percentage
        # Failing tiers also report which class is the minority.
        failing_details = {
            "imbalance_ratio": ratio,
            "minority_percentage": minority_pct,
            "minority_class": target_metrics.minority_class,
        }

        if ratio > self.threshold_severe:
            return self.create_result(
                column_name, False,
                f"Severe imbalance: {ratio:.1f}:1 ratio, minority class {minority_pct}%",
                failing_details,
                "Apply SMOTE, class weights, or stratified sampling",
            )
        if ratio > self.threshold_moderate:
            return self.create_result(
                column_name, False,
                f"Moderate imbalance: {ratio:.1f}:1 ratio, minority class {minority_pct}%",
                failing_details,
                "Consider class weights or balanced sampling",
            )
        return self.create_result(
            column_name, True,
            f"Acceptable balance: {ratio:.1f}:1 ratio, minority class {minority_pct}%",
            {"imbalance_ratio": ratio, "minority_percentage": minority_pct},
        )
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
class TargetNullCheck(QualityCheck):
    """Fails when the target column contains any null values (TG001)."""

    def __init__(self):
        super().__init__("TG001", "Target Contains Nulls", Severity.CRITICAL)

    def run(self, column_name: str, universal_metrics: Any) -> Optional[QualityCheckResult]:
        """Check the target's null count; returns None when metrics are absent."""
        if universal_metrics is None:
            return None

        nulls = universal_metrics.null_count
        if nulls > 0:
            return self.create_result(
                column_name, False,
                f"Target variable contains {nulls} null values ({universal_metrics.null_percentage}%)",
                {"null_count": nulls, "null_percentage": universal_metrics.null_percentage},
                "CRITICAL: Target variable must not contain nulls. Remove or impute before modeling.",
            )
        return self.create_result(
            column_name, True,
            "Target variable has no null values",
            {"null_count": 0},
        )
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
class SingleClassTargetCheck(QualityCheck):
    """Fails when the target variable holds exactly one class (TG005)."""

    def __init__(self):
        super().__init__("TG005", "Single Class Target", Severity.CRITICAL)

    def run(self, column_name: str, target_metrics: Any) -> Optional[QualityCheckResult]:
        """Check the number of target classes; returns None without metrics."""
        if target_metrics is None:
            return None

        n_classes = target_metrics.n_classes
        if n_classes == 1:
            class_labels = list(target_metrics.class_distribution.keys())
            return self.create_result(
                column_name, False,
                f"Target variable has only 1 class: {class_labels[0]}",
                {"n_classes": 1, "classes": class_labels},
                "CRITICAL: Cannot train a classifier with only one class. Check data filtering or sampling.",
            )
        return self.create_result(
            column_name, True,
            f"Target variable has {n_classes} classes",
            {"n_classes": n_classes},
        )
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
class TargetSevereImbalanceCheck(QualityCheck):
    """TG002: flags a minority class below 1% of the target distribution."""

    def __init__(self):
        super().__init__("TG002", "Target Severe Imbalance", Severity.HIGH)
        self.threshold = 1.0  # minority share below this percentage is severe

    def run(self, column_name: str, target_metrics: Any) -> Optional[QualityCheckResult]:
        """Compare the minority-class percentage against the severe threshold."""
        if target_metrics is None:
            return None

        share = target_metrics.minority_percentage

        if share < self.threshold:
            details = {
                "minority_percentage": share,
                "minority_class": target_metrics.minority_class,
                "imbalance_ratio": target_metrics.imbalance_ratio,
            }
            return self.create_result(
                column_name, False,
                f"Target has severe class imbalance: minority class {share}% (< {self.threshold}%)",
                details,
                "Apply SMOTE, class weights, or consider alternative algorithms (e.g., anomaly detection)."
            )
        return self.create_result(
            column_name, True,
            f"Minority class at {share}% (>= {self.threshold}%)",
            {"minority_percentage": share}
        )
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
class TargetModerateImbalanceCheck(QualityCheck):
    """TG003: flags a minority class below 10% of the target distribution."""

    def __init__(self):
        super().__init__("TG003", "Target Moderate Imbalance", Severity.MEDIUM)
        self.threshold = 10.0  # minority share below this percentage is moderate

    def run(self, column_name: str, target_metrics: Any) -> Optional[QualityCheckResult]:
        """Compare the minority-class percentage against the moderate threshold."""
        if target_metrics is None:
            return None

        share = target_metrics.minority_percentage

        if share < self.threshold:
            details = {
                "minority_percentage": share,
                "minority_class": target_metrics.minority_class,
                "imbalance_ratio": target_metrics.imbalance_ratio,
            }
            return self.create_result(
                column_name, False,
                f"Target has moderate class imbalance: minority class {share}% (< {self.threshold}%)",
                details,
                "Consider class weights, stratified sampling, or balanced algorithms."
            )
        return self.create_result(
            column_name, True,
            f"Minority class at {share}% (>= {self.threshold}%)",
            {"minority_percentage": share}
        )
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
class TargetUnexpectedClassesCheck(QualityCheck):
    """TG004: verifies the observed class count matches a configured expectation."""

    def __init__(self, expected_classes: Optional[int] = None):
        super().__init__("TG004", "Target Unexpected Classes", Severity.HIGH)
        # When None, the check is skipped entirely.
        self.expected_classes = expected_classes

    def run(self, column_name: str, target_metrics: Any) -> Optional[QualityCheckResult]:
        """Skip when no expectation is configured; fail on a class-count mismatch."""
        if target_metrics is None or self.expected_classes is None:
            return None

        observed = target_metrics.n_classes

        if observed == self.expected_classes:
            return self.create_result(
                column_name, True,
                f"Target has expected {observed} classes",
                {"n_classes": observed}
            )
        return self.create_result(
            column_name, False,
            f"Target has {observed} classes, expected {self.expected_classes}",
            {"n_classes": observed, "expected_classes": self.expected_classes,
             "classes": list(target_metrics.class_distribution.keys())},
            "Investigate class mismatch. Check for data leakage, incorrect filtering, or configuration error."
        )
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
class SkewnessCheck(QualityCheck):
    """NUM002: grades |skewness| against moderate (1.0) and severe (3.0) tiers."""

    def __init__(self):
        super().__init__("NUM002", "Extreme Skewness", Severity.MEDIUM)
        self.threshold_severe = 3.0
        self.threshold_moderate = 1.0

    def run(self, column_name: str, numeric_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a tiered result based on the magnitude of the skewness."""
        if numeric_metrics is None or numeric_metrics.skewness is None:
            return None

        signed = numeric_metrics.skewness
        magnitude = abs(signed)

        if magnitude > self.threshold_severe:
            return self.create_result(
                column_name, False,
                f"Extreme skewness: {signed:.2f}",
                {"skewness": signed},
                "Apply log, sqrt, or Box-Cox transformation"
            )
        if magnitude > self.threshold_moderate:
            return self.create_result(
                column_name, False,
                f"Moderate skewness: {signed:.2f}",
                {"skewness": signed},
                "Consider transformation for linear models"
            )
        return self.create_result(
            column_name, True,
            f"Acceptable skewness: {signed:.2f}",
            {"skewness": signed}
        )
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
class OutlierCheck(QualityCheck):
    """NUM003: grades the IQR-based outlier share against 5% and 10% tiers."""

    def __init__(self):
        super().__init__("NUM003", "Excessive Outliers", Severity.MEDIUM)
        self.threshold_high = 10.0
        self.threshold_medium = 5.0

    def run(self, column_name: str, numeric_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a tiered result based on the outlier percentage."""
        if numeric_metrics is None:
            return None

        pct = numeric_metrics.outlier_percentage

        if pct > self.threshold_high:
            return self.create_result(
                column_name, False,
                f"High outlier percentage: {pct}%",
                {"outlier_count_iqr": numeric_metrics.outlier_count_iqr,
                 "outlier_percentage": pct},
                "Review outliers for data quality issues or apply winsorization/clipping"
            )
        if pct > self.threshold_medium:
            return self.create_result(
                column_name, False,
                f"Moderate outlier percentage: {pct}%",
                {"outlier_count_iqr": numeric_metrics.outlier_count_iqr,
                 "outlier_percentage": pct},
                "Consider robust scaling or outlier treatment"
            )
        return self.create_result(
            column_name, True,
            f"Acceptable outlier percentage: {pct}%",
            {"outlier_percentage": pct}
        )
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
class ZeroInflationCheck(QualityCheck):
    """NUM004: flags features whose values are more than half zeros."""

    def __init__(self):
        super().__init__("NUM004", "Zero-Inflated Feature", Severity.LOW)
        self.threshold = 50.0

    def run(self, column_name: str, numeric_metrics: Any) -> Optional[QualityCheckResult]:
        """Fail when the zero percentage exceeds the threshold."""
        if numeric_metrics is None:
            return None

        share = numeric_metrics.zero_percentage

        if share > self.threshold:
            return self.create_result(
                column_name, False,
                f"Zero-inflated: {share}% zeros",
                {"zero_count": numeric_metrics.zero_count, "zero_percentage": share},
                "Consider zero-inflated models or separate binary indicator"
            )
        return self.create_result(
            column_name, True,
            f"Acceptable zero percentage: {share}%",
            {"zero_percentage": share}
        )
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
class IdentifierLeakageCheck(QualityCheck):
    """LEAK001: identifier columns must never be used as model features."""

    def __init__(self):
        super().__init__("LEAK001", "Identifier Column in Features", Severity.CRITICAL)

    def run(self, column_name: str, column_type: ColumnType, should_use_as_feature: bool) -> Optional[QualityCheckResult]:
        """Only applies to IDENTIFIER columns; fail when one is flagged as a feature."""
        if column_type != ColumnType.IDENTIFIER:
            return None

        details = {"column_type": column_type.value}
        if not should_use_as_feature:
            return self.create_result(
                column_name, True,
                "Identifier correctly excluded from features",
                details
            )
        return self.create_result(
            column_name, False,
            "Identifier column marked as feature",
            details,
            "Remove identifier from feature set to prevent data leakage"
        )
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
class DatetimeFutureLeakageCheck(QualityCheck):
    """DT001: any date in the future is a quality/leakage red flag."""

    def __init__(self):
        super().__init__("DT001", "Future Dates Detected", Severity.HIGH)

    def run(self, column_name: str, datetime_metrics: Any) -> Optional[QualityCheckResult]:
        """Fail when the metrics report any future-dated values."""
        if datetime_metrics is None:
            return None

        n_future = datetime_metrics.future_date_count

        if not n_future > 0:
            return self.create_result(
                column_name, True,
                "No future dates detected",
                {"future_date_count": 0}
            )
        return self.create_result(
            column_name, False,
            f"Found {n_future} future dates",
            {"future_date_count": n_future},
            "Investigate data quality issues or potential temporal leakage"
        )
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
class PlaceholderDateCheck(QualityCheck):
    """DT002: flags sentinel/placeholder dates reported by the datetime metrics."""

    def __init__(self):
        super().__init__("DT002", "Placeholder Dates", Severity.MEDIUM)
        # NOTE(review): this is compared against a 0-100 percentage below, so it
        # means 0.05% of rows — confirm it was not intended as a 5% fraction.
        self.threshold = 0.05

    def run(self, column_name: str, datetime_metrics: Any, total_count: int) -> Optional[QualityCheckResult]:
        """Fail when the placeholder share of total_count exceeds the threshold."""
        if datetime_metrics is None or total_count == 0:
            return None

        n_placeholder = datetime_metrics.placeholder_count
        share = (n_placeholder / total_count) * 100

        if share > self.threshold:
            return self.create_result(
                column_name, False,
                f"Placeholder dates found: {n_placeholder} ({share:.2f}%)",
                {"placeholder_count": n_placeholder, "placeholder_percentage": share},
                "Replace placeholder dates with null or investigate data quality"
            )
        return self.create_result(
            column_name, True,
            f"No significant placeholder dates: {n_placeholder}",
            {"placeholder_count": n_placeholder}
        )
|
|
495
|
+
|
|
496
|
+
|
|
497
|
+
class RareCategoryCheck(QualityCheck):
    """CAT003: flags columns where rare categories make up too large a share."""

    def __init__(self):
        super().__init__("CAT003", "High Rare Category Count", Severity.MEDIUM)
        self.threshold_pct = 20.0

    def run(self, column_name: str, categorical_metrics: Any) -> Optional[QualityCheckResult]:
        """Fail when the rare-category percentage exceeds the threshold."""
        if categorical_metrics is None:
            return None

        share = categorical_metrics.rare_category_percentage
        n_rare = categorical_metrics.rare_category_count
        details = {"rare_category_count": n_rare, "rare_category_percentage": share}

        if share > self.threshold_pct:
            return self.create_result(
                column_name, False,
                f"High rare category percentage: {share}% ({n_rare} categories)",
                details,
                "Consider grouping rare categories or using target encoding"
            )
        return self.create_result(
            column_name, True,
            f"Acceptable rare categories: {share}% ({n_rare} categories)",
            details
        )
|
|
522
|
+
|
|
523
|
+
|
|
524
|
+
class UnknownCategoryCheck(QualityCheck):
    """CAT004: flags columns that contain unknown/missing value indicators."""

    def __init__(self):
        super().__init__("CAT004", "Unknown Categories Present", Severity.LOW)

    def run(self, column_name: str, categorical_metrics: Any) -> Optional[QualityCheckResult]:
        """Fail when the metrics report unknown-value indicators."""
        if categorical_metrics is None:
            return None

        flagged = categorical_metrics.contains_unknown

        if not flagged:
            return self.create_result(
                column_name, True,
                "No unknown value indicators found",
                {"contains_unknown": flagged}
            )
        return self.create_result(
            column_name, False,
            "Contains unknown/missing value indicators",
            {"contains_unknown": flagged},
            "Replace with proper nulls or create explicit category"
        )
|
|
547
|
+
|
|
548
|
+
|
|
549
|
+
class PIIDetectedCheck(QualityCheck):
    """TX001: personally identifiable information in a text column is critical."""

    def __init__(self):
        super().__init__("TX001", "PII Detected", Severity.CRITICAL)

    def run(self, column_name: str, text_metrics: Any) -> Optional[QualityCheckResult]:
        """Fail with the detected PII types when the metrics flag PII."""
        if text_metrics is None:
            return None

        if not text_metrics.pii_detected:
            return self.create_result(
                column_name, True,
                "No PII detected",
                {"pii_detected": False}
            )
        kinds = text_metrics.pii_types
        return self.create_result(
            column_name, False,
            f"PII detected: {', '.join(kinds)}",
            {"pii_types": kinds},
            "CRITICAL: Remove PII or mask sensitive data before processing. Consider data anonymization techniques."
        )
|
|
571
|
+
|
|
572
|
+
|
|
573
|
+
class EmptyTextCheck(QualityCheck):
    """TX002: flags text columns that are mostly empty."""

    def __init__(self):
        super().__init__("TX002", "Mostly Empty Text", Severity.HIGH)
        self.threshold = 50.0

    def run(self, column_name: str, text_metrics: Any) -> Optional[QualityCheckResult]:
        """Fail when the empty-text percentage exceeds the threshold."""
        if text_metrics is None:
            return None

        share = text_metrics.empty_percentage

        if share > self.threshold:
            return self.create_result(
                column_name, False,
                f"High percentage of empty text: {share}%",
                {"empty_percentage": share, "empty_count": text_metrics.empty_count},
                "Review data quality and consider imputation or feature removal"
            )
        return self.create_result(
            column_name, True,
            f"Acceptable empty text percentage: {share}%",
            {"empty_percentage": share}
        )
|
|
597
|
+
|
|
598
|
+
|
|
599
|
+
class ShortTextCheck(QualityCheck):
    """TX003: very short average text lengths suggest a categorical column."""

    def __init__(self):
        super().__init__("TX003", "Very Short Texts", Severity.MEDIUM)
        self.threshold = 10.0  # characters

    def run(self, column_name: str, text_metrics: Any) -> Optional[QualityCheckResult]:
        """Fail when the mean text length falls below the threshold."""
        if text_metrics is None:
            return None

        mean_len = text_metrics.length_mean

        if mean_len < self.threshold:
            return self.create_result(
                column_name, False,
                f"Very short average text length: {mean_len:.1f} characters",
                {"length_mean": mean_len},
                "May be better treated as categorical. Consider reclassifying column type."
            )
        return self.create_result(
            column_name, True,
            f"Acceptable text length: {mean_len:.1f} characters",
            {"length_mean": mean_len}
        )
|
|
623
|
+
|
|
624
|
+
|
|
625
|
+
class InfiniteValuesCheck(QualityCheck):
    """NC006: any +/-inf value makes a numeric column unusable as-is."""

    def __init__(self):
        super().__init__("NC006", "Infinite Values", Severity.CRITICAL)

    def run(self, column_name: str, numeric_metrics: Any) -> Optional[QualityCheckResult]:
        """Fail when the metrics report any infinite values."""
        if numeric_metrics is None:
            return None

        n_inf = numeric_metrics.inf_count
        if not n_inf > 0:
            return self.create_result(
                column_name, True,
                "No infinite values detected",
                {"inf_count": 0, "inf_percentage": 0.0}
            )
        return self.create_result(
            column_name, False,
            f"Column contains {n_inf} infinite values ({numeric_metrics.inf_percentage}%)",
            {"inf_count": n_inf, "inf_percentage": numeric_metrics.inf_percentage},
            "CRITICAL: Remove or replace infinite values before processing. Use imputation or capping strategies."
        )
|
|
646
|
+
|
|
647
|
+
|
|
648
|
+
class ExtremeOutliersCheck(QualityCheck):
    """NC001: flags columns whose outlier share exceeds 5%."""

    def __init__(self):
        super().__init__("NC001", "Extreme Outliers", Severity.HIGH)
        self.threshold = 5.0  # fail above this percentage

    def run(self, column_name: str, numeric_metrics: Any) -> Optional[QualityCheckResult]:
        """Fail when the outlier percentage exceeds the extreme threshold."""
        if numeric_metrics is None:
            return None

        pct = numeric_metrics.outlier_percentage

        if pct > self.threshold:
            details = {
                "outlier_percentage": pct,
                "outlier_count_iqr": numeric_metrics.outlier_count_iqr,
                "outlier_count_zscore": numeric_metrics.outlier_count_zscore,
            }
            return self.create_result(
                column_name, False,
                f"Extreme outlier percentage: {pct}% (> {self.threshold}%)",
                details,
                "Apply robust scaling, winsorization, or consider removing outliers."
            )
        return self.create_result(
            column_name, True,
            f"Acceptable outlier percentage: {pct}%",
            {"outlier_percentage": pct}
        )
|
|
673
|
+
|
|
674
|
+
|
|
675
|
+
class ModerateOutliersCheck(QualityCheck):
    """NC002: flags columns whose outlier share exceeds 1%."""

    def __init__(self):
        super().__init__("NC002", "Moderate Outliers", Severity.MEDIUM)
        self.threshold = 1.0  # fail above this percentage

    def run(self, column_name: str, numeric_metrics: Any) -> Optional[QualityCheckResult]:
        """Fail when the outlier percentage exceeds the moderate threshold."""
        if numeric_metrics is None:
            return None

        pct = numeric_metrics.outlier_percentage

        if pct > self.threshold:
            return self.create_result(
                column_name, False,
                f"Moderate outlier percentage: {pct}% (> {self.threshold}%)",
                {"outlier_percentage": pct, "outlier_count_iqr": numeric_metrics.outlier_count_iqr},
                "Consider investigating outliers and applying transformations."
            )
        return self.create_result(
            column_name, True,
            f"Low outlier percentage: {pct}%",
            {"outlier_percentage": pct}
        )
|
|
699
|
+
|
|
700
|
+
|
|
701
|
+
class HighSkewnessCheck(QualityCheck):
    """NC003: flags distributions with |skewness| above 2."""

    def __init__(self):
        super().__init__("NC003", "High Skewness", Severity.MEDIUM)
        self.threshold = 2.0  # magnitude of skewness

    def run(self, column_name: str, numeric_metrics: Any) -> Optional[QualityCheckResult]:
        """Fail when the absolute skewness exceeds the threshold."""
        if numeric_metrics is None or numeric_metrics.skewness is None:
            return None

        signed = numeric_metrics.skewness
        magnitude = abs(signed)

        if magnitude > self.threshold:
            return self.create_result(
                column_name, False,
                f"High skewness detected: {signed:.2f} (|skew| > {self.threshold})",
                {"skewness": signed, "abs_skewness": magnitude},
                "Apply log, sqrt, or Box-Cox transformation to reduce skewness."
            )
        return self.create_result(
            column_name, True,
            f"Acceptable skewness: {signed:.2f}",
            {"skewness": signed}
        )
|
|
725
|
+
|
|
726
|
+
|
|
727
|
+
class NumericZeroInflationCheck(QualityCheck):
    """NC004: flags numeric columns that are more than half zeros."""

    def __init__(self):
        super().__init__("NC004", "Zero Inflation", Severity.MEDIUM)
        self.threshold = 50.0  # fail above this percentage of zeros

    def run(self, column_name: str, numeric_metrics: Any) -> Optional[QualityCheckResult]:
        """Fail when the zero percentage exceeds the threshold."""
        if numeric_metrics is None:
            return None

        share = numeric_metrics.zero_percentage

        if share > self.threshold:
            return self.create_result(
                column_name, False,
                f"Zero-inflated: {share}% zeros (> {self.threshold}%)",
                {"zero_percentage": share, "zero_count": numeric_metrics.zero_count},
                "Consider zero-inflated models, indicator variable, or separate handling of zeros."
            )
        return self.create_result(
            column_name, True,
            f"Acceptable zero percentage: {share}%",
            {"zero_percentage": share}
        )
|
|
751
|
+
|
|
752
|
+
|
|
753
|
+
class UnexpectedNegativesCheck(QualityCheck):
    """NC005: flags negative values in columns configured as non-negative."""

    def __init__(self, allow_negatives: bool = True):
        super().__init__("NC005", "Unexpected Negative Values", Severity.HIGH)
        # When True (the default) this check is a no-op.
        self.allow_negatives = allow_negatives

    def run(self, column_name: str, numeric_metrics: Any) -> Optional[QualityCheckResult]:
        """Skip when negatives are allowed; otherwise fail on any negative value."""
        if numeric_metrics is None or self.allow_negatives:
            return None

        n_negative = numeric_metrics.negative_count

        if not n_negative > 0:
            return self.create_result(
                column_name, True,
                "No negative values found",
                {"negative_count": 0}
            )
        return self.create_result(
            column_name, False,
            f"Column contains {n_negative} negative values ({numeric_metrics.negative_percentage}%), but negatives not expected",
            {"negative_count": n_negative, "negative_percentage": numeric_metrics.negative_percentage},
            "Investigate negative values. May indicate data errors or need for transformation."
        )
|
|
777
|
+
|
|
778
|
+
|
|
779
|
+
class ConstantValueCheck(QualityCheck):
    """NC007: a column with zero standard deviation carries no signal."""

    def __init__(self):
        super().__init__("NC007", "Constant Value", Severity.HIGH)

    def run(self, column_name: str, numeric_metrics: Any) -> Optional[QualityCheckResult]:
        """Fail when the reported standard deviation is exactly zero."""
        if numeric_metrics is None:
            return None

        spread = numeric_metrics.std
        if spread == 0:
            return self.create_result(
                column_name, False,
                "Column has constant value (std = 0)",
                {"std": 0, "mean": numeric_metrics.mean},
                "Remove constant column - provides no information for modeling."
            )
        return self.create_result(
            column_name, True,
            f"Column has variance (std = {spread:.4f})",
            {"std": spread}
        )
|
|
800
|
+
|
|
801
|
+
|
|
802
|
+
class SuspiciousPrecisionCheck(QualityCheck):
    """NC008: flags numeric columns whose sampled values are all whole numbers,
    which may indicate upstream rounding or precision loss.
    """

    def __init__(self):
        super().__init__("NC008", "Suspicious Precision", Severity.LOW)

    def run(self, column_name: str, series: pd.Series) -> Optional[QualityCheckResult]:
        """Inspect up to the first 100 non-null values for whole-number-only data.

        This check needs the raw series (not precomputed metrics) to look at
        individual values. Fixes over the previous version:
        - ``Series.head(100)`` replaces ``series[:n]``, whose slice semantics
          are ambiguous (label- vs position-based) for integer indexes;
        - ``float.is_integer`` replaces ``v == int(v)``, which raised
          ``OverflowError`` when ``inf`` survived ``dropna``; inf now simply
          counts as "not whole".
        """
        if series is None or len(series) == 0:
            return None

        clean_series = series.dropna()
        if len(clean_series) == 0:
            return None

        # Sample at most the first 100 non-null values by position.
        sample = clean_series.head(100)
        all_whole = all(
            isinstance(v, (int, float)) and (isinstance(v, int) or v.is_integer())
            for v in sample
        )

        # Require a minimum sample size (> 10) before flagging, to avoid
        # false positives on tiny columns.
        if all_whole and len(clean_series) > 10:
            return self.create_result(
                column_name, False,
                "All sampled values are whole numbers - may indicate precision loss or rounding",
                {"precision_issue": "all_whole_numbers"},
                "Verify source data precision and check for unintended rounding."
            )
        return self.create_result(
            column_name, True,
            "Precision appears normal",
            {}
        )
|
|
832
|
+
|
|
833
|
+
|
|
834
|
+
class HighOutliersCheck(QualityCheck):
    """FQ005: flags columns where more than half the values are outliers."""

    def __init__(self):
        super().__init__("FQ005", "Column has >50% outliers", Severity.HIGH)
        self.threshold = 50.0

    def run(self, column_name: str, numeric_metrics: Any) -> Optional[QualityCheckResult]:
        """Fail when the outlier percentage exceeds 50%."""
        if numeric_metrics is None:
            return None

        pct = numeric_metrics.outlier_percentage

        if pct > self.threshold:
            return self.create_result(
                column_name, False,
                f"Extreme outlier percentage: {pct}% (> {self.threshold}%)",
                {"outlier_percentage": pct, "outlier_count_iqr": numeric_metrics.outlier_count_iqr},
                "HIGH: Column may be unreliable for modeling. Consider robust transformations or removal."
            )
        return self.create_result(
            column_name, True,
            f"Acceptable outlier percentage: {pct}%",
            {"outlier_percentage": pct}
        )
|
|
858
|
+
|
|
859
|
+
|
|
860
|
+
class AllValuesOutliersCheck(QualityCheck):
    """FQ011: a column where every single value is an outlier is almost certainly broken."""

    def __init__(self):
        super().__init__("FQ011", "All values are outliers", Severity.CRITICAL)

    def run(self, column_name: str, numeric_metrics: Any) -> Optional[QualityCheckResult]:
        """Fail only when the outlier percentage is exactly 100%."""
        if numeric_metrics is None:
            return None

        pct = numeric_metrics.outlier_percentage

        if pct != 100.0:
            return self.create_result(
                column_name, True,
                f"Not all values are outliers: {pct}%",
                {"outlier_percentage": pct}
            )
        return self.create_result(
            column_name, False,
            "CRITICAL: All values are outliers (100%)",
            {"outlier_percentage": 100.0},
            "CRITICAL: Column may have data quality issues. Investigate and consider removal."
        )
|
|
883
|
+
|
|
884
|
+
|
|
885
|
+
class UnknownColumnTypeCheck(QualityCheck):
    """FQ008: flags columns whose type could not be inferred."""

    def __init__(self):
        super().__init__("FQ008", "Unknown column type", Severity.MEDIUM)

    def run(self, column_name: str, column_type: ColumnType) -> Optional[QualityCheckResult]:
        """Fail when the inferred column type is UNKNOWN."""
        if column_type != ColumnType.UNKNOWN:
            return self.create_result(
                column_name, True,
                f"Column type determined: {column_type.value}",
                {"column_type": column_type.value}
            )
        return self.create_result(
            column_name, False,
            "Column type could not be determined",
            {"column_type": "UNKNOWN"},
            "Manually specify column type or investigate data format."
        )
|
|
903
|
+
|
|
904
|
+
|
|
905
|
+
class VeryHighCardinalityNominalCheck(QualityCheck):
    """FQ009: flags nominal columns with more than 1000 distinct categories."""

    def __init__(self):
        super().__init__("FQ009", "Very high cardinality nominal", Severity.MEDIUM)
        self.threshold = 1000

    def run(self, column_name: str, categorical_metrics: Any) -> Optional[QualityCheckResult]:
        """Fail when cardinality exceeds the threshold."""
        if categorical_metrics is None:
            return None

        n_unique = categorical_metrics.cardinality

        if n_unique > self.threshold:
            return self.create_result(
                column_name, False,
                f"Very high cardinality: {n_unique} unique categories (> {self.threshold})",
                {"cardinality": n_unique},
                "Consider treating as high cardinality or using hashing/embedding encoding."
            )
        return self.create_result(
            column_name, True,
            f"Acceptable cardinality: {n_unique} unique categories",
            {"cardinality": n_unique}
        )
|
|
929
|
+
|
|
930
|
+
|
|
931
|
+
class UnrealisticDateRangeCheck(QualityCheck):
    """FQ012: flags datetime columns spanning more than 100 years."""

    def __init__(self):
        super().__init__("FQ012", "Date range unrealistic", Severity.HIGH)
        self.threshold_years = 100

    def run(self, column_name: str, datetime_metrics: Any) -> Optional[QualityCheckResult]:
        """Fail when the min-to-max span exceeds the year threshold."""
        if datetime_metrics is None:
            return None

        span_days = datetime_metrics.date_range_days
        span_years = span_days / 365.25  # average year length, incl. leap years

        if span_years > self.threshold_years:
            details = {
                "date_range_days": span_days,
                "date_range_years": round(span_years, 1),
                "min_date": datetime_metrics.min_date,
                "max_date": datetime_metrics.max_date,
            }
            return self.create_result(
                column_name, False,
                f"Unrealistic date range: {span_years:.1f} years (> {self.threshold_years} years)",
                details,
                "HIGH: Date range spans > 100 years. Review for data quality issues."
            )
        return self.create_result(
            column_name, True,
            f"Acceptable date range: {span_years:.1f} years",
            {"date_range_days": span_days, "date_range_years": round(span_years, 1)}
        )
|
|
957
|
+
|
|
958
|
+
|
|
959
|
+
class HighUniquenessTextCheck(QualityCheck):
    """TX004: a text column that is nearly all-unique is likely an identifier."""

    def __init__(self):
        super().__init__("TX004", "High Uniqueness Text", Severity.MEDIUM)
        self.threshold = 0.95  # fraction, not percentage

    def run(self, column_name: str, universal_metrics: Any) -> Optional[QualityCheckResult]:
        """Fail when the distinct fraction exceeds the threshold."""
        if universal_metrics is None:
            return None

        ratio = universal_metrics.distinct_percentage / 100.0

        if ratio > self.threshold:
            return self.create_result(
                column_name, False,
                f"Text column has very high uniqueness: {universal_metrics.distinct_percentage}% unique (> {self.threshold * 100}%)",
                {"distinct_percentage": universal_metrics.distinct_percentage, "distinct_count": universal_metrics.distinct_count},
                "Text column may actually be an identifier. Consider reclassifying as IDENTIFIER type."
            )
        return self.create_result(
            column_name, True,
            f"Acceptable text uniqueness: {universal_metrics.distinct_percentage}%",
            {"distinct_percentage": universal_metrics.distinct_percentage}
        )
|
|
983
|
+
|
|
984
|
+
|
|
985
|
+
class BinaryNotBinaryCheck(QualityCheck):
    """BN001: a column typed as binary must have exactly two distinct values."""

    def __init__(self):
        super().__init__("BN001", "Not Binary", Severity.CRITICAL)

    def run(self, column_name: str, universal_metrics: Any) -> Optional[QualityCheckResult]:
        """Fail unless the distinct-value count is exactly 2."""
        if universal_metrics is None:
            return None

        n_distinct = universal_metrics.distinct_count

        if n_distinct == 2:
            return self.create_result(
                column_name, True,
                "Column has exactly 2 distinct values",
                {"distinct_count": 2}
            )
        return self.create_result(
            column_name, False,
            f"Column marked as binary but has {n_distinct} distinct values (expected 2)",
            {"distinct_count": n_distinct},
            "CRITICAL: Binary columns must have exactly 2 distinct values. Review column type or data."
        )
|
|
1008
|
+
|
|
1009
|
+
|
|
1010
|
+
class BinarySevereImbalanceCheck(QualityCheck):
    """BN002: flag binary columns whose true-rate is outside [1%, 99%]."""

    def __init__(self):
        super().__init__("BN002", "Binary Severe Imbalance", Severity.MEDIUM)
        # Acceptable band for the true-value percentage.
        self.threshold_low = 1.0
        self.threshold_high = 99.0

    def run(self, column_name: str, binary_metrics: Any) -> Optional[QualityCheckResult]:
        """Check the true-value percentage against the acceptable band."""
        if binary_metrics is None:
            return None

        pct_true = binary_metrics.true_percentage

        if self.threshold_low <= pct_true <= self.threshold_high:
            return self.create_result(
                column_name, True,
                f"Acceptable binary balance: {pct_true}% true values",
                {"true_percentage": pct_true}
            )

        return self.create_result(
            column_name, False,
            f"Severe binary imbalance: {pct_true}% true values (< {self.threshold_low}% or > {self.threshold_high}%)",
            {"true_percentage": pct_true, "balance_ratio": binary_metrics.balance_ratio},
            "Consider class balancing techniques or check if column should be binary."
        )
class BinaryAllSameValueCheck(QualityCheck):
    """BN003: a binary column collapsed to a single value carries no signal."""

    def __init__(self):
        super().__init__("BN003", "Binary All Same Value", Severity.HIGH)

    def run(self, column_name: str, universal_metrics: Any) -> Optional[QualityCheckResult]:
        """Fail when only one distinct value remains; None without metrics."""
        if universal_metrics is None:
            return None

        n_distinct = universal_metrics.distinct_count

        if n_distinct != 1:
            return self.create_result(
                column_name, True,
                f"Binary column has {n_distinct} distinct values",
                {"distinct_count": n_distinct}
            )

        return self.create_result(
            column_name, False,
            f"Binary column has only 1 distinct value: {universal_metrics.most_common_value}",
            {"distinct_count": 1, "value": universal_metrics.most_common_value},
            "Binary column provides no information. Consider removing."
        )
class BinaryUnexpectedValuesCheck(QualityCheck):
    """BN004: binary columns should only contain conventional truthy/falsy tokens."""

    # Recognized encodings of a boolean. Note 0 == False and 1 == True hash
    # equal, so the set dedupes those automatically.
    _EXPECTED = frozenset({
        0, 1, 0.0, 1.0, True, False, "0", "1", "yes", "Yes", "YES", "no", "No", "NO",
        "true", "True", "TRUE", "false", "False", "FALSE", "y", "Y", "n", "N",
    })

    def __init__(self):
        super().__init__("BN004", "Binary Unexpected Values", Severity.HIGH)

    def run(self, column_name: str, binary_metrics: Any) -> Optional[QualityCheckResult]:
        """Report any observed value outside the conventional boolean encodings."""
        if binary_metrics is None:
            return None

        values_found = binary_metrics.values_found
        unexpected = [v for v in values_found if v not in self._EXPECTED]

        if not unexpected:
            return self.create_result(
                column_name, True,
                "Binary column contains only expected values",
                {"values_found": values_found}
            )

        return self.create_result(
            column_name, False,
            f"Binary column contains unexpected values: {unexpected[:5]}",
            {"unexpected_values": unexpected[:5], "values_found": values_found},
            "Standardize binary values to 0/1 or True/False format."
        )
class DatetimeFormatInconsistentCheck(QualityCheck):
    """DT003: most values should match the single detected datetime format."""

    def __init__(self):
        super().__init__("DT003", "Datetime Format Inconsistent", Severity.MEDIUM)
        # Minimum percentage of values that must match the detected format.
        self.threshold = 95.0

    def run(self, column_name: str, datetime_metrics: Any) -> Optional[QualityCheckResult]:
        """Compare format consistency to the threshold; None without metrics."""
        if datetime_metrics is None or datetime_metrics.format_consistency is None:
            return None

        consistency = datetime_metrics.format_consistency

        if consistency >= self.threshold:
            return self.create_result(
                column_name, True,
                f"Datetime format consistent: {consistency}% match format '{datetime_metrics.format_detected}'",
                {"format_consistency": consistency, "format_detected": datetime_metrics.format_detected}
            )

        return self.create_result(
            column_name, False,
            f"Datetime format inconsistent: {consistency}% match format '{datetime_metrics.format_detected}' (< {self.threshold}%)",
            {"format_consistency": consistency, "format_detected": datetime_metrics.format_detected},
            "Standardize datetime format during data loading or preprocessing."
        )
class DatetimeMixedTimezonesCheck(QualityCheck):
    """DT004: all timestamps in a column should share one timezone."""

    def __init__(self):
        super().__init__("DT004", "Mixed Timezones", Severity.MEDIUM)

    def run(self, column_name: str, datetime_metrics: Any) -> Optional[QualityCheckResult]:
        """Pass iff the precomputed timezone_consistent flag is truthy."""
        if datetime_metrics is None:
            return None

        if datetime_metrics.timezone_consistent:
            return self.create_result(
                column_name, True,
                "Timezones are consistent",
                {"timezone_consistent": True}
            )

        return self.create_result(
            column_name, False,
            "Mixed timezones detected in datetime column",
            {"timezone_consistent": False},
            "Convert all datetimes to a single timezone (e.g., UTC) for consistency."
        )
class DatetimeInvalidDatesCheck(QualityCheck):
    """DT005: detect non-null values that cannot be parsed as dates."""

    def __init__(self):
        super().__init__("DT005", "Invalid Dates", Severity.CRITICAL)

    def run(self, column_name: str, series: pd.Series, universal_metrics: Any) -> Optional[QualityCheckResult]:
        """Count unparseable date values in the raw series.

        ``universal_metrics`` is accepted for signature consistency with the
        other checks but is not used here. Already-datetime64 columns are
        trusted (pandas validated them at load time).
        """
        if series is None:
            return None

        clean_series = series.dropna()
        invalid_count = 0

        if not is_datetime64_any_dtype(clean_series) and len(clean_series) > 0:
            # Vectorized parse: errors='coerce' maps unparseable values to NaT.
            # Nulls were already dropped, so every NaT marks an invalid date.
            # This replaces a per-value pd.to_datetime() loop, which paid the
            # full pandas call overhead (and exception cost) for every row.
            parsed = pd.to_datetime(clean_series, format='mixed', errors='coerce')
            invalid_count = int(parsed.isna().sum())

        if invalid_count > 0:
            # Percentage is relative to the full series length (nulls included),
            # matching how the other checks report percentages.
            invalid_pct = (invalid_count / len(series)) * 100 if len(series) > 0 else 0.0
            return self.create_result(
                column_name, False,
                f"Column contains {invalid_count} invalid dates ({invalid_pct:.2f}%)",
                {"invalid_count": invalid_count, "invalid_percentage": invalid_pct},
                "CRITICAL: Fix or remove invalid dates before processing."
            )
        else:
            return self.create_result(
                column_name, True,
                "No invalid dates detected",
                {"invalid_count": 0}
            )
class DatetimeUnrealisticRangeCheck(QualityCheck):
    """DT006: a span of more than ~50 years usually signals placeholder dates."""

    def __init__(self):
        super().__init__("DT006", "Unrealistic Date Range", Severity.MEDIUM)
        # Maximum plausible min-to-max span, in years.
        self.threshold_years = 50

    def run(self, column_name: str, datetime_metrics: Any) -> Optional[QualityCheckResult]:
        """Convert the day span to years and compare to the threshold."""
        if datetime_metrics is None:
            return None

        span_days = datetime_metrics.date_range_days
        # 365.25 accounts for leap years.
        span_years = span_days / 365.25

        if span_years <= self.threshold_years:
            return self.create_result(
                column_name, True,
                f"Acceptable date range: {span_years:.1f} years",
                {"date_range_days": span_days, "date_range_years": round(span_years, 1)}
            )

        return self.create_result(
            column_name, False,
            f"Unrealistic date range: {span_years:.1f} years (> {self.threshold_years} years)",
            {"date_range_days": span_days, "date_range_years": round(span_years, 1),
             "min_date": datetime_metrics.min_date, "max_date": datetime_metrics.max_date},
            "Review min/max dates for data quality issues or placeholder values."
        )
class VeryHighCardinalityCheck(QualityCheck):
    """CN001: categorical columns with more than 100 levels need special handling."""

    def __init__(self):
        super().__init__("CN001", "Very High Cardinality", Severity.HIGH)
        # Maximum acceptable number of unique categories.
        self.threshold = 100

    def run(self, column_name: str, categorical_metrics: Any) -> Optional[QualityCheckResult]:
        """Compare the category count to the threshold; None without metrics."""
        if categorical_metrics is None:
            return None

        n_categories = categorical_metrics.cardinality

        if n_categories <= self.threshold:
            return self.create_result(
                column_name, True,
                f"Acceptable cardinality: {n_categories} unique categories",
                {"cardinality": n_categories}
            )

        return self.create_result(
            column_name, False,
            f"Very high cardinality: {n_categories} unique categories (> {self.threshold})",
            {"cardinality": n_categories},
            f"Consider using {categorical_metrics.encoding_recommendation} encoding or treating as high cardinality feature."
        )
class HighCardinalityCategoricalCheck(QualityCheck):
    """CN002: softer variant of CN001 — warn above 50 unique categories."""

    def __init__(self):
        super().__init__("CN002", "High Cardinality Categorical", Severity.MEDIUM)
        # Maximum acceptable number of unique categories for this (medium) tier.
        self.threshold = 50

    def run(self, column_name: str, categorical_metrics: Any) -> Optional[QualityCheckResult]:
        """Compare the category count to the threshold; None without metrics."""
        if categorical_metrics is None:
            return None

        n_categories = categorical_metrics.cardinality

        if n_categories <= self.threshold:
            return self.create_result(
                column_name, True,
                f"Acceptable cardinality: {n_categories} unique categories",
                {"cardinality": n_categories}
            )

        return self.create_result(
            column_name, False,
            f"High cardinality: {n_categories} unique categories (> {self.threshold})",
            {"cardinality": n_categories},
            f"Consider using {categorical_metrics.encoding_recommendation} encoding."
        )
class ManyRareCategoriesCheck(QualityCheck):
    """CN003: warn when many categories fall below 1% frequency."""

    def __init__(self):
        super().__init__("CN003", "Many Rare Categories", Severity.MEDIUM)
        # Maximum acceptable count of sub-1%-frequency categories.
        self.threshold = 10

    def run(self, column_name: str, categorical_metrics: Any) -> Optional[QualityCheckResult]:
        """Compare the rare-category count to the threshold."""
        if categorical_metrics is None:
            return None

        n_rare = categorical_metrics.rare_category_count

        if n_rare <= self.threshold:
            return self.create_result(
                column_name, True,
                f"Acceptable rare categories: {n_rare} categories",
                {"rare_category_count": n_rare}
            )

        return self.create_result(
            column_name, False,
            f"Many rare categories: {n_rare} categories with < 1% frequency (> {self.threshold})",
            {"rare_category_count": n_rare, "rare_categories": categorical_metrics.rare_categories[:5]},
            "Consider grouping rare categories into 'Other' or using frequency encoding."
        )
class SignificantRareVolumeCheck(QualityCheck):
    """CN004: warn when a large share of rows sit in rare categories."""

    def __init__(self):
        super().__init__("CN004", "Significant Rare Category Volume", Severity.HIGH)
        # Maximum acceptable share (percent of rows) in rare categories.
        self.threshold = 20.0

    def run(self, column_name: str, categorical_metrics: Any) -> Optional[QualityCheckResult]:
        """Compare the rare-row percentage to the threshold."""
        if categorical_metrics is None:
            return None

        pct_rare = categorical_metrics.rare_category_percentage

        if pct_rare <= self.threshold:
            return self.create_result(
                column_name, True,
                f"Acceptable rare category volume: {pct_rare}%",
                {"rare_category_percentage": pct_rare}
            )

        return self.create_result(
            column_name, False,
            f"High rare category volume: {pct_rare}% of rows in rare categories (> {self.threshold}%)",
            {"rare_category_percentage": pct_rare, "rare_category_count": categorical_metrics.rare_category_count},
            "Group rare categories or use encoding that handles high cardinality (target encoding, embedding)."
        )
class CaseInconsistencyCheck(QualityCheck):
    """CN005: detect category labels that differ only by letter case."""

    def __init__(self):
        super().__init__("CN005", "Case Inconsistency", Severity.LOW)

    def run(self, column_name: str, categorical_metrics: Any) -> Optional[QualityCheckResult]:
        """Fail when the precomputed case-variation list is non-empty."""
        if categorical_metrics is None:
            return None

        variations = categorical_metrics.case_variations

        if not variations:
            return self.create_result(
                column_name, True,
                "No case inconsistency detected",
                {"case_variations": []}
            )

        return self.create_result(
            column_name, False,
            f"Case inconsistency detected: {len(variations)} variations found",
            {"case_variations": variations},
            "Standardize case (e.g., lowercase all values) during preprocessing."
        )
class WhitespaceIssuesCheck(QualityCheck):
    """CN006: detect category values with leading/trailing whitespace."""

    def __init__(self):
        super().__init__("CN006", "Whitespace Issues", Severity.LOW)

    def run(self, column_name: str, categorical_metrics: Any) -> Optional[QualityCheckResult]:
        """Fail when the precomputed whitespace-issue list is non-empty."""
        if categorical_metrics is None:
            return None

        issues = categorical_metrics.whitespace_issues

        if not issues:
            return self.create_result(
                column_name, True,
                "No whitespace issues detected",
                {"whitespace_issues": []}
            )

        return self.create_result(
            column_name, False,
            f"Whitespace issues detected: {len(issues)} values with leading/trailing spaces",
            {"whitespace_issues": issues},
            "Strip leading/trailing whitespace during preprocessing."
        )
class SingleCategoryCheck(QualityCheck):
    """CN007: a categorical column with one level is a constant."""

    def __init__(self):
        super().__init__("CN007", "Single Category Only", Severity.HIGH)

    def run(self, column_name: str, categorical_metrics: Any) -> Optional[QualityCheckResult]:
        """Fail when cardinality collapses to exactly one category."""
        if categorical_metrics is None:
            return None

        n_categories = categorical_metrics.cardinality

        if n_categories != 1:
            return self.create_result(
                column_name, True,
                f"Column has {n_categories} categories",
                {"cardinality": n_categories}
            )

        # cardinality == 1 guarantees top_categories has at least one entry.
        only_category = categorical_metrics.top_categories[0][0]
        return self.create_result(
            column_name, False,
            f"Column has only 1 category: {only_category}",
            {"cardinality": 1, "category": only_category},
            "Remove constant categorical column - provides no information for modeling."
        )
class PossibleTyposCheck(QualityCheck):
    """CN008: flag near-duplicate category labels that look like typos."""

    def __init__(self):
        super().__init__("CN008", "Possible Typos Detected", Severity.MEDIUM)

    def run(self, column_name: str, categorical_metrics: Any) -> Optional[QualityCheckResult]:
        """Pairwise-compare category labels and report highly similar pairs.

        Only string values longer than 3 characters are compared, and only the
        first 100 unique values, to bound the O(k^2) comparison. Collection
        stops after 5 suspicious pairs.
        """
        if categorical_metrics is None:
            return None

        # difflib is stdlib and always importable; local import keeps it off
        # the module's cold-start path.
        from difflib import SequenceMatcher

        # Filter up front: the original per-pair len()/.lower() calls raised
        # TypeError/AttributeError on non-string category values (e.g. NaN,
        # ints), so guard with isinstance.
        candidates = [
            v for v in list(categorical_metrics.value_counts.keys())[:100]
            if isinstance(v, str) and len(v) > 3
        ]
        similar_pairs = []

        for i, val1 in enumerate(candidates):
            for val2 in candidates[i + 1:]:
                # Case-insensitive similarity; (0.8, 1.0) excludes exact
                # case-insensitive duplicates (handled by CN005) and keeps
                # only "almost the same" spellings.
                ratio = SequenceMatcher(None, val1.lower(), val2.lower()).ratio()
                if 0.8 < ratio < 1.0:
                    similar_pairs.append(f"{val1} ~ {val2}")
                    if len(similar_pairs) >= 5:
                        break
            if len(similar_pairs) >= 5:
                break

        if len(similar_pairs) > 0:
            return self.create_result(
                column_name, False,
                f"Possible typos detected: {len(similar_pairs)} similar value pairs found",
                {"similar_pairs": similar_pairs},
                "Review similar values for potential typos and standardize."
            )
        else:
            return self.create_result(
                column_name, True,
                "No obvious typos detected",
                {"similar_pairs": []}
            )
class IdentifierDuplicatesCheck(QualityCheck):
    """ID001: identifier columns must contain no duplicate values."""

    def __init__(self):
        super().__init__("ID001", "Identifier Has Duplicates", Severity.CRITICAL)

    def run(self, column_name: str, identifier_metrics: Any) -> Optional[QualityCheckResult]:
        """Fail when any duplicates were counted; None without metrics."""
        if identifier_metrics is None:
            return None

        n_duplicates = identifier_metrics.duplicate_count

        if n_duplicates == 0:
            return self.create_result(
                column_name, True,
                "Identifier column is unique",
                {"duplicate_count": 0}
            )

        return self.create_result(
            column_name, False,
            f"Identifier column has {n_duplicates} duplicate values",
            {"duplicate_count": n_duplicates,
             "duplicate_values": identifier_metrics.duplicate_values[:5]},
            "CRITICAL: Identifiers must be unique. Investigate and resolve duplicates or reconsider column type."
        )
class IdentifierFormatCheck(QualityCheck):
    """ID002: most identifiers should match the single detected pattern."""

    def __init__(self):
        super().__init__("ID002", "Identifier Format Inconsistent", Severity.MEDIUM)
        # Minimum percentage of values that must match the detected pattern.
        self.threshold = 95.0

    def run(self, column_name: str, identifier_metrics: Any) -> Optional[QualityCheckResult]:
        """Compare pattern consistency to the threshold; None without metrics."""
        if identifier_metrics is None or identifier_metrics.format_consistency is None:
            return None

        consistency = identifier_metrics.format_consistency

        if consistency >= self.threshold:
            return self.create_result(
                column_name, True,
                f"Identifier format consistent: {consistency}% match pattern '{identifier_metrics.format_pattern}'",
                {"format_consistency": consistency, "format_pattern": identifier_metrics.format_pattern}
            )

        return self.create_result(
            column_name, False,
            f"Identifier format inconsistent: {consistency}% match pattern '{identifier_metrics.format_pattern}' (< {self.threshold}%)",
            {"format_consistency": consistency, "format_pattern": identifier_metrics.format_pattern},
            "Standardize identifier format or investigate data quality issues."
        )
class IdentifierNullCheck(QualityCheck):
    """ID003: identifier columns should never contain nulls."""

    def __init__(self):
        super().__init__("ID003", "Identifier Contains Nulls", Severity.HIGH)

    def run(self, column_name: str, universal_metrics: Any) -> Optional[QualityCheckResult]:
        """Fail when any null values were counted; None without metrics."""
        if universal_metrics is None:
            return None

        n_nulls = universal_metrics.null_count

        if n_nulls == 0:
            return self.create_result(
                column_name, True,
                "Identifier column has no null values",
                {"null_count": 0}
            )

        return self.create_result(
            column_name, False,
            f"Identifier column contains {n_nulls} null values ({universal_metrics.null_percentage}%)",
            {"null_count": n_nulls, "null_percentage": universal_metrics.null_percentage},
            "Identifiers should not contain nulls. Investigate missing identifiers or data quality issues."
        )
class QualityCheckRegistry:
    """Central lookup of all quality checks, keyed by their check ID.

    IDs are grouped by prefix: FQ (feature quality), CAT/CN (categorical),
    NUM/NC (numeric), LEAK (leakage), DT (datetime), TX (text), TG (target),
    ID (identifier), BN (binary).
    """

    # check_id -> check class (instantiated on demand, not shared).
    _checks = {
        "FQ001": MissingValueCheck,
        "FQ003": ConstantFeatureCheck,
        "FQ005": HighOutliersCheck,
        "FQ008": UnknownColumnTypeCheck,
        "FQ009": VeryHighCardinalityNominalCheck,
        "FQ011": AllValuesOutliersCheck,
        "FQ012": UnrealisticDateRangeCheck,
        "CAT001": HighCardinalityCheck,
        "NUM001": LowCardinalityCheck,
        "CAT002": ImbalancedTargetCheck,
        "NUM002": SkewnessCheck,
        "NUM003": OutlierCheck,
        "NUM004": ZeroInflationCheck,
        "LEAK001": IdentifierLeakageCheck,
        "DT001": DatetimeFutureLeakageCheck,
        "DT002": PlaceholderDateCheck,
        "CAT003": RareCategoryCheck,
        "CAT004": UnknownCategoryCheck,
        "TX001": PIIDetectedCheck,
        "TX002": EmptyTextCheck,
        "TX003": ShortTextCheck,
        "TX004": HighUniquenessTextCheck,
        "NC001": ExtremeOutliersCheck,
        "NC002": ModerateOutliersCheck,
        "NC003": HighSkewnessCheck,
        "NC004": NumericZeroInflationCheck,
        "NC005": UnexpectedNegativesCheck,
        "NC006": InfiniteValuesCheck,
        "NC007": ConstantValueCheck,
        "NC008": SuspiciousPrecisionCheck,
        "TG001": TargetNullCheck,
        "TG002": TargetSevereImbalanceCheck,
        "TG003": TargetModerateImbalanceCheck,
        "TG004": TargetUnexpectedClassesCheck,
        "TG005": SingleClassTargetCheck,
        "ID001": IdentifierDuplicatesCheck,
        "ID002": IdentifierFormatCheck,
        "ID003": IdentifierNullCheck,
        "CN001": VeryHighCardinalityCheck,
        "CN002": HighCardinalityCategoricalCheck,
        "CN003": ManyRareCategoriesCheck,
        "CN004": SignificantRareVolumeCheck,
        "CN005": CaseInconsistencyCheck,
        "CN006": WhitespaceIssuesCheck,
        "CN007": SingleCategoryCheck,
        "CN008": PossibleTyposCheck,
        "DT003": DatetimeFormatInconsistentCheck,
        "DT004": DatetimeMixedTimezonesCheck,
        "DT005": DatetimeInvalidDatesCheck,
        "DT006": DatetimeUnrealisticRangeCheck,
        "BN001": BinaryNotBinaryCheck,
        "BN002": BinarySevereImbalanceCheck,
        "BN003": BinaryAllSameValueCheck,
        "BN004": BinaryUnexpectedValuesCheck,
    }

    @classmethod
    def get_check(cls, check_id: str):
        """Return a fresh instance of the check with this ID, or None if unknown."""
        check_class = cls._checks.get(check_id)
        return check_class() if check_class else None

    @classmethod
    def get_all_checks(cls):
        """Return fresh instances of every registered check."""
        return [check_class() for check_class in cls._checks.values()]

    @classmethod
    def get_checks_for_column_type(cls, column_type: ColumnType):
        """Return the instantiated checks applicable to a given column type.

        Universal checks (missing values, constant feature) always apply;
        type-specific checks are appended per column type. Checks that need
        extra configuration or raw-series access are deliberately omitted
        (see inline notes).
        """
        checks = []

        # Universal checks, applied to every column regardless of type.
        checks.append(MissingValueCheck())
        checks.append(ConstantFeatureCheck())

        if column_type == ColumnType.IDENTIFIER:
            checks.append(IdentifierLeakageCheck())
            checks.append(IdentifierDuplicatesCheck())
            checks.append(IdentifierFormatCheck())
            checks.append(IdentifierNullCheck())

        elif column_type == ColumnType.TARGET:
            checks.append(TargetNullCheck())
            checks.append(SingleClassTargetCheck())
            checks.append(TargetSevereImbalanceCheck())
            checks.append(TargetModerateImbalanceCheck())
            checks.append(ImbalancedTargetCheck())
            # Note: TG004 (TargetUnexpectedClassesCheck) requires expected_classes configuration

        elif column_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]:
            checks.append(LowCardinalityCheck())
            checks.append(ExtremeOutliersCheck())  # NC001
            checks.append(ModerateOutliersCheck())  # NC002
            checks.append(HighSkewnessCheck())  # NC003
            checks.append(NumericZeroInflationCheck())  # NC004
            # NC005 (UnexpectedNegativesCheck) requires configuration
            checks.append(InfiniteValuesCheck())  # NC006
            checks.append(ConstantValueCheck())  # NC007
            # NC008 (SuspiciousPrecisionCheck) requires series access
            checks.append(SkewnessCheck())  # NUM002
            checks.append(OutlierCheck())  # NUM003
            checks.append(ZeroInflationCheck())  # NUM004

        elif column_type in [ColumnType.CATEGORICAL_NOMINAL, ColumnType.CATEGORICAL_ORDINAL, ColumnType.CATEGORICAL_CYCLICAL]:
            checks.append(HighCardinalityCheck())
            checks.append(RareCategoryCheck())
            checks.append(UnknownCategoryCheck())
            checks.append(VeryHighCardinalityCheck())
            checks.append(HighCardinalityCategoricalCheck())
            checks.append(ManyRareCategoriesCheck())
            checks.append(SignificantRareVolumeCheck())
            checks.append(CaseInconsistencyCheck())
            checks.append(WhitespaceIssuesCheck())
            checks.append(SingleCategoryCheck())
            checks.append(PossibleTyposCheck())

        elif column_type == ColumnType.DATETIME:
            checks.append(DatetimeFutureLeakageCheck())
            checks.append(PlaceholderDateCheck())
            checks.append(DatetimeFormatInconsistentCheck())
            checks.append(DatetimeMixedTimezonesCheck())
            checks.append(DatetimeUnrealisticRangeCheck())

        elif column_type == ColumnType.BINARY:
            checks.append(BinaryNotBinaryCheck())
            checks.append(BinarySevereImbalanceCheck())
            checks.append(BinaryAllSameValueCheck())
            checks.append(BinaryUnexpectedValuesCheck())

        elif column_type == ColumnType.TEXT:
            checks.append(PIIDetectedCheck())
            checks.append(EmptyTextCheck())
            checks.append(ShortTextCheck())
            checks.append(HighUniquenessTextCheck())

        return checks