churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,1632 @@
1
+ from typing import Any, Optional
2
+
3
+ from pydantic import BaseModel
4
+
5
+ from customer_retention.core.compat import is_datetime64_any_dtype, pd
6
+ from customer_retention.core.components.enums import Severity
7
+ from customer_retention.core.config import ColumnType
8
+
9
+
10
class QualityCheckResult(BaseModel):
    """Outcome of one data-quality check applied to one column.

    Built by ``QualityCheck.create_result`` so every check emits a
    uniformly shaped record that downstream reporting can aggregate.
    """

    check_id: str  # stable check identifier, e.g. "FQ001", "TG001"
    check_name: str  # human-readable name of the check
    column_name: str  # column the check was run against
    passed: bool  # True when the column satisfies the check
    severity: Severity  # effective severity for this particular result
    message: str  # one-line summary of the finding
    # Check-specific metrics (counts, ratios, class lists, ...).
    # NOTE(review): mutable default is safe here — pydantic copies field
    # defaults per instance rather than sharing them.
    details: dict[str, Any] = {}
    recommendation: Optional[str] = None  # suggested remediation, if any
19
+
20
+
21
class QualityCheck:
    """Base class for column-level quality checks.

    Each subclass identifies itself with a ``check_id``/``check_name`` pair
    and a default ``severity``; its ``run`` method calls ``create_result``
    to produce a uniformly populated :class:`QualityCheckResult`.
    """

    def __init__(self, check_id: str, check_name: str, severity: Severity):
        self.check_id = check_id
        self.check_name = check_name
        # Default severity; individual results may override it below.
        self.severity = severity

    def create_result(self, column_name: str, passed: bool, message: str,
                      details: Optional[dict] = None,
                      recommendation: Optional[str] = None,
                      severity: Optional[Severity] = None) -> QualityCheckResult:
        """Build a result stamped with this check's identity.

        Args:
            column_name: Column the check was evaluated against.
            passed: Whether the column satisfied the check.
            message: One-line summary of the finding.
            details: Optional check-specific metrics; ``None`` becomes ``{}``.
            recommendation: Optional remediation advice.
            severity: Per-result override; falls back to the check's
                configured default severity when ``None``.

        Returns:
            A fully populated ``QualityCheckResult``.
        """
        # Fix: parameters defaulting to None were annotated as plain
        # ``dict``/``str``; they are Optional (defaults unchanged).
        return QualityCheckResult(
            check_id=self.check_id,
            check_name=self.check_name,
            column_name=column_name,
            passed=passed,
            severity=severity or self.severity,
            message=message,
            details=details or {},
            recommendation=recommendation,
        )
40
+
41
+
42
class MissingValueCheck(QualityCheck):
    """FQ001: grade a column's missing-value percentage in tiers.

    Thresholds: >95% critical (fails), >70% high (fails), >20% medium
    (passes with a warning), otherwise acceptable.
    """

    def __init__(self):
        super().__init__("FQ001", "Column has >95% missing", Severity.CRITICAL)
        self.threshold_critical = 95.0
        self.threshold_high = 70.0
        self.threshold_medium = 20.0

    def run(self, column_name: str, universal_metrics: Any) -> QualityCheckResult:
        null_pct = universal_metrics.null_percentage
        detail = {"null_percentage": null_pct,
                  "null_count": universal_metrics.null_count}
        # Tiers from most to least severe; the first exceeded cutoff wins.
        tiers = (
            (self.threshold_critical, False, "Critical", Severity.CRITICAL,
             "Consider imputation strategy or feature removal if not informative"),
            (self.threshold_high, False, "High", Severity.HIGH,
             "Review imputation strategy or investigate data collection issues"),
            (self.threshold_medium, True, "Medium", Severity.MEDIUM,
             "Monitor missingness pattern and consider simple imputation"),
        )
        for cutoff, passed, label, sev, advice in tiers:
            if null_pct > cutoff:
                return self.create_result(
                    column_name, passed,
                    f"{label}: {null_pct}% missing values (>{cutoff}%)",
                    detail, advice, sev,
                )
        # Below every tier: acceptable, default severity, no recommendation.
        return self.create_result(
            column_name, True,
            f"Acceptable missing values: {null_pct}%",
            detail,
        )
82
+
83
+
84
class HighCardinalityCheck(QualityCheck):
    """CAT001: warn when a categorical column is almost entirely unique."""

    def __init__(self):
        super().__init__("CAT001", "High Cardinality Categorical", Severity.MEDIUM)
        self.threshold_ratio = 0.95

    def run(self, column_name: str, categorical_metrics: Any) -> Optional[QualityCheckResult]:
        # No categorical metrics -> check is not applicable to this column.
        if categorical_metrics is None:
            return None

        ratio = categorical_metrics.cardinality_ratio
        info = {"cardinality": categorical_metrics.cardinality,
                "cardinality_ratio": ratio}

        if ratio > self.threshold_ratio:
            advice = (f"Consider using {categorical_metrics.encoding_recommendation} "
                      f"encoding or treating as text")
            return self.create_result(
                column_name, False,
                f"Very high cardinality ratio: {ratio:.2%}",
                info, advice,
            )
        return self.create_result(
            column_name, True,
            f"Acceptable cardinality ratio: {ratio:.2%}",
            info,
        )
108
+
109
+
110
class LowCardinalityCheck(QualityCheck):
    """NUM001: flag numeric columns with suspiciously few distinct values."""

    # Column types this check applies to; all others are skipped.
    _NUMERIC_TYPES = (ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE)

    def __init__(self):
        super().__init__("NUM001", "Low Cardinality Numeric", Severity.LOW)
        self.threshold = 10

    def run(self, column_name: str, universal_metrics: Any, column_type: ColumnType) -> Optional[QualityCheckResult]:
        if column_type not in self._NUMERIC_TYPES:
            return None  # only meaningful for numeric columns

        n_unique = universal_metrics.distinct_count
        if n_unique < self.threshold:
            return self.create_result(
                column_name, False,
                f"Low cardinality for numeric: {n_unique} unique values",
                {"distinct_count": n_unique},
                "Consider treating as categorical or ordinal feature",
            )
        return self.create_result(
            column_name, True,
            f"Acceptable cardinality for numeric: {n_unique} unique values",
            {"distinct_count": n_unique},
        )
134
+
135
+
136
class ConstantFeatureCheck(QualityCheck):
    """FQ003: detect columns holding a single distinct value (no signal)."""

    def __init__(self):
        super().__init__("FQ003", "Column is constant", Severity.CRITICAL)
        self.threshold_ratio = 1.0

    def run(self, column_name: str, universal_metrics: Any, column_type: Optional[ColumnType] = None) -> QualityCheckResult:
        # column_type is accepted for signature parity with sibling checks
        # but is not consulted by this check.
        if universal_metrics.total_count == 0:
            return self.create_result(column_name, True, "Empty column", {})

        n_unique = universal_metrics.distinct_count
        if n_unique != 1:
            return self.create_result(
                column_name, True,
                f"Column has {n_unique} distinct values",
                {"distinct_count": n_unique},
            )

        constant = universal_metrics.most_common_value
        return self.create_result(
            column_name, False,
            f"Column is constant: only 1 distinct value ({constant})",
            {"distinct_count": 1, "constant_value": constant},
            "CRITICAL: Remove constant column - provides no information for modeling",
        )
160
+
161
+
162
class ImbalancedTargetCheck(QualityCheck):
    """CAT002: grade target class imbalance by majority:minority ratio.

    Ratio >20:1 is severe, >5:1 moderate; both fail the check.
    """

    def __init__(self):
        super().__init__("CAT002", "Imbalanced Target Variable", Severity.HIGH)
        self.threshold_severe = 20.0
        self.threshold_moderate = 5.0

    def run(self, column_name: str, target_metrics: Any) -> Optional[QualityCheckResult]:
        if target_metrics is None:
            return None

        ratio = target_metrics.imbalance_ratio
        minority_pct = target_metrics.minority_percentage
        summary = f"{ratio:.1f}:1 ratio, minority class {minority_pct}%"

        if ratio > self.threshold_severe:
            label = "Severe imbalance"
            advice = "Apply SMOTE, class weights, or stratified sampling"
        elif ratio > self.threshold_moderate:
            label = "Moderate imbalance"
            advice = "Consider class weights or balanced sampling"
        else:
            # Balanced enough: no minority_class detail, no recommendation.
            return self.create_result(
                column_name, True,
                f"Acceptable balance: {summary}",
                {"imbalance_ratio": ratio, "minority_percentage": minority_pct},
            )

        return self.create_result(
            column_name, False,
            f"{label}: {summary}",
            {"imbalance_ratio": ratio, "minority_percentage": minority_pct,
             "minority_class": target_metrics.minority_class},
            advice,
        )
197
+
198
+
199
class TargetNullCheck(QualityCheck):
    """TG001: the target column must contain no nulls at all."""

    def __init__(self):
        super().__init__("TG001", "Target Contains Nulls", Severity.CRITICAL)

    def run(self, column_name: str, universal_metrics: Any) -> Optional[QualityCheckResult]:
        if universal_metrics is None:
            return None

        nulls = universal_metrics.null_count
        if nulls == 0:
            return self.create_result(
                column_name, True,
                "Target variable has no null values",
                {"null_count": 0},
            )

        pct = universal_metrics.null_percentage
        return self.create_result(
            column_name, False,
            f"Target variable contains {nulls} null values ({pct}%)",
            {"null_count": nulls, "null_percentage": pct},
            "CRITICAL: Target variable must not contain nulls. Remove or impute before modeling.",
        )
220
+
221
+
222
class SingleClassTargetCheck(QualityCheck):
    """TG005: a classifier requires at least two target classes."""

    def __init__(self):
        super().__init__("TG005", "Single Class Target", Severity.CRITICAL)

    def run(self, column_name: str, target_metrics: Any) -> Optional[QualityCheckResult]:
        if target_metrics is None:
            return None

        if target_metrics.n_classes != 1:
            return self.create_result(
                column_name, True,
                f"Target variable has {target_metrics.n_classes} classes",
                {"n_classes": target_metrics.n_classes},
            )

        # Iterating a dict yields its keys, so this equals list(d.keys()).
        classes = list(target_metrics.class_distribution)
        return self.create_result(
            column_name, False,
            f"Target variable has only 1 class: {classes[0]}",
            {"n_classes": 1, "classes": classes},
            "CRITICAL: Cannot train a classifier with only one class. Check data filtering or sampling.",
        )
243
+
244
+
245
class TargetSevereImbalanceCheck(QualityCheck):
    """TG002: fail when the minority target class falls below 1% of rows."""

    def __init__(self):
        super().__init__("TG002", "Target Severe Imbalance", Severity.HIGH)
        self.threshold = 1.0  # minority-class percentage floor

    def run(self, column_name: str, target_metrics: Any) -> Optional[QualityCheckResult]:
        if target_metrics is None:
            return None

        minority_pct = target_metrics.minority_percentage
        if minority_pct < self.threshold:
            return self.create_result(
                column_name, False,
                f"Target has severe class imbalance: minority class {minority_pct}% (< {self.threshold}%)",
                {"minority_percentage": minority_pct,
                 "minority_class": target_metrics.minority_class,
                 "imbalance_ratio": target_metrics.imbalance_ratio},
                "Apply SMOTE, class weights, or consider alternative algorithms (e.g., anomaly detection).",
            )
        return self.create_result(
            column_name, True,
            f"Minority class at {minority_pct}% (>= {self.threshold}%)",
            {"minority_percentage": minority_pct},
        )
270
+
271
+
272
class TargetModerateImbalanceCheck(QualityCheck):
    """TG003: fail (MEDIUM) when the minority class falls below 10% of rows."""

    def __init__(self):
        super().__init__("TG003", "Target Moderate Imbalance", Severity.MEDIUM)
        # Minority-class percentage below this value counts as moderate imbalance.
        self.threshold = 10.0  # < 10%

    def run(self, column_name: str, target_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if target_metrics is None:
            return None

        minority_pct = target_metrics.minority_percentage
        if minority_pct >= self.threshold:
            return self.create_result(
                column_name, True,
                f"Minority class at {minority_pct}% (>= {self.threshold}%)",
                {"minority_percentage": minority_pct},
            )

        return self.create_result(
            column_name, False,
            f"Target has moderate class imbalance: minority class {minority_pct}% (< {self.threshold}%)",
            {"minority_percentage": minority_pct, "minority_class": target_metrics.minority_class,
             "imbalance_ratio": target_metrics.imbalance_ratio},
            "Consider class weights, stratified sampling, or balanced algorithms.",
        )
297
+
298
+
299
class TargetUnexpectedClassesCheck(QualityCheck):
    """TG004: fail (HIGH) when the observed class count differs from the configured expectation."""

    def __init__(self, expected_classes: Optional[int] = None):
        super().__init__("TG004", "Target Unexpected Classes", Severity.HIGH)
        # When None, the check is disabled (run() returns None).
        self.expected_classes = expected_classes

    def run(self, column_name: str, target_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when disabled or metrics are unavailable."""
        if target_metrics is None or self.expected_classes is None:
            return None

        n_classes = target_metrics.n_classes
        if n_classes == self.expected_classes:
            return self.create_result(
                column_name, True,
                f"Target has expected {n_classes} classes",
                {"n_classes": n_classes},
            )

        return self.create_result(
            column_name, False,
            f"Target has {n_classes} classes, expected {self.expected_classes}",
            {"n_classes": n_classes, "expected_classes": self.expected_classes,
             "classes": list(target_metrics.class_distribution.keys())},
            "Investigate class mismatch. Check for data leakage, incorrect filtering, or configuration error.",
        )
324
+
325
+
326
class SkewnessCheck(QualityCheck):
    """NUM002: grade a numeric column's skewness as extreme, moderate, or acceptable."""

    def __init__(self):
        super().__init__("NUM002", "Extreme Skewness", Severity.MEDIUM)
        # |skewness| above these cutoffs is flagged.
        self.threshold_severe = 3.0
        self.threshold_moderate = 1.0

    def run(self, column_name: str, numeric_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when skewness is unavailable."""
        if numeric_metrics is None or numeric_metrics.skewness is None:
            return None

        skew = numeric_metrics.skewness
        magnitude = abs(skew)

        if magnitude > self.threshold_severe:
            return self.create_result(
                column_name, False,
                f"Extreme skewness: {skew:.2f}",
                {"skewness": skew},
                "Apply log, sqrt, or Box-Cox transformation",
            )
        if magnitude > self.threshold_moderate:
            return self.create_result(
                column_name, False,
                f"Moderate skewness: {skew:.2f}",
                {"skewness": skew},
                "Consider transformation for linear models",
            )
        return self.create_result(
            column_name, True,
            f"Acceptable skewness: {skew:.2f}",
            {"skewness": skew},
        )
358
+
359
+
360
class OutlierCheck(QualityCheck):
    """NUM003: grade a numeric column's IQR outlier rate as high, moderate, or acceptable."""

    def __init__(self):
        super().__init__("NUM003", "Excessive Outliers", Severity.MEDIUM)
        # Outlier-percentage cutoffs for the two failing grades.
        self.threshold_high = 10.0
        self.threshold_medium = 5.0

    def run(self, column_name: str, numeric_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if numeric_metrics is None:
            return None

        pct = numeric_metrics.outlier_percentage

        if pct > self.threshold_high:
            return self.create_result(
                column_name, False,
                f"High outlier percentage: {pct}%",
                {"outlier_count_iqr": numeric_metrics.outlier_count_iqr,
                 "outlier_percentage": pct},
                "Review outliers for data quality issues or apply winsorization/clipping",
            )
        if pct > self.threshold_medium:
            return self.create_result(
                column_name, False,
                f"Moderate outlier percentage: {pct}%",
                {"outlier_count_iqr": numeric_metrics.outlier_count_iqr,
                 "outlier_percentage": pct},
                "Consider robust scaling or outlier treatment",
            )
        return self.create_result(
            column_name, True,
            f"Acceptable outlier percentage: {pct}%",
            {"outlier_percentage": pct},
        )
394
+
395
+
396
class ZeroInflationCheck(QualityCheck):
    """NUM004: fail (LOW) when more than half of the numeric values are zero."""

    def __init__(self):
        super().__init__("NUM004", "Zero-Inflated Feature", Severity.LOW)
        # Zero-percentage above this value flags the column.
        self.threshold = 50.0

    def run(self, column_name: str, numeric_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if numeric_metrics is None:
            return None

        zero_pct = numeric_metrics.zero_percentage
        if zero_pct <= self.threshold:
            return self.create_result(
                column_name, True,
                f"Acceptable zero percentage: {zero_pct}%",
                {"zero_percentage": zero_pct},
            )

        return self.create_result(
            column_name, False,
            f"Zero-inflated: {zero_pct}% zeros",
            {"zero_count": numeric_metrics.zero_count, "zero_percentage": zero_pct},
            "Consider zero-inflated models or separate binary indicator",
        )
420
+
421
+
422
class IdentifierLeakageCheck(QualityCheck):
    """LEAK001: fail (CRITICAL) when an identifier column is marked as a model feature."""

    def __init__(self):
        super().__init__("LEAK001", "Identifier Column in Features", Severity.CRITICAL)

    def run(self, column_name: str, column_type: ColumnType, should_use_as_feature: bool) -> Optional[QualityCheckResult]:
        """Return a pass/fail result; None for non-identifier columns (check not applicable)."""
        if column_type != ColumnType.IDENTIFIER:
            return None

        if not should_use_as_feature:
            return self.create_result(
                column_name, True,
                "Identifier correctly excluded from features",
                {"column_type": column_type.value},
            )

        return self.create_result(
            column_name, False,
            "Identifier column marked as feature",
            {"column_type": column_type.value},
            "Remove identifier from feature set to prevent data leakage",
        )
443
+
444
+
445
class DatetimeFutureLeakageCheck(QualityCheck):
    """DT001: fail (HIGH) when the datetime column contains dates in the future."""

    def __init__(self):
        super().__init__("DT001", "Future Dates Detected", Severity.HIGH)

    def run(self, column_name: str, datetime_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if datetime_metrics is None:
            return None

        future_total = datetime_metrics.future_date_count
        if future_total == 0:
            return self.create_result(
                column_name, True,
                "No future dates detected",
                {"future_date_count": 0},
            )

        return self.create_result(
            column_name, False,
            f"Found {future_total} future dates",
            {"future_date_count": future_total},
            "Investigate data quality issues or potential temporal leakage",
        )
468
+
469
+
470
class PlaceholderDateCheck(QualityCheck):
    """DT002: fail (MEDIUM) when placeholder dates exceed a small share of rows."""

    def __init__(self):
        super().__init__("DT002", "Placeholder Dates", Severity.MEDIUM)
        # NOTE(review): this threshold is compared against a value already in
        # percent units, so it triggers above 0.05% — confirm 5% was not intended.
        self.threshold = 0.05

    def run(self, column_name: str, datetime_metrics: Any, total_count: int) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable or the column is empty."""
        if datetime_metrics is None or total_count == 0:
            return None

        placeholder_count = datetime_metrics.placeholder_count
        placeholder_pct = (placeholder_count / total_count) * 100

        if placeholder_pct <= self.threshold:
            return self.create_result(
                column_name, True,
                f"No significant placeholder dates: {placeholder_count}",
                {"placeholder_count": placeholder_count},
            )

        return self.create_result(
            column_name, False,
            f"Placeholder dates found: {placeholder_count} ({placeholder_pct:.2f}%)",
            {"placeholder_count": placeholder_count, "placeholder_percentage": placeholder_pct},
            "Replace placeholder dates with null or investigate data quality",
        )
495
+
496
+
497
class RareCategoryCheck(QualityCheck):
    """CAT003: fail (MEDIUM) when rare categories make up too large a share of the column."""

    def __init__(self):
        super().__init__("CAT003", "High Rare Category Count", Severity.MEDIUM)
        # Rare-category percentage above this value flags the column.
        self.threshold_pct = 20.0

    def run(self, column_name: str, categorical_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if categorical_metrics is None:
            return None

        rare_pct = categorical_metrics.rare_category_percentage
        rare_count = categorical_metrics.rare_category_count
        details = {"rare_category_count": rare_count, "rare_category_percentage": rare_pct}

        if rare_pct <= self.threshold_pct:
            return self.create_result(
                column_name, True,
                f"Acceptable rare categories: {rare_pct}% ({rare_count} categories)",
                details,
            )

        return self.create_result(
            column_name, False,
            f"High rare category percentage: {rare_pct}% ({rare_count} categories)",
            details,
            "Consider grouping rare categories or using target encoding",
        )
522
+
523
+
524
class UnknownCategoryCheck(QualityCheck):
    """CAT004: fail (LOW) when the column contains unknown/missing value indicators."""

    def __init__(self):
        super().__init__("CAT004", "Unknown Categories Present", Severity.LOW)

    def run(self, column_name: str, categorical_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if categorical_metrics is None:
            return None

        has_unknown = categorical_metrics.contains_unknown
        if not has_unknown:
            return self.create_result(
                column_name, True,
                "No unknown value indicators found",
                {"contains_unknown": has_unknown},
            )

        return self.create_result(
            column_name, False,
            "Contains unknown/missing value indicators",
            {"contains_unknown": has_unknown},
            "Replace with proper nulls or create explicit category",
        )
547
+
548
+
549
class PIIDetectedCheck(QualityCheck):
    """TX001: fail (CRITICAL) when PII has been detected in a text column."""

    def __init__(self):
        super().__init__("TX001", "PII Detected", Severity.CRITICAL)

    def run(self, column_name: str, text_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if text_metrics is None:
            return None

        if not text_metrics.pii_detected:
            return self.create_result(
                column_name, True,
                "No PII detected",
                {"pii_detected": False},
            )

        return self.create_result(
            column_name, False,
            f"PII detected: {', '.join(text_metrics.pii_types)}",
            {"pii_types": text_metrics.pii_types},
            "CRITICAL: Remove PII or mask sensitive data before processing. Consider data anonymization techniques.",
        )
571
+
572
+
573
class EmptyTextCheck(QualityCheck):
    """TX002: fail (HIGH) when more than half of the text values are empty."""

    def __init__(self):
        super().__init__("TX002", "Mostly Empty Text", Severity.HIGH)
        # Empty-text percentage above this value flags the column.
        self.threshold = 50.0

    def run(self, column_name: str, text_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if text_metrics is None:
            return None

        empty_pct = text_metrics.empty_percentage
        if empty_pct <= self.threshold:
            return self.create_result(
                column_name, True,
                f"Acceptable empty text percentage: {empty_pct}%",
                {"empty_percentage": empty_pct},
            )

        return self.create_result(
            column_name, False,
            f"High percentage of empty text: {empty_pct}%",
            {"empty_percentage": empty_pct, "empty_count": text_metrics.empty_count},
            "Review data quality and consider imputation or feature removal",
        )
597
+
598
+
599
class ShortTextCheck(QualityCheck):
    """TX003: fail (MEDIUM) when the average text length is very short."""

    def __init__(self):
        super().__init__("TX003", "Very Short Texts", Severity.MEDIUM)
        # Mean character length below this value flags the column.
        self.threshold = 10.0

    def run(self, column_name: str, text_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if text_metrics is None:
            return None

        # assumes length_mean is a number (not None) whenever text_metrics
        # is present — TODO confirm with the metrics producer.
        mean_len = text_metrics.length_mean
        if mean_len >= self.threshold:
            return self.create_result(
                column_name, True,
                f"Acceptable text length: {mean_len:.1f} characters",
                {"length_mean": mean_len},
            )

        return self.create_result(
            column_name, False,
            f"Very short average text length: {mean_len:.1f} characters",
            {"length_mean": mean_len},
            "May be better treated as categorical. Consider reclassifying column type.",
        )
623
+
624
+
625
class InfiniteValuesCheck(QualityCheck):
    """NC006: fail (CRITICAL) when a numeric column contains infinite values."""

    def __init__(self):
        super().__init__("NC006", "Infinite Values", Severity.CRITICAL)

    def run(self, column_name: str, numeric_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if numeric_metrics is None:
            return None

        inf_total = numeric_metrics.inf_count
        if inf_total == 0:
            return self.create_result(
                column_name, True,
                "No infinite values detected",
                {"inf_count": 0, "inf_percentage": 0.0},
            )

        return self.create_result(
            column_name, False,
            f"Column contains {inf_total} infinite values ({numeric_metrics.inf_percentage}%)",
            {"inf_count": inf_total, "inf_percentage": numeric_metrics.inf_percentage},
            "CRITICAL: Remove or replace infinite values before processing. Use imputation or capping strategies.",
        )
646
+
647
+
648
class ExtremeOutliersCheck(QualityCheck):
    """NC001: fail (HIGH) when the outlier percentage exceeds 5%."""

    def __init__(self):
        super().__init__("NC001", "Extreme Outliers", Severity.HIGH)
        # Outlier percentage above this value flags the column.
        self.threshold = 5.0  # > 5%

    def run(self, column_name: str, numeric_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if numeric_metrics is None:
            return None

        pct = numeric_metrics.outlier_percentage
        if pct <= self.threshold:
            return self.create_result(
                column_name, True,
                f"Acceptable outlier percentage: {pct}%",
                {"outlier_percentage": pct},
            )

        return self.create_result(
            column_name, False,
            f"Extreme outlier percentage: {pct}% (> {self.threshold}%)",
            {"outlier_percentage": pct, "outlier_count_iqr": numeric_metrics.outlier_count_iqr,
             "outlier_count_zscore": numeric_metrics.outlier_count_zscore},
            "Apply robust scaling, winsorization, or consider removing outliers.",
        )
673
+
674
+
675
class ModerateOutliersCheck(QualityCheck):
    """NC002: fail (MEDIUM) when the outlier percentage exceeds 1%."""

    def __init__(self):
        super().__init__("NC002", "Moderate Outliers", Severity.MEDIUM)
        # Outlier percentage above this value flags the column.
        self.threshold = 1.0  # > 1%

    def run(self, column_name: str, numeric_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if numeric_metrics is None:
            return None

        pct = numeric_metrics.outlier_percentage
        if pct <= self.threshold:
            return self.create_result(
                column_name, True,
                f"Low outlier percentage: {pct}%",
                {"outlier_percentage": pct},
            )

        return self.create_result(
            column_name, False,
            f"Moderate outlier percentage: {pct}% (> {self.threshold}%)",
            {"outlier_percentage": pct, "outlier_count_iqr": numeric_metrics.outlier_count_iqr},
            "Consider investigating outliers and applying transformations.",
        )
699
+
700
+
701
class HighSkewnessCheck(QualityCheck):
    """NC003: fail (MEDIUM) when |skewness| exceeds 2."""

    def __init__(self):
        super().__init__("NC003", "High Skewness", Severity.MEDIUM)
        # Absolute skewness above this value flags the column.
        self.threshold = 2.0  # |skewness| > 2

    def run(self, column_name: str, numeric_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when skewness is unavailable."""
        if numeric_metrics is None or numeric_metrics.skewness is None:
            return None

        skew = numeric_metrics.skewness
        magnitude = abs(skew)

        if magnitude <= self.threshold:
            return self.create_result(
                column_name, True,
                f"Acceptable skewness: {skew:.2f}",
                {"skewness": skew},
            )

        return self.create_result(
            column_name, False,
            f"High skewness detected: {skew:.2f} (|skew| > {self.threshold})",
            {"skewness": skew, "abs_skewness": magnitude},
            "Apply log, sqrt, or Box-Cox transformation to reduce skewness.",
        )
725
+
726
+
727
class NumericZeroInflationCheck(QualityCheck):
    """NC004: fail (MEDIUM) when more than half of the numeric values are zero."""

    def __init__(self):
        super().__init__("NC004", "Zero Inflation", Severity.MEDIUM)
        # Zero-percentage above this value flags the column.
        self.threshold = 50.0  # > 50%

    def run(self, column_name: str, numeric_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if numeric_metrics is None:
            return None

        zero_pct = numeric_metrics.zero_percentage
        if zero_pct <= self.threshold:
            return self.create_result(
                column_name, True,
                f"Acceptable zero percentage: {zero_pct}%",
                {"zero_percentage": zero_pct},
            )

        return self.create_result(
            column_name, False,
            f"Zero-inflated: {zero_pct}% zeros (> {self.threshold}%)",
            {"zero_percentage": zero_pct, "zero_count": numeric_metrics.zero_count},
            "Consider zero-inflated models, indicator variable, or separate handling of zeros.",
        )
751
+
752
+
753
class UnexpectedNegativesCheck(QualityCheck):
    """NC005: fail (HIGH) when a column expected to be non-negative contains negatives."""

    def __init__(self, allow_negatives: bool = True):
        super().__init__("NC005", "Unexpected Negative Values", Severity.HIGH)
        # When True (the default) the check is disabled and run() returns None.
        self.allow_negatives = allow_negatives

    def run(self, column_name: str, numeric_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when disabled or metrics are unavailable."""
        if numeric_metrics is None or self.allow_negatives:
            return None

        neg_total = numeric_metrics.negative_count
        if neg_total == 0:
            return self.create_result(
                column_name, True,
                "No negative values found",
                {"negative_count": 0},
            )

        return self.create_result(
            column_name, False,
            f"Column contains {neg_total} negative values ({numeric_metrics.negative_percentage}%), but negatives not expected",
            {"negative_count": neg_total, "negative_percentage": numeric_metrics.negative_percentage},
            "Investigate negative values. May indicate data errors or need for transformation.",
        )
777
+
778
+
779
class ConstantValueCheck(QualityCheck):
    """NC007: fail (HIGH) when a numeric column has zero standard deviation."""

    def __init__(self):
        super().__init__("NC007", "Constant Value", Severity.HIGH)

    def run(self, column_name: str, numeric_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if numeric_metrics is None:
            return None

        # NOTE(review): a NaN/None std would fall into the passing branch
        # here — confirm the metrics producer guarantees a numeric std.
        if numeric_metrics.std != 0:
            return self.create_result(
                column_name, True,
                f"Column has variance (std = {numeric_metrics.std:.4f})",
                {"std": numeric_metrics.std},
            )

        return self.create_result(
            column_name, False,
            "Column has constant value (std = 0)",
            {"std": 0, "mean": numeric_metrics.mean},
            "Remove constant column - provides no information for modeling.",
        )
800
+
801
+
802
class SuspiciousPrecisionCheck(QualityCheck):
    """NC008: flag numeric columns whose sampled values are all whole numbers.

    A column where every sampled value is integral may have lost decimal
    precision upstream (e.g. unintended rounding or an integer cast).
    This is a simplified heuristic that inspects up to 100 non-null values.
    """

    def __init__(self):
        super().__init__("NC008", "Suspicious Precision", Severity.LOW)

    def run(self, column_name: str, series: pd.Series) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when the series is None or has no non-null values."""
        import math  # local import keeps the check self-contained

        if series is None or len(series) == 0:
            return None

        clean_series = series.dropna()
        if len(clean_series) == 0:
            return None

        # Use a positional slice: a plain [:100] slice on a pandas Series is
        # label-based, which can misbehave on the non-default integer index
        # left behind by dropna(). iloc also makes the min(100, len) clamp
        # unnecessary.
        sample = clean_series.iloc[:100]

        # Guard with isfinite: dropna() does not remove +/-inf, and int(inf)
        # would raise OverflowError.
        all_whole = all(
            isinstance(v, (int, float)) and math.isfinite(v) and v == int(v)
            for v in sample
        )

        # Require a minimum sample so tiny columns don't trigger the flag.
        if all_whole and len(clean_series) > 10:
            return self.create_result(
                column_name, False,
                "All sampled values are whole numbers - may indicate precision loss or rounding",
                {"precision_issue": "all_whole_numbers"},
                "Verify source data precision and check for unintended rounding.",
            )
        return self.create_result(
            column_name, True,
            "Precision appears normal",
            {},
        )
832
+
833
+
834
class HighOutliersCheck(QualityCheck):
    """FQ005: fail (HIGH) when more than half of the values are outliers."""

    def __init__(self):
        super().__init__("FQ005", "Column has >50% outliers", Severity.HIGH)
        # Outlier percentage above this value flags the column.
        self.threshold = 50.0

    def run(self, column_name: str, numeric_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if numeric_metrics is None:
            return None

        pct = numeric_metrics.outlier_percentage
        if pct <= self.threshold:
            return self.create_result(
                column_name, True,
                f"Acceptable outlier percentage: {pct}%",
                {"outlier_percentage": pct},
            )

        return self.create_result(
            column_name, False,
            f"Extreme outlier percentage: {pct}% (> {self.threshold}%)",
            {"outlier_percentage": pct, "outlier_count_iqr": numeric_metrics.outlier_count_iqr},
            "HIGH: Column may be unreliable for modeling. Consider robust transformations or removal.",
        )
858
+
859
+
860
class AllValuesOutliersCheck(QualityCheck):
    """FQ011: fail (CRITICAL) when literally every value is an outlier."""

    def __init__(self):
        super().__init__("FQ011", "All values are outliers", Severity.CRITICAL)

    def run(self, column_name: str, numeric_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if numeric_metrics is None:
            return None

        pct = numeric_metrics.outlier_percentage
        if pct != 100.0:
            return self.create_result(
                column_name, True,
                f"Not all values are outliers: {pct}%",
                {"outlier_percentage": pct},
            )

        return self.create_result(
            column_name, False,
            "CRITICAL: All values are outliers (100%)",
            {"outlier_percentage": 100.0},
            "CRITICAL: Column may have data quality issues. Investigate and consider removal.",
        )
883
+
884
+
885
class UnknownColumnTypeCheck(QualityCheck):
    """FQ008: fail (MEDIUM) when type inference could not classify the column."""

    def __init__(self):
        super().__init__("FQ008", "Unknown column type", Severity.MEDIUM)

    def run(self, column_name: str, column_type: ColumnType) -> Optional[QualityCheckResult]:
        """Return a pass/fail result based on whether a type was determined."""
        if column_type != ColumnType.UNKNOWN:
            return self.create_result(
                column_name, True,
                f"Column type determined: {column_type.value}",
                {"column_type": column_type.value},
            )

        return self.create_result(
            column_name, False,
            "Column type could not be determined",
            {"column_type": "UNKNOWN"},
            "Manually specify column type or investigate data format.",
        )
903
+
904
+
905
class VeryHighCardinalityNominalCheck(QualityCheck):
    """FQ009: fail (MEDIUM) when a nominal column has more than 1000 unique categories."""

    def __init__(self):
        super().__init__("FQ009", "Very high cardinality nominal", Severity.MEDIUM)
        # Unique-category count above this value flags the column.
        self.threshold = 1000

    def run(self, column_name: str, categorical_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if categorical_metrics is None:
            return None

        cardinality = categorical_metrics.cardinality
        if cardinality <= self.threshold:
            return self.create_result(
                column_name, True,
                f"Acceptable cardinality: {cardinality} unique categories",
                {"cardinality": cardinality},
            )

        return self.create_result(
            column_name, False,
            f"Very high cardinality: {cardinality} unique categories (> {self.threshold})",
            {"cardinality": cardinality},
            "Consider treating as high cardinality or using hashing/embedding encoding.",
        )
929
+
930
+
931
class UnrealisticDateRangeCheck(QualityCheck):
    """FQ012: fail (HIGH) when the column's date span exceeds 100 years."""

    def __init__(self):
        super().__init__("FQ012", "Date range unrealistic", Severity.HIGH)
        # Spans longer than this many years are flagged.
        self.threshold_years = 100

    def run(self, column_name: str, datetime_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if datetime_metrics is None:
            return None

        span_days = datetime_metrics.date_range_days
        span_years = span_days / 365.25  # average Gregorian year length

        if span_years <= self.threshold_years:
            return self.create_result(
                column_name, True,
                f"Acceptable date range: {span_years:.1f} years",
                {"date_range_days": span_days, "date_range_years": round(span_years, 1)},
            )

        return self.create_result(
            column_name, False,
            f"Unrealistic date range: {span_years:.1f} years (> {self.threshold_years} years)",
            {"date_range_days": span_days, "date_range_years": round(span_years, 1),
             "min_date": datetime_metrics.min_date, "max_date": datetime_metrics.max_date},
            "HIGH: Date range spans > 100 years. Review for data quality issues.",
        )
957
+
958
+
959
class HighUniquenessTextCheck(QualityCheck):
    """TX004: fail (MEDIUM) when a text column is almost entirely unique (likely an identifier)."""

    def __init__(self):
        super().__init__("TX004", "High Uniqueness Text", Severity.MEDIUM)
        # Distinct fraction (0-1) above this value flags the column.
        self.threshold = 0.95

    def run(self, column_name: str, universal_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if universal_metrics is None:
            return None

        # distinct_percentage is in percent units; normalize to a 0-1 fraction.
        distinct_fraction = universal_metrics.distinct_percentage / 100.0

        if distinct_fraction <= self.threshold:
            return self.create_result(
                column_name, True,
                f"Acceptable text uniqueness: {universal_metrics.distinct_percentage}%",
                {"distinct_percentage": universal_metrics.distinct_percentage},
            )

        return self.create_result(
            column_name, False,
            f"Text column has very high uniqueness: {universal_metrics.distinct_percentage}% unique (> {self.threshold * 100}%)",
            {"distinct_percentage": universal_metrics.distinct_percentage, "distinct_count": universal_metrics.distinct_count},
            "Text column may actually be an identifier. Consider reclassifying as IDENTIFIER type.",
        )
983
+
984
+
985
class BinaryNotBinaryCheck(QualityCheck):
    """BN001: fail (CRITICAL) when a column typed as binary does not have exactly 2 distinct values."""

    def __init__(self):
        super().__init__("BN001", "Not Binary", Severity.CRITICAL)

    def run(self, column_name: str, universal_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if universal_metrics is None:
            return None

        distinct_total = universal_metrics.distinct_count
        if distinct_total == 2:
            return self.create_result(
                column_name, True,
                "Column has exactly 2 distinct values",
                {"distinct_count": 2},
            )

        return self.create_result(
            column_name, False,
            f"Column marked as binary but has {distinct_total} distinct values (expected 2)",
            {"distinct_count": distinct_total},
            "CRITICAL: Binary columns must have exactly 2 distinct values. Review column type or data.",
        )
1008
+
1009
+
1010
class BinarySevereImbalanceCheck(QualityCheck):
    """BN002: fail (MEDIUM) when a binary column's true-rate is outside [1%, 99%]."""

    def __init__(self):
        super().__init__("BN002", "Binary Severe Imbalance", Severity.MEDIUM)
        # true_percentage outside [threshold_low, threshold_high] is flagged.
        self.threshold_low = 1.0
        self.threshold_high = 99.0

    def run(self, column_name: str, binary_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if binary_metrics is None:
            return None

        true_pct = binary_metrics.true_percentage
        if self.threshold_low <= true_pct <= self.threshold_high:
            return self.create_result(
                column_name, True,
                f"Acceptable binary balance: {true_pct}% true values",
                {"true_percentage": true_pct},
            )

        return self.create_result(
            column_name, False,
            f"Severe binary imbalance: {true_pct}% true values (< {self.threshold_low}% or > {self.threshold_high}%)",
            {"true_percentage": true_pct, "balance_ratio": binary_metrics.balance_ratio},
            "Consider class balancing techniques or check if column should be binary.",
        )
1035
+
1036
+
1037
class BinaryAllSameValueCheck(QualityCheck):
    """BN003: fail (HIGH) when a binary column collapses to a single distinct value."""

    def __init__(self):
        super().__init__("BN003", "Binary All Same Value", Severity.HIGH)

    def run(self, column_name: str, universal_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if universal_metrics is None:
            return None

        distinct_total = universal_metrics.distinct_count
        if distinct_total != 1:
            return self.create_result(
                column_name, True,
                f"Binary column has {distinct_total} distinct values",
                {"distinct_count": distinct_total},
            )

        return self.create_result(
            column_name, False,
            f"Binary column has only 1 distinct value: {universal_metrics.most_common_value}",
            {"distinct_count": 1, "value": universal_metrics.most_common_value},
            "Binary column provides no information. Consider removing.",
        )
1060
+
1061
+
1062
class BinaryUnexpectedValuesCheck(QualityCheck):
    """BN004: fail (HIGH) when a binary column contains values outside the common true/false encodings."""

    def __init__(self):
        super().__init__("BN004", "Binary Unexpected Values", Severity.HIGH)

    def run(self, column_name: str, binary_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if binary_metrics is None:
            return None

        values_found = binary_metrics.values_found
        # Recognized encodings; note 0/False and 1/True hash equal in Python,
        # so boolean values match the numeric entries either way.
        expected_values = {0, 1, 0.0, 1.0, True, False, "0", "1", "yes", "Yes", "YES", "no", "No", "NO",
                           "true", "True", "TRUE", "false", "False", "FALSE", "y", "Y", "n", "N"}

        unexpected = [v for v in values_found if v not in expected_values]
        if not unexpected:
            return self.create_result(
                column_name, True,
                "Binary column contains only expected values",
                {"values_found": values_found},
            )

        return self.create_result(
            column_name, False,
            f"Binary column contains unexpected values: {unexpected[:5]}",
            {"unexpected_values": unexpected[:5], "values_found": values_found},
            "Standardize binary values to 0/1 or True/False format.",
        )
1089
+
1090
+
1091
class DatetimeFormatInconsistentCheck(QualityCheck):
    """DT003: fail (MEDIUM) when fewer than 95% of values match the detected datetime format."""

    def __init__(self):
        super().__init__("DT003", "Datetime Format Inconsistent", Severity.MEDIUM)
        # Percentage of values matching the detected format; below this fails.
        self.threshold = 95.0

    def run(self, column_name: str, datetime_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when format consistency is unavailable."""
        if datetime_metrics is None or datetime_metrics.format_consistency is None:
            return None

        consistency = datetime_metrics.format_consistency
        details = {"format_consistency": consistency, "format_detected": datetime_metrics.format_detected}

        if consistency >= self.threshold:
            return self.create_result(
                column_name, True,
                f"Datetime format consistent: {consistency}% match format '{datetime_metrics.format_detected}'",
                details,
            )

        return self.create_result(
            column_name, False,
            f"Datetime format inconsistent: {consistency}% match format '{datetime_metrics.format_detected}' (< {self.threshold}%)",
            details,
            "Standardize datetime format during data loading or preprocessing.",
        )
1115
+
1116
+
1117
class DatetimeMixedTimezonesCheck(QualityCheck):
    """DT004: fail (MEDIUM) when the datetime column mixes timezones."""

    def __init__(self):
        super().__init__("DT004", "Mixed Timezones", Severity.MEDIUM)

    def run(self, column_name: str, datetime_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if datetime_metrics is None:
            return None

        if datetime_metrics.timezone_consistent:
            return self.create_result(
                column_name, True,
                "Timezones are consistent",
                {"timezone_consistent": True},
            )

        return self.create_result(
            column_name, False,
            "Mixed timezones detected in datetime column",
            {"timezone_consistent": False},
            "Convert all datetimes to a single timezone (e.g., UTC) for consistency.",
        )
1140
+
1141
+
1142
class DatetimeInvalidDatesCheck(QualityCheck):
    """Flags values in a (not yet typed) datetime column that cannot be parsed as dates."""

    def __init__(self):
        super().__init__("DT005", "Invalid Dates", Severity.CRITICAL)

    def run(self, column_name: str, series: pd.Series, universal_metrics: Any) -> Optional[QualityCheckResult]:
        """Count unparseable date values in `series`.

        `universal_metrics` is accepted for signature compatibility with the
        other checks but is not used here.
        Returns None when no series is provided.
        """
        if series is None:
            return None

        clean_series = series.dropna()
        invalid_count = 0

        # Columns already stored as datetime64 cannot contain unparseable values,
        # so only string/object columns need parsing.
        if not is_datetime64_any_dtype(clean_series) and len(clean_series) > 0:
            # Vectorized parse: unparseable values become NaT instead of raising.
            # This replaces the original per-value try/except loop, which paid a
            # Python-level pd.to_datetime call (and exception) for every row.
            parsed = pd.to_datetime(clean_series, format='mixed', errors='coerce')
            invalid_count = int(parsed.isna().sum())

        if invalid_count > 0:
            # Percentage is computed against the full series length (incl. nulls),
            # matching the original behavior.
            invalid_pct = (invalid_count / len(series)) * 100 if len(series) > 0 else 0.0
            return self.create_result(
                column_name, False,
                f"Column contains {invalid_count} invalid dates ({invalid_pct:.2f}%)",
                {"invalid_count": invalid_count, "invalid_percentage": invalid_pct},
                "CRITICAL: Fix or remove invalid dates before processing."
            )
        return self.create_result(
            column_name, True,
            "No invalid dates detected",
            {"invalid_count": 0}
        )
1174
+
1175
+
1176
class DatetimeUnrealisticRangeCheck(QualityCheck):
    """Flags datetime columns whose min-to-max span is implausibly long."""

    def __init__(self):
        super().__init__("DT006", "Unrealistic Date Range", Severity.MEDIUM)
        # Spans longer than this many years usually indicate placeholder dates.
        self.threshold_years = 50

    def run(self, column_name: str, datetime_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when the range metric is unavailable.

        The `date_range_days is None` guard mirrors DT003's handling of a missing
        metric attribute; the original raised TypeError on the division below.
        """
        if datetime_metrics is None or datetime_metrics.date_range_days is None:
            return None

        date_range_days = datetime_metrics.date_range_days
        date_range_years = date_range_days / 365.25  # 365.25 accounts for leap years

        if date_range_years > self.threshold_years:
            return self.create_result(
                column_name, False,
                f"Unrealistic date range: {date_range_years:.1f} years (> {self.threshold_years} years)",
                {"date_range_days": date_range_days, "date_range_years": round(date_range_years, 1),
                 "min_date": datetime_metrics.min_date, "max_date": datetime_metrics.max_date},
                "Review min/max dates for data quality issues or placeholder values."
            )
        return self.create_result(
            column_name, True,
            f"Acceptable date range: {date_range_years:.1f} years",
            {"date_range_days": date_range_days, "date_range_years": round(date_range_years, 1)}
        )
1202
+
1203
+
1204
class VeryHighCardinalityCheck(QualityCheck):
    """Flags categorical columns with an extremely large number of distinct categories."""

    def __init__(self):
        super().__init__("CN001", "Very High Cardinality", Severity.HIGH)
        # Distinct-category count above which the column fails this check.
        self.threshold = 100

    def run(self, column_name: str, categorical_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if categorical_metrics is None:
            return None

        n_categories = categorical_metrics.cardinality
        details = {"cardinality": n_categories}

        if n_categories <= self.threshold:
            return self.create_result(
                column_name, True,
                f"Acceptable cardinality: {n_categories} unique categories",
                details
            )
        return self.create_result(
            column_name, False,
            f"Very high cardinality: {n_categories} unique categories (> {self.threshold})",
            details,
            f"Consider using {categorical_metrics.encoding_recommendation} encoding or treating as high cardinality feature."
        )
1228
+
1229
+
1230
class HighCardinalityCategoricalCheck(QualityCheck):
    """Flags categorical columns with a high (but not extreme) number of categories."""

    def __init__(self):
        super().__init__("CN002", "High Cardinality Categorical", Severity.MEDIUM)
        # Lower bar than CN001 (100): this is the "worth reviewing" tier.
        self.threshold = 50

    def run(self, column_name: str, categorical_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if categorical_metrics is None:
            return None

        n_categories = categorical_metrics.cardinality
        details = {"cardinality": n_categories}

        if n_categories <= self.threshold:
            return self.create_result(
                column_name, True,
                f"Acceptable cardinality: {n_categories} unique categories",
                details
            )
        return self.create_result(
            column_name, False,
            f"High cardinality: {n_categories} unique categories (> {self.threshold})",
            details,
            f"Consider using {categorical_metrics.encoding_recommendation} encoding."
        )
1254
+
1255
+
1256
class ManyRareCategoriesCheck(QualityCheck):
    """Flags columns with many categories that each occur in < 1% of rows."""

    def __init__(self):
        super().__init__("CN003", "Many Rare Categories", Severity.MEDIUM)
        # Number of rare categories above which the column fails.
        self.threshold = 10

    def run(self, column_name: str, categorical_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if categorical_metrics is None:
            return None

        n_rare = categorical_metrics.rare_category_count

        if n_rare <= self.threshold:
            return self.create_result(
                column_name, True,
                f"Acceptable rare categories: {n_rare} categories",
                {"rare_category_count": n_rare}
            )
        return self.create_result(
            column_name, False,
            f"Many rare categories: {n_rare} categories with < 1% frequency (> {self.threshold})",
            # Only a sample of rare category names is surfaced in the details.
            {"rare_category_count": n_rare, "rare_categories": categorical_metrics.rare_categories[:5]},
            "Consider grouping rare categories into 'Other' or using frequency encoding."
        )
1280
+
1281
+
1282
class SignificantRareVolumeCheck(QualityCheck):
    """Flags columns where a large share of rows falls into rare categories."""

    def __init__(self):
        super().__init__("CN004", "Significant Rare Category Volume", Severity.HIGH)
        # Percentage of rows in rare categories above which the column fails.
        self.threshold = 20.0

    def run(self, column_name: str, categorical_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if categorical_metrics is None:
            return None

        rare_share = categorical_metrics.rare_category_percentage

        if rare_share <= self.threshold:
            return self.create_result(
                column_name, True,
                f"Acceptable rare category volume: {rare_share}%",
                {"rare_category_percentage": rare_share}
            )
        return self.create_result(
            column_name, False,
            f"High rare category volume: {rare_share}% of rows in rare categories (> {self.threshold}%)",
            {"rare_category_percentage": rare_share, "rare_category_count": categorical_metrics.rare_category_count},
            "Group rare categories or use encoding that handles high cardinality (target encoding, embedding)."
        )
1306
+
1307
+
1308
class CaseInconsistencyCheck(QualityCheck):
    """Flags category values that differ only by letter case."""

    def __init__(self):
        super().__init__("CN005", "Case Inconsistency", Severity.LOW)

    def run(self, column_name: str, categorical_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if categorical_metrics is None:
            return None

        variations = categorical_metrics.case_variations

        # Empty list means the metric collector found no case-only duplicates.
        if not variations:
            return self.create_result(
                column_name, True,
                "No case inconsistency detected",
                {"case_variations": []}
            )
        return self.create_result(
            column_name, False,
            f"Case inconsistency detected: {len(variations)} variations found",
            {"case_variations": variations},
            "Standardize case (e.g., lowercase all values) during preprocessing."
        )
1331
+
1332
+
1333
class WhitespaceIssuesCheck(QualityCheck):
    """Flags category values carrying leading or trailing whitespace."""

    def __init__(self):
        super().__init__("CN006", "Whitespace Issues", Severity.LOW)

    def run(self, column_name: str, categorical_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if categorical_metrics is None:
            return None

        offenders = categorical_metrics.whitespace_issues

        if not offenders:
            return self.create_result(
                column_name, True,
                "No whitespace issues detected",
                {"whitespace_issues": []}
            )
        return self.create_result(
            column_name, False,
            f"Whitespace issues detected: {len(offenders)} values with leading/trailing spaces",
            {"whitespace_issues": offenders},
            "Strip leading/trailing whitespace during preprocessing."
        )
1356
+
1357
+
1358
class SingleCategoryCheck(QualityCheck):
    """Flags categorical columns that contain exactly one category (constant)."""

    def __init__(self):
        super().__init__("CN007", "Single Category Only", Severity.HIGH)

    def run(self, column_name: str, categorical_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if categorical_metrics is None:
            return None

        n_categories = categorical_metrics.cardinality

        if n_categories != 1:
            return self.create_result(
                column_name, True,
                f"Column has {n_categories} categories",
                {"cardinality": n_categories}
            )
        # With exactly one category, top_categories[0] is the (value, count) pair.
        only_category = categorical_metrics.top_categories[0][0]
        return self.create_result(
            column_name, False,
            f"Column has only 1 category: {only_category}",
            {"cardinality": 1, "category": only_category},
            "Remove constant categorical column - provides no information for modeling."
        )
1381
+
1382
+
1383
class PossibleTyposCheck(QualityCheck):
    """Flags pairs of category values that are suspiciously similar (likely typos)."""

    MAX_PAIRS = 5     # stop after reporting this many suspicious pairs
    MAX_VALUES = 100  # bound on values compared, keeping the O(n^2) scan cheap
    MIN_LENGTH = 4    # skip short values, where high similarity is coincidental

    def __init__(self):
        super().__init__("CN008", "Possible Typos Detected", Severity.MEDIUM)

    def run(self, column_name: str, categorical_metrics: Any) -> Optional[QualityCheckResult]:
        """Compare case-folded category values pairwise with difflib.

        Returns None when metrics are unavailable. The original wrapped the
        difflib import in a try/except ImportError, but difflib is stdlib and
        always importable, so that guard is dropped.
        """
        if categorical_metrics is None:
            return None

        from difflib import SequenceMatcher
        from itertools import combinations

        # Coerce to str so non-string categories (e.g. numeric codes) don't
        # crash len()/.lower() below.
        unique_values = [str(v) for v in list(categorical_metrics.value_counts.keys())[:self.MAX_VALUES]]
        similar_pairs = []

        for val1, val2 in combinations(unique_values, 2):
            if len(val1) >= self.MIN_LENGTH and len(val2) >= self.MIN_LENGTH:
                ratio = SequenceMatcher(None, val1.lower(), val2.lower()).ratio()
                # Strictly below 1.0: exact case-variants are CN005's job.
                if 0.8 < ratio < 1.0:
                    similar_pairs.append(f"{val1} ~ {val2}")
                    if len(similar_pairs) >= self.MAX_PAIRS:
                        break

        if similar_pairs:
            return self.create_result(
                column_name, False,
                f"Possible typos detected: {len(similar_pairs)} similar value pairs found",
                {"similar_pairs": similar_pairs},
                "Review similar values for potential typos and standardize."
            )
        return self.create_result(
            column_name, True,
            "No obvious typos detected",
            {"similar_pairs": []}
        )
1423
+
1424
+
1425
class IdentifierDuplicatesCheck(QualityCheck):
    """Flags identifier columns that contain duplicate values (must be unique)."""

    def __init__(self):
        super().__init__("ID001", "Identifier Has Duplicates", Severity.CRITICAL)

    def run(self, column_name: str, identifier_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if identifier_metrics is None:
            return None

        n_dupes = identifier_metrics.duplicate_count

        if n_dupes == 0:
            return self.create_result(
                column_name, True,
                "Identifier column is unique",
                {"duplicate_count": 0}
            )
        return self.create_result(
            column_name, False,
            f"Identifier column has {n_dupes} duplicate values",
            # Only a sample of the duplicated values is included in details.
            {"duplicate_count": n_dupes,
             "duplicate_values": identifier_metrics.duplicate_values[:5]},
            "CRITICAL: Identifiers must be unique. Investigate and resolve duplicates or reconsider column type."
        )
1447
+
1448
+
1449
class IdentifierFormatCheck(QualityCheck):
    """Flags identifier columns whose values don't match one consistent pattern."""

    def __init__(self):
        super().__init__("ID002", "Identifier Format Inconsistent", Severity.MEDIUM)
        # Minimum percentage of values matching the detected pattern to pass.
        self.threshold = 95.0

    def run(self, column_name: str, identifier_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when the metric is unavailable."""
        if identifier_metrics is None or identifier_metrics.format_consistency is None:
            return None

        consistency = identifier_metrics.format_consistency
        pattern = identifier_metrics.format_pattern
        details = {"format_consistency": consistency, "format_pattern": pattern}

        if consistency >= self.threshold:
            return self.create_result(
                column_name, True,
                f"Identifier format consistent: {consistency}% match pattern '{pattern}'",
                details
            )
        return self.create_result(
            column_name, False,
            f"Identifier format inconsistent: {consistency}% match pattern '{pattern}' (< {self.threshold}%)",
            details,
            "Standardize identifier format or investigate data quality issues."
        )
1473
+
1474
+
1475
class IdentifierNullCheck(QualityCheck):
    """Flags identifier columns that contain null values."""

    def __init__(self):
        super().__init__("ID003", "Identifier Contains Nulls", Severity.HIGH)

    def run(self, column_name: str, universal_metrics: Any) -> Optional[QualityCheckResult]:
        """Return a pass/fail result, or None when metrics are unavailable."""
        if universal_metrics is None:
            return None

        n_null = universal_metrics.null_count

        if n_null == 0:
            return self.create_result(
                column_name, True,
                "Identifier column has no null values",
                {"null_count": 0}
            )
        return self.create_result(
            column_name, False,
            f"Identifier column contains {n_null} null values ({universal_metrics.null_percentage}%)",
            {"null_count": n_null, "null_percentage": universal_metrics.null_percentage},
            "Identifiers should not contain nulls. Investigate missing identifiers or data quality issues."
        )
1496
+
1497
+
1498
class QualityCheckRegistry:
    """Registry mapping stable check IDs to their QualityCheck classes.

    ID prefixes group families of checks: FQ* (feature quality), CAT*/CN*
    (categorical), NUM*/NC* (numeric), DT* (datetime), TX* (text), TG*
    (target), ID* (identifier), BN* (binary), LEAK* (leakage).
    """

    # Check ID -> class. Classes (not instances) are stored; fresh instances
    # are created by get_check()/get_all_checks() on each call.
    _checks = {
        "FQ001": MissingValueCheck,
        "FQ003": ConstantFeatureCheck,
        "FQ005": HighOutliersCheck,
        "FQ008": UnknownColumnTypeCheck,
        "FQ009": VeryHighCardinalityNominalCheck,
        "FQ011": AllValuesOutliersCheck,
        "FQ012": UnrealisticDateRangeCheck,
        "CAT001": HighCardinalityCheck,
        "NUM001": LowCardinalityCheck,
        "CAT002": ImbalancedTargetCheck,
        "NUM002": SkewnessCheck,
        "NUM003": OutlierCheck,
        "NUM004": ZeroInflationCheck,
        "LEAK001": IdentifierLeakageCheck,
        "DT001": DatetimeFutureLeakageCheck,
        "DT002": PlaceholderDateCheck,
        "CAT003": RareCategoryCheck,
        "CAT004": UnknownCategoryCheck,
        "TX001": PIIDetectedCheck,
        "TX002": EmptyTextCheck,
        "TX003": ShortTextCheck,
        "TX004": HighUniquenessTextCheck,
        "NC001": ExtremeOutliersCheck,
        "NC002": ModerateOutliersCheck,
        "NC003": HighSkewnessCheck,
        "NC004": NumericZeroInflationCheck,
        "NC005": UnexpectedNegativesCheck,
        "NC006": InfiniteValuesCheck,
        "NC007": ConstantValueCheck,
        "NC008": SuspiciousPrecisionCheck,
        "TG001": TargetNullCheck,
        "TG002": TargetSevereImbalanceCheck,
        "TG003": TargetModerateImbalanceCheck,
        "TG004": TargetUnexpectedClassesCheck,
        "TG005": SingleClassTargetCheck,
        "ID001": IdentifierDuplicatesCheck,
        "ID002": IdentifierFormatCheck,
        "ID003": IdentifierNullCheck,
        "CN001": VeryHighCardinalityCheck,
        "CN002": HighCardinalityCategoricalCheck,
        "CN003": ManyRareCategoriesCheck,
        "CN004": SignificantRareVolumeCheck,
        "CN005": CaseInconsistencyCheck,
        "CN006": WhitespaceIssuesCheck,
        "CN007": SingleCategoryCheck,
        "CN008": PossibleTyposCheck,
        "DT003": DatetimeFormatInconsistentCheck,
        "DT004": DatetimeMixedTimezonesCheck,
        "DT005": DatetimeInvalidDatesCheck,
        "DT006": DatetimeUnrealisticRangeCheck,
        "BN001": BinaryNotBinaryCheck,
        "BN002": BinarySevereImbalanceCheck,
        "BN003": BinaryAllSameValueCheck,
        "BN004": BinaryUnexpectedValuesCheck,
    }

    @classmethod
    def get_check(cls, check_id: str):
        """Return a fresh instance of the check with this ID, or None if unknown."""
        check_class = cls._checks.get(check_id)
        return check_class() if check_class else None

    @classmethod
    def get_all_checks(cls):
        """Return fresh instances of every registered check."""
        return [check_class() for check_class in cls._checks.values()]

    @classmethod
    def get_checks_for_column_type(cls, column_type: ColumnType):
        """Return instantiated checks applicable to the given column type.

        Missing-value and constant-feature checks apply to every column type;
        the rest are selected by the column's semantic type. Checks that need
        extra configuration or raw series access (TG004, NC005, NC008) are
        deliberately excluded here.
        """
        checks = []

        # Universal checks: run for every column type.
        checks.append(MissingValueCheck())
        checks.append(ConstantFeatureCheck())

        if column_type == ColumnType.IDENTIFIER:
            checks.append(IdentifierLeakageCheck())
            checks.append(IdentifierDuplicatesCheck())
            checks.append(IdentifierFormatCheck())
            checks.append(IdentifierNullCheck())

        elif column_type == ColumnType.TARGET:
            checks.append(TargetNullCheck())
            checks.append(SingleClassTargetCheck())
            checks.append(TargetSevereImbalanceCheck())
            checks.append(TargetModerateImbalanceCheck())
            checks.append(ImbalancedTargetCheck())
            # Note: TG004 (TargetUnexpectedClassesCheck) requires expected_classes configuration

        elif column_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]:
            checks.append(LowCardinalityCheck())
            checks.append(ExtremeOutliersCheck())  # NC001
            checks.append(ModerateOutliersCheck())  # NC002
            checks.append(HighSkewnessCheck())  # NC003
            checks.append(NumericZeroInflationCheck())  # NC004
            # NC005 (UnexpectedNegativesCheck) requires configuration
            checks.append(InfiniteValuesCheck())  # NC006
            checks.append(ConstantValueCheck())  # NC007
            # NC008 (SuspiciousPrecisionCheck) requires series access
            # NOTE(review): the next three are the legacy NUM* checks; they overlap
            # with the NC* skew/outlier/zero-inflation checks above — presumably a
            # deliberate legacy+new pairing, but worth confirming. Original comments
            # labelled them FQ006/FQ007/FQ008, which contradicts the _checks dict.
            checks.append(SkewnessCheck())  # NUM002
            checks.append(OutlierCheck())  # NUM003
            checks.append(ZeroInflationCheck())  # NUM004

        elif column_type in [ColumnType.CATEGORICAL_NOMINAL, ColumnType.CATEGORICAL_ORDINAL, ColumnType.CATEGORICAL_CYCLICAL]:
            checks.append(HighCardinalityCheck())
            checks.append(RareCategoryCheck())
            checks.append(UnknownCategoryCheck())
            checks.append(VeryHighCardinalityCheck())
            checks.append(HighCardinalityCategoricalCheck())
            checks.append(ManyRareCategoriesCheck())
            checks.append(SignificantRareVolumeCheck())
            checks.append(CaseInconsistencyCheck())
            checks.append(WhitespaceIssuesCheck())
            checks.append(SingleCategoryCheck())
            checks.append(PossibleTyposCheck())

        elif column_type == ColumnType.DATETIME:
            checks.append(DatetimeFutureLeakageCheck())
            checks.append(PlaceholderDateCheck())
            checks.append(DatetimeFormatInconsistentCheck())
            checks.append(DatetimeMixedTimezonesCheck())
            checks.append(DatetimeUnrealisticRangeCheck())

        elif column_type == ColumnType.BINARY:
            checks.append(BinaryNotBinaryCheck())
            checks.append(BinarySevereImbalanceCheck())
            checks.append(BinaryAllSameValueCheck())
            checks.append(BinaryUnexpectedValuesCheck())

        elif column_type == ColumnType.TEXT:
            checks.append(PIIDetectedCheck())
            checks.append(EmptyTextCheck())
            checks.append(ShortTextCheck())
            checks.append(HighUniquenessTextCheck())

        return checks