churnkit-0.75.0a1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
customer_retention/stages/validation/quality_scorer.py
@@ -0,0 +1,544 @@
+"""
+Quality scoring for data exploration.
+
+This module provides a comprehensive quality scorer that calculates
+data quality scores based on validation results from the exploration phase.
+"""
+
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Any, Dict, List, Optional
+
+from .data_validators import DateLogicResult, DuplicateResult, RangeValidationResult
+from .timeseries_detector import TimeSeriesCharacteristics, TimeSeriesValidationResult
+
+
+class QualityLevel(Enum):
+    """Quality level classifications."""
+    EXCELLENT = "excellent"  # 90-100
+    GOOD = "good"  # 70-89
+    FAIR = "fair"  # 50-69
+    POOR = "poor"  # 0-49
+
+
+@dataclass
+class QualityScoreResult:
+    """Result of quality score calculation."""
+    overall_score: float
+    quality_level: QualityLevel
+    components: Dict[str, float]
+    component_weights: Dict[str, float]
+    issues: List[str] = field(default_factory=list)
+    recommendations: List[str] = field(default_factory=list)
+
+    # Time series specific
+    is_time_series: bool = False
+    dataset_type: str = "snapshot"
+    timeseries_characteristics: Optional[Dict[str, Any]] = None
+    timeseries_quality: Optional[Dict[str, Any]] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dictionary for serialization."""
+        result = {
+            "overall_score": round(self.overall_score, 1),
+            "quality_level": self.quality_level.value,
+            "components": {k: round(v, 1) for k, v in self.components.items()},
+            "component_weights": self.component_weights,
+            "issues": self.issues,
+            "recommendations": self.recommendations,
+            "is_time_series": self.is_time_series,
+            "dataset_type": self.dataset_type,
+        }
+        if self.timeseries_characteristics:
+            result["timeseries_characteristics"] = self.timeseries_characteristics
+        if self.timeseries_quality:
+            result["timeseries_quality"] = self.timeseries_quality
+        return result
+
+    def __str__(self) -> str:
+        """Human-readable string representation."""
+        lines = [
+            f"Quality Score: {self.overall_score:.1f}/100 ({self.quality_level.value})",
+            f"Dataset Type: {self.dataset_type}",
+            "",
+            "Components:"
+        ]
+        for component, score in self.components.items():
+            weight = self.component_weights.get(component, 0) * 100
+            lines.append(f" - {component}: {score:.1f} (weight: {weight:.0f}%)")
+
+        if self.is_time_series and self.timeseries_quality:
+            lines.append("")
+            lines.append("Time Series Quality:")
+            lines.append(f" - Temporal Score: {self.timeseries_quality.get('temporal_quality_score', 'N/A')}")
+
+        if self.issues:
+            lines.append("")
+            lines.append("Issues:")
+            for issue in self.issues:
+                lines.append(f" - {issue}")
+
+        return "\n".join(lines)
+
+
+@dataclass
+class ColumnFindings:
+    """Minimal column findings interface for quality scoring."""
+    inferred_type: Any  # Should have .value attribute
+    universal_metrics: Dict[str, Any]
+
+
+@dataclass
+class ExplorationFindings:
+    """Minimal exploration findings interface for quality scoring."""
+    row_count: int
+    column_count: int
+    columns: Dict[str, ColumnFindings]
+    target_column: Optional[str] = None
+
+
+class QualityScorer:
+    """
+    Calculate comprehensive data quality scores based on validation results.
+
+    The quality score is composed of four components:
+    - Completeness: Percentage of non-null values
+    - Validity: Values within expected ranges and formats
+    - Consistency: No conflicting duplicates or logical violations
+    - Uniqueness: Identifier columns have appropriate cardinality
+
+    Each component can be weighted differently based on use case.
+
+    Example
+    -------
+    >>> scorer = QualityScorer()
+    >>> result = scorer.calculate(
+    ...     findings=exploration_findings,
+    ...     duplicate_result=dup_result,
+    ...     date_result=date_result,
+    ...     range_results=range_results
+    ... )
+    >>> print(f"Quality Score: {result.overall_score:.1f}/100")
+    """
+
+    DEFAULT_WEIGHTS = {
+        "completeness": 0.25,
+        "validity": 0.25,
+        "consistency": 0.25,
+        "uniqueness": 0.25
+    }
+
+    def __init__(self, weights: Optional[Dict[str, float]] = None):
+        """
+        Initialize the quality scorer.
+
+        Parameters
+        ----------
+        weights : Dict[str, float], optional
+            Custom weights for each component. Must sum to 1.0.
+            Keys: 'completeness', 'validity', 'consistency', 'uniqueness'
+        """
+        self.weights = weights or self.DEFAULT_WEIGHTS.copy()
+        self._validate_weights()
+
+    def _validate_weights(self) -> None:
+        """Validate that weights sum to 1.0 and all components are present."""
+        required = set(self.DEFAULT_WEIGHTS.keys())
+        provided = set(self.weights.keys())
+
+        missing = required - provided
+        if missing:
+            raise ValueError(f"Missing weight components: {missing}")
+
+        total = sum(self.weights.values())
+        if not (0.99 <= total <= 1.01):  # Allow small floating point variance
+            raise ValueError(f"Weights must sum to 1.0, got {total}")
+
+    def calculate(
+        self,
+        findings: ExplorationFindings,
+        duplicate_result: Optional[DuplicateResult] = None,
+        date_result: Optional[DateLogicResult] = None,
+        range_results: Optional[List[RangeValidationResult]] = None,
+        timeseries_characteristics: Optional[TimeSeriesCharacteristics] = None,
+        timeseries_validation: Optional[TimeSeriesValidationResult] = None
+    ) -> QualityScoreResult:
+        """
+        Calculate comprehensive quality score.
+
+        Parameters
+        ----------
+        findings : ExplorationFindings
+            Results from data exploration (column info, row counts, etc.)
+        duplicate_result : DuplicateResult, optional
+            Results from duplicate validation
+        date_result : DateLogicResult, optional
+            Results from date logic validation
+        range_results : List[RangeValidationResult], optional
+            Results from value range validations
+        timeseries_characteristics : TimeSeriesCharacteristics, optional
+            Results from time series detection
+        timeseries_validation : TimeSeriesValidationResult, optional
+            Results from time series validation
+
+        Returns
+        -------
+        QualityScoreResult
+            Comprehensive quality score with component breakdown
+        """
+        components = {}
+        issues = []
+        recommendations = []
+
+        # Calculate each component
+        components["completeness"], comp_issues = self._calculate_completeness(findings)
+        issues.extend(comp_issues)
+
+        components["validity"], val_issues = self._calculate_validity(range_results)
+        issues.extend(val_issues)
+
+        components["consistency"], cons_issues = self._calculate_consistency(
+            duplicate_result, date_result
+        )
+        issues.extend(cons_issues)
+
+        components["uniqueness"], uniq_issues = self._calculate_uniqueness(findings)
+        issues.extend(uniq_issues)
+
+        # Handle time series specific scoring
+        is_time_series = False
+        dataset_type = "snapshot"
+        ts_characteristics_dict = None
+        ts_quality_dict = None
+
+        if timeseries_characteristics is not None:
+            is_time_series = timeseries_characteristics.is_time_series
+            dataset_type = timeseries_characteristics.dataset_type.value
+            ts_characteristics_dict = timeseries_characteristics.to_dict()
+
+            if is_time_series and timeseries_validation is not None:
+                # Add temporal quality as a component for time series data
+                components["temporal"], ts_issues = self._calculate_temporal_quality(
+                    timeseries_validation
+                )
+                issues.extend(ts_issues)
+                ts_quality_dict = timeseries_validation.to_dict()
+
+                # Adjust weights for time series data
+                adjusted_weights = self._adjust_weights_for_timeseries()
+            else:
+                adjusted_weights = self.weights
+        else:
+            adjusted_weights = self.weights
+
+        # Calculate weighted overall score
+        overall_score = 0.0
+        for comp, score in components.items():
+            weight = adjusted_weights.get(comp, 0)
+            overall_score += score * weight
+
+        # Determine quality level
+        quality_level = self._get_quality_level(overall_score)
+
+        # Generate recommendations based on issues
+        recommendations = self._generate_recommendations(
+            components, issues, is_time_series
+        )
+
+        return QualityScoreResult(
+            overall_score=overall_score,
+            quality_level=quality_level,
+            components=components,
+            component_weights=adjusted_weights,
+            issues=issues,
+            recommendations=recommendations,
+            is_time_series=is_time_series,
+            dataset_type=dataset_type,
+            timeseries_characteristics=ts_characteristics_dict,
+            timeseries_quality=ts_quality_dict
+        )
+
+    def _calculate_completeness(
+        self,
+        findings: ExplorationFindings
+    ) -> tuple[float, List[str]]:
+        """
+        Calculate completeness score based on missing values.
+
+        Returns
+        -------
+        tuple[float, List[str]]
+            Score (0-100) and list of issues found
+        """
+        issues = []
+
+        if findings.row_count == 0 or findings.column_count == 0:
+            return 100.0, issues
+
+        total_cells = findings.row_count * findings.column_count
+        missing_cells = 0
+        columns_with_high_missing = []
+
+        for col_name, col in findings.columns.items():
+            null_pct = col.universal_metrics.get("null_percentage", 0)
+            missing_cells += (null_pct / 100) * findings.row_count
+
+            if null_pct > 20:
+                columns_with_high_missing.append((col_name, null_pct))
+
+        completeness = 100 * (1 - missing_cells / total_cells)
+
+        # Add issues for high missing columns
+        for col_name, pct in columns_with_high_missing[:3]:  # Top 3
+            issues.append(f"Column '{col_name}' has {pct:.1f}% missing values")
+
+        return max(0, min(100, completeness)), issues
+
+    def _calculate_validity(
+        self,
+        range_results: Optional[List[RangeValidationResult]]
+    ) -> tuple[float, List[str]]:
+        """
+        Calculate validity score based on range validation results.
+
+        Returns
+        -------
+        tuple[float, List[str]]
+            Score (0-100) and list of issues found
+        """
+        issues = []
+
+        if not range_results:
+            return 100.0, issues  # No rules defined, assume valid
+
+        total_checked = 0
+        total_invalid = 0
+
+        for result in range_results:
+            total_checked += result.total_values
+            total_invalid += result.invalid_values
+
+            if result.invalid_percentage > 5:
+                issues.append(
+                    f"Column '{result.column_name}' has {result.invalid_percentage:.1f}% "
+                    f"values outside {result.rule_type} range"
+                )
+
+        if total_checked == 0:
+            return 100.0, issues
+
+        validity = 100 * (1 - total_invalid / total_checked)
+
+        return max(0, min(100, validity)), issues
+
+    def _calculate_consistency(
+        self,
+        duplicate_result: Optional[DuplicateResult],
+        date_result: Optional[DateLogicResult]
+    ) -> tuple[float, List[str]]:
+        """
+        Calculate consistency score based on duplicates and date logic.
+
+        Returns
+        -------
+        tuple[float, List[str]]
+            Score (0-100) and list of issues found
+        """
+        issues = []
+        penalties = 0
+
+        # Duplicate penalties
+        if duplicate_result is not None:
+            dup_pct = duplicate_result.duplicate_percentage
+
+            if dup_pct > 10:
+                penalties += 30
+                issues.append(f"High duplicate rate: {dup_pct:.1f}%")
+            elif dup_pct > 5:
+                penalties += 20
+                issues.append(f"Moderate duplicate rate: {dup_pct:.1f}%")
+            elif dup_pct > 1:
+                penalties += 10
+
+            # Value conflicts are more severe
+            if duplicate_result.has_value_conflicts:
+                penalties += 20
+                conflict_cols = ", ".join(duplicate_result.conflict_columns[:3])
+                issues.append(f"Value conflicts in duplicate records: {conflict_cols}")
+
+        # Date logic penalties
+        if date_result is not None:
+            invalid_pct = date_result.invalid_percentage
+
+            if invalid_pct > 10:
+                penalties += 20
+                issues.append(f"High date logic violation rate: {invalid_pct:.1f}%")
+            elif invalid_pct > 5:
+                penalties += 10
+                issues.append(f"Moderate date logic violations: {invalid_pct:.1f}%")
+            elif invalid_pct > 1:
+                penalties += 5
+
+        consistency = max(0, 100 - penalties)
+
+        return consistency, issues
+
+    def _calculate_uniqueness(
+        self,
+        findings: ExplorationFindings
+    ) -> tuple[float, List[str]]:
+        """
+        Calculate uniqueness score for identifier columns.
+
+        Returns
+        -------
+        tuple[float, List[str]]
+            Score (0-100) and list of issues found
+        """
+        issues = []
+        penalties = 0
+        identifier_count = 0
+
+        for col_name, col in findings.columns.items():
+            col_type = getattr(col.inferred_type, 'value', str(col.inferred_type))
+
+            if col_type in ('identifier', 'id'):
+                identifier_count += 1
+                distinct_pct = col.universal_metrics.get("distinct_percentage", 100)
+
+                if distinct_pct < 90:
+                    penalties += 20
+                    issues.append(
+                        f"Identifier '{col_name}' has low uniqueness: {distinct_pct:.1f}%"
+                    )
+                elif distinct_pct < 95:
+                    penalties += 10
+
+        # If no identifiers found, full score
+        if identifier_count == 0:
+            return 100.0, issues
+
+        uniqueness = max(0, 100 - penalties)
+
+        return uniqueness, issues
+
+    def _get_quality_level(self, score: float) -> QualityLevel:
+        """Determine quality level from score."""
+        if score >= 90:
+            return QualityLevel.EXCELLENT
+        elif score >= 70:
+            return QualityLevel.GOOD
+        elif score >= 50:
+            return QualityLevel.FAIR
+        else:
+            return QualityLevel.POOR
+
+    def _calculate_temporal_quality(
+        self,
+        timeseries_validation: TimeSeriesValidationResult
+    ) -> tuple[float, List[str]]:
+        """
+        Calculate temporal quality score for time series data.
+
+        Returns
+        -------
+        tuple[float, List[str]]
+            Score (0-100) and list of issues found
+        """
+        issues = []
+
+        # Start with the temporal quality score from validation
+        score = timeseries_validation.temporal_quality_score
+
+        # Add issues from validation
+        issues.extend(timeseries_validation.issues)
+
+        # Add specific issues based on metrics
+        if timeseries_validation.entities_with_gaps > 0:
+            gap_rate = timeseries_validation.entities_with_gaps
+            if gap_rate > 10:
+                issues.append(f"High number of entities with gaps: {gap_rate}")
+
+        if timeseries_validation.total_duplicate_timestamps > 0:
+            issues.append(
+                f"Found {timeseries_validation.total_duplicate_timestamps} duplicate timestamps"
+            )
+
+        if timeseries_validation.entities_with_ordering_issues > 0:
+            issues.append(
+                f"{timeseries_validation.entities_with_ordering_issues} entities have "
+                "timestamps out of order"
+            )
+
+        return max(0, min(100, score)), issues
+
+    def _adjust_weights_for_timeseries(self) -> Dict[str, float]:
+        """
+        Adjust component weights for time series data.
+
+        When temporal quality is included, redistribute weights
+        to give appropriate importance to temporal aspects.
+        """
+        # For time series, include temporal as 20% and reduce others proportionally
+        temporal_weight = 0.20
+        reduction_factor = 1 - temporal_weight
+
+        adjusted = {
+            "completeness": self.weights["completeness"] * reduction_factor,
+            "validity": self.weights["validity"] * reduction_factor,
+            "consistency": self.weights["consistency"] * reduction_factor,
+            "uniqueness": self.weights["uniqueness"] * reduction_factor,
+            "temporal": temporal_weight
+        }
+
+        return adjusted
+
+    def _generate_recommendations(
+        self,
+        components: Dict[str, float],
+        issues: List[str],
+        is_time_series: bool = False
+    ) -> List[str]:
+        """Generate recommendations based on component scores and issues."""
+        recommendations = []
+
+        if components["completeness"] < 80:
+            recommendations.append(
+                "Review missing value imputation strategies before modeling"
+            )
+
+        if components["validity"] < 80:
+            recommendations.append(
+                "Investigate values outside expected ranges - may need cleaning or rule adjustment"
+            )
+
+        if components["consistency"] < 80:
+            recommendations.append(
+                "Resolve duplicate records and date logic violations before analysis"
+            )
+
+        if components["uniqueness"] < 80:
+            recommendations.append(
+                "Verify identifier columns - low uniqueness may indicate data issues"
+            )
+
+        # Time series specific recommendations
+        if is_time_series:
+            temporal_score = components.get("temporal", 100)
+            if temporal_score < 80:
+                recommendations.append(
+                    "Address temporal quality issues: gaps, duplicates, or ordering problems"
+                )
+
+            if all(score >= 80 for score in components.values()):
+                recommendations.append(
+                    "Time series data quality is good - proceed to temporal feature engineering"
+                )
+
+        else:
+            # Add general recommendation if score is good
+            if all(score >= 80 for score in components.values()):
+                recommendations.append(
+                    "Data quality is good - proceed to feature engineering"
+                )
+
+        return recommendations
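
Usage note: a minimal sketch of how the scorer above could be exercised against the lightweight ExplorationFindings/ColumnFindings dataclasses it defines. The import path is assumed from the file listing (customer_retention/stages/validation/quality_scorer.py); the column names and metric values are illustrative, not taken from the package.

# Hypothetical example (not part of the wheel): score a small snapshot dataset.
from customer_retention.stages.validation.quality_scorer import (
    ColumnFindings,
    ExplorationFindings,
    QualityScorer,
)

findings = ExplorationFindings(
    row_count=1000,
    column_count=2,
    columns={
        "customer_id": ColumnFindings(
            inferred_type="identifier",
            universal_metrics={"null_percentage": 0.0, "distinct_percentage": 99.8},
        ),
        "monthly_spend": ColumnFindings(
            inferred_type="numeric_continuous",
            universal_metrics={"null_percentage": 12.5},
        ),
    },
)

# Custom weights must cover all four base components and sum to 1.0,
# otherwise _validate_weights raises ValueError.
scorer = QualityScorer(weights={
    "completeness": 0.4,
    "validity": 0.2,
    "consistency": 0.2,
    "uniqueness": 0.2,
})
result = scorer.calculate(findings=findings)
print(result)            # human-readable summary via __str__
print(result.to_dict())  # JSON-serializable breakdown

With the default equal weights and a detected time series, _adjust_weights_for_timeseries scales each base component to 0.25 * 0.8 = 0.20 and assigns the remaining 0.20 to the temporal component.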
customer_retention/stages/validation/rule_generator.py
@@ -0,0 +1,57 @@
+"""Automatic validation rule generation from exploration findings."""
+
+from typing import Any, Dict
+
+from customer_retention.analysis.auto_explorer.findings import ColumnFinding, ExplorationFindings
+from customer_retention.core.config.column_config import ColumnType
+
+PERCENTAGE_PATTERNS = ["rate", "pct", "percent", "ratio"]
+SKIP_TYPES = [ColumnType.IDENTIFIER, ColumnType.DATETIME, ColumnType.TEXT,
+              ColumnType.CATEGORICAL_NOMINAL, ColumnType.CATEGORICAL_ORDINAL,
+              ColumnType.CATEGORICAL_CYCLICAL, ColumnType.UNKNOWN]
+
+
+class RuleGenerator:
+
+    @staticmethod
+    def for_column(col: ColumnFinding) -> Dict[str, Dict[str, Any]]:
+        if col.inferred_type in SKIP_TYPES:
+            return {}
+
+        if col.inferred_type == ColumnType.BINARY:
+            return {col.name: {"type": "binary", "valid_values": [0, 1]}}
+
+        if col.inferred_type == ColumnType.TARGET:
+            distinct = col.type_metrics.get("distinct_count", 0)
+            if distinct == 2:
+                return {col.name: {"type": "binary", "valid_values": [0, 1]}}
+            return {}
+
+        if col.inferred_type in [ColumnType.NUMERIC_CONTINUOUS, ColumnType.NUMERIC_DISCRETE]:
+            return RuleGenerator._numeric_rule(col)
+
+        return {}
+
+    @staticmethod
+    def _numeric_rule(col: ColumnFinding) -> Dict[str, Dict[str, Any]]:
+        name_lower = col.name.lower()
+        metrics = col.type_metrics
+        min_val = metrics.get("min")
+        max_val = metrics.get("max")
+
+        if any(p in name_lower for p in PERCENTAGE_PATTERNS):
+            if max_val is not None and max_val <= 1:
+                return {col.name: {"type": "percentage", "min": 0, "max": 1}}
+            return {col.name: {"type": "percentage", "min": 0, "max": 100}}
+
+        if min_val is not None and min_val >= 0:
+            return {col.name: {"type": "non_negative"}}
+
+        return {}
+
+    @staticmethod
+    def from_findings(findings: ExplorationFindings) -> Dict[str, Dict[str, Any]]:
+        rules = {}
+        for col in findings.columns.values():
+            rules.update(RuleGenerator.for_column(col))
+        return rules
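
For context, a sketch of the rule shapes RuleGenerator emits, traced through the branches above for three hypothetical columns (real ColumnFinding objects would come from the auto explorer's findings rather than be built by hand; the column names here are made up):

# Illustrative return value of RuleGenerator.from_findings(findings):
# each entry follows one branch of the code above.
expected_rules = {
    "has_autopay": {"type": "binary", "valid_values": [0, 1]},    # ColumnType.BINARY
    "discount_rate": {"type": "percentage", "min": 0, "max": 1},  # name matches "rate", observed max <= 1
    "support_calls": {"type": "non_negative"},                    # numeric with observed min >= 0
}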