churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,520 @@
1
+ import json
2
+ from pathlib import Path
3
+ from typing import Any, Dict, Optional
4
+
5
+ from customer_retention.core.compat import pd
6
+ from customer_retention.core.config import ColumnType
7
+
8
+ from .profile_result import ProfileResult
9
+
10
+
11
+ class ReportGenerator:
12
+ """Generate profiling reports in multiple formats (JSON, HTML, Markdown)."""
13
+
14
+ def __init__(self, profile: Optional[ProfileResult] = None):
15
+ self.profile = profile
16
+
17
+ def to_json(self, indent: int = 2) -> str:
18
+ """Generate JSON report from profile result."""
19
+ if self.profile is None:
20
+ raise ValueError("No profile set. Provide profile in constructor or set profile attribute.")
21
+
22
+ # Convert profile to dict using Pydantic's model_dump
23
+ report_dict = self.profile.model_dump()
24
+
25
+ return json.dumps(report_dict, indent=indent, default=str)
26
+
27
+ def save_json(self, filepath: str):
28
+ """Save JSON report to file."""
29
+ json_report = self.to_json()
30
+
31
+ with open(filepath, 'w') as f:
32
+ f.write(json_report)
33
+
34
+ def to_html(self) -> str:
35
+ """Generate HTML report from profile result."""
36
+ if self.profile is None:
37
+ raise ValueError("No profile set.")
38
+
39
+ summary = self.generate_executive_summary()
40
+
41
+ html = self._generate_html_template(summary)
42
+
43
+ return html
44
+
45
+ def save_html(self, filepath: str):
46
+ """Save HTML report to file."""
47
+ html_report = self.to_html()
48
+
49
+ with open(filepath, 'w') as f:
50
+ f.write(html_report)
51
+
52
+ def to_markdown(self) -> str:
53
+ """Generate Markdown report from profile result."""
54
+ if self.profile is None:
55
+ raise ValueError("No profile set.")
56
+
57
+ summary = self.generate_executive_summary()
58
+
59
+ md = self._generate_markdown_template(summary)
60
+
61
+ return md
62
+
63
+ def save_markdown(self, filepath: str):
64
+ """Save Markdown report to file."""
65
+ md_report = self.to_markdown()
66
+
67
+ with open(filepath, 'w') as f:
68
+ f.write(md_report)
69
+
70
+ def save_all_formats(self, directory: str, base_filename: str):
71
+ """Save reports in all formats to a directory."""
72
+ dir_path = Path(directory)
73
+ dir_path.mkdir(parents=True, exist_ok=True)
74
+
75
+ self.save_json(str(dir_path / f"{base_filename}.json"))
76
+ self.save_html(str(dir_path / f"{base_filename}.html"))
77
+ self.save_markdown(str(dir_path / f"{base_filename}.md"))
78
+
79
+ def generate_executive_summary(self) -> Dict[str, Any]:
80
+ """Generate executive summary of profiling results."""
81
+ if self.profile is None:
82
+ raise ValueError("No profile set.")
83
+
84
+ # Basic dataset info
85
+ summary = {
86
+ "dataset_name": self.profile.dataset_name,
87
+ "total_rows": self.profile.total_rows,
88
+ "total_columns": self.profile.total_columns,
89
+ "profiling_timestamp": self.profile.profiling_timestamp,
90
+ "profiling_duration_seconds": self.profile.profiling_duration_seconds,
91
+ }
92
+
93
+ # Column type breakdown
94
+ type_counts = {}
95
+ for col_profile in self.profile.column_profiles.values():
96
+ col_type = col_profile.configured_type.value
97
+ type_counts[col_type] = type_counts.get(col_type, 0) + 1
98
+
99
+ summary["column_types"] = type_counts
100
+
101
+ # Missing data summary
102
+ total_missing = 0
103
+ columns_with_missing = 0
104
+
105
+ for col_profile in self.profile.column_profiles.values():
106
+ if col_profile.universal_metrics.null_count > 0:
107
+ columns_with_missing += 1
108
+ total_missing += col_profile.universal_metrics.null_count
109
+
110
+ total_cells = self.profile.total_rows * self.profile.total_columns
111
+ missing_percentage = (total_missing / total_cells * 100) if total_cells > 0 else 0
112
+
113
+ summary["total_missing_cells"] = total_missing
114
+ summary["columns_with_missing"] = columns_with_missing
115
+ summary["missing_percentage"] = round(missing_percentage, 2)
116
+
117
+ # Quality score calculation (0-100)
118
+ quality_score = self._calculate_quality_score()
119
+ summary["quality_score"] = quality_score
120
+
121
+ # Memory usage estimate
122
+ total_memory = sum(
123
+ col_profile.universal_metrics.memory_size_bytes
124
+ for col_profile in self.profile.column_profiles.values()
125
+ if hasattr(col_profile.universal_metrics, 'memory_size_bytes') and
126
+ col_profile.universal_metrics.memory_size_bytes is not None
127
+ )
128
+ summary["estimated_memory_mb"] = round(total_memory / (1024 * 1024), 2) if total_memory > 0 else 0.0
129
+
130
+ return summary
131
+
132
+ def _calculate_quality_score(self) -> int:
133
+ """Calculate overall data quality score (0-100)."""
134
+ if not self.profile or not self.profile.column_profiles:
135
+ return 0
136
+
137
+ penalties = 0
138
+ max_penalties = 100
139
+
140
+ for col_profile in self.profile.column_profiles.values():
141
+ metrics = col_profile.universal_metrics
142
+
143
+ # Penalize missing values
144
+ if metrics.null_percentage > 50:
145
+ penalties += 20
146
+ elif metrics.null_percentage > 20:
147
+ penalties += 10
148
+ elif metrics.null_percentage > 5:
149
+ penalties += 5
150
+
151
+ # Penalize constant columns
152
+ if metrics.distinct_count == 1:
153
+ penalties += 15
154
+
155
+ # Penalize very high cardinality (possible identifiers)
156
+ if metrics.distinct_percentage > 95 and col_profile.configured_type not in [
157
+ ColumnType.IDENTIFIER, ColumnType.TEXT
158
+ ]:
159
+ penalties += 5
160
+
161
+ # Cap penalties at max
162
+ penalties = min(penalties, max_penalties)
163
+
164
+ return max(0, 100 - penalties)
165
+
166
+ def calculate_correlations(self, df: pd.DataFrame) -> Optional[Dict[str, Any]]:
167
+ """Calculate correlation matrix for numeric columns."""
168
+ if self.profile is None:
169
+ return None
170
+
171
+ # Get numeric columns from profile
172
+ numeric_columns = [
173
+ col_name for col_name, col_profile in self.profile.column_profiles.items()
174
+ if col_profile.configured_type in [
175
+ ColumnType.NUMERIC_CONTINUOUS,
176
+ ColumnType.NUMERIC_DISCRETE
177
+ ]
178
+ ]
179
+
180
+ if len(numeric_columns) < 2:
181
+ return None
182
+
183
+ # Filter dataframe to numeric columns that exist
184
+ numeric_cols_in_df = [col for col in numeric_columns if col in df.columns]
185
+
186
+ if len(numeric_cols_in_df) < 2:
187
+ return None
188
+
189
+ # Calculate correlations
190
+ corr_matrix = df[numeric_cols_in_df].corr()
191
+
192
+ # Convert to dictionary
193
+ correlations = {
194
+ "matrix": corr_matrix.to_dict(),
195
+ "high_correlations": []
196
+ }
197
+
198
+ # Find high correlations (>0.8 or <-0.8)
199
+ for i, col1 in enumerate(numeric_cols_in_df):
200
+ for col2 in numeric_cols_in_df[i + 1:]:
201
+ corr_value = corr_matrix.loc[col1, col2]
202
+ if abs(corr_value) > 0.8:
203
+ correlations["high_correlations"].append({
204
+ "column1": col1,
205
+ "column2": col2,
206
+ "correlation": round(corr_value, 3)
207
+ })
208
+
209
+ return correlations
210
+
211
+ def _generate_html_template(self, summary: Dict[str, Any]) -> str:
212
+ """Generate HTML report template."""
213
+ html = f"""<!DOCTYPE html>
214
+ <html lang="en">
215
+ <head>
216
+ <meta charset="UTF-8">
217
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
218
+ <title>Profiling Report - {self.profile.dataset_name}</title>
219
+ <style>
220
+ body {{
221
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Arial, sans-serif;
222
+ line-height: 1.6;
223
+ max-width: 1200px;
224
+ margin: 0 auto;
225
+ padding: 20px;
226
+ background-color: #f5f5f5;
227
+ }}
228
+ .header {{
229
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
230
+ color: white;
231
+ padding: 30px;
232
+ border-radius: 10px;
233
+ margin-bottom: 30px;
234
+ }}
235
+ .summary {{
236
+ background: white;
237
+ padding: 25px;
238
+ border-radius: 10px;
239
+ box-shadow: 0 2px 4px rgba(0,0,0,0.1);
240
+ margin-bottom: 30px;
241
+ }}
242
+ .summary h2 {{
243
+ margin-top: 0;
244
+ color: #333;
245
+ }}
246
+ .metric {{
247
+ display: inline-block;
248
+ margin: 10px 20px 10px 0;
249
+ }}
250
+ .metric-label {{
251
+ font-size: 0.9em;
252
+ color: #666;
253
+ }}
254
+ .metric-value {{
255
+ font-size: 1.5em;
256
+ font-weight: bold;
257
+ color: #667eea;
258
+ }}
259
+ .quality-score {{
260
+ font-size: 3em;
261
+ font-weight: bold;
262
+ color: {self._get_quality_color(summary['quality_score'])};
263
+ text-align: center;
264
+ }}
265
+ .column-section {{
266
+ background: white;
267
+ padding: 20px;
268
+ border-radius: 10px;
269
+ box-shadow: 0 2px 4px rgba(0,0,0,0.1);
270
+ margin-bottom: 20px;
271
+ }}
272
+ .column-header {{
273
+ font-size: 1.3em;
274
+ color: #333;
275
+ border-bottom: 2px solid #667eea;
276
+ padding-bottom: 10px;
277
+ margin-bottom: 15px;
278
+ }}
279
+ .column-type {{
280
+ display: inline-block;
281
+ background: #667eea;
282
+ color: white;
283
+ padding: 3px 10px;
284
+ border-radius: 5px;
285
+ font-size: 0.8em;
286
+ margin-left: 10px;
287
+ }}
288
+ table {{
289
+ width: 100%;
290
+ border-collapse: collapse;
291
+ margin: 15px 0;
292
+ }}
293
+ th, td {{
294
+ text-align: left;
295
+ padding: 10px;
296
+ border-bottom: 1px solid #ddd;
297
+ }}
298
+ th {{
299
+ background-color: #f8f9fa;
300
+ font-weight: 600;
301
+ }}
302
+ .progress-bar {{
303
+ width: 100%;
304
+ height: 20px;
305
+ background: #e9ecef;
306
+ border-radius: 10px;
307
+ overflow: hidden;
308
+ }}
309
+ .progress-fill {{
310
+ height: 100%;
311
+ background: #667eea;
312
+ transition: width 0.3s ease;
313
+ }}
314
+ </style>
315
+ </head>
316
+ <body>
317
+ <div class="header">
318
+ <h1>Data Profiling Report</h1>
319
+ <p>{self.profile.dataset_name}</p>
320
+ <p style="font-size: 0.9em; opacity: 0.9;">Generated on {summary['profiling_timestamp']}</p>
321
+ </div>
322
+
323
+ <div class="summary">
324
+ <h2>Executive Summary</h2>
325
+ <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 20px;">
326
+ <div class="metric">
327
+ <div class="metric-label">Total Rows</div>
328
+ <div class="metric-value">{summary['total_rows']:,}</div>
329
+ </div>
330
+ <div class="metric">
331
+ <div class="metric-label">Total Columns</div>
332
+ <div class="metric-value">{summary['total_columns']}</div>
333
+ </div>
334
+ <div class="metric">
335
+ <div class="metric-label">Missing Data</div>
336
+ <div class="metric-value">{summary['missing_percentage']}%</div>
337
+ </div>
338
+ <div class="metric">
339
+ <div class="metric-label">Quality Score</div>
340
+ <div class="quality-score">{summary['quality_score']}</div>
341
+ </div>
342
+ </div>
343
+
344
+ <h3>Column Types</h3>
345
+ <table>
346
+ <tr>
347
+ <th>Type</th>
348
+ <th>Count</th>
349
+ </tr>
350
+ """
351
+
352
+ for col_type, count in summary['column_types'].items():
353
+ html += f""" <tr>
354
+ <td>{col_type}</td>
355
+ <td>{count}</td>
356
+ </tr>
357
+ """
358
+
359
+ html += """ </table>
360
+ </div>
361
+
362
+ <h2>Column Details</h2>
363
+ """
364
+
365
+ # Add column sections
366
+ for col_name, col_profile in self.profile.column_profiles.items():
367
+ html += self._generate_column_section_html(col_name, col_profile)
368
+
369
+ html += """
370
+ </body>
371
+ </html>"""
372
+
373
+ return html
374
+
375
+ def _generate_column_section_html(self, col_name: str, col_profile) -> str:
376
+ """Generate HTML section for a single column."""
377
+ metrics = col_profile.universal_metrics
378
+
379
+ html = f"""
380
+ <div class="column-section">
381
+ <div class="column-header">
382
+ {col_name}
383
+ <span class="column-type">{col_profile.configured_type.value}</span>
384
+ </div>
385
+
386
+ <table>
387
+ <tr>
388
+ <th>Metric</th>
389
+ <th>Value</th>
390
+ </tr>
391
+ <tr>
392
+ <td>Total Count</td>
393
+ <td>{metrics.total_count:,}</td>
394
+ </tr>
395
+ <tr>
396
+ <td>Missing Values</td>
397
+ <td>{metrics.null_count:,} ({metrics.null_percentage:.1f}%)</td>
398
+ </tr>
399
+ <tr>
400
+ <td>Unique Values</td>
401
+ <td>{metrics.distinct_count:,} ({metrics.distinct_percentage:.1f}%)</td>
402
+ </tr>
403
+ """
404
+
405
+ # Add type-specific metrics
406
+ if col_profile.numeric_metrics:
407
+ nm = col_profile.numeric_metrics
408
+ html += f""" <tr>
409
+ <td>Mean</td>
410
+ <td>{nm.mean:.2f}</td>
411
+ </tr>
412
+ <tr>
413
+ <td>Std Dev</td>
414
+ <td>{nm.std:.2f}</td>
415
+ </tr>
416
+ <tr>
417
+ <td>Min / Max</td>
418
+ <td>{nm.min_value:.2f} / {nm.max_value:.2f}</td>
419
+ </tr>
420
+ """
421
+
422
+ elif col_profile.categorical_metrics:
423
+ cm = col_profile.categorical_metrics
424
+ top_cats = ', '.join(f"{cat}({count})" for cat, count in cm.top_categories[:5])
425
+ html += f""" <tr>
426
+ <td>Cardinality</td>
427
+ <td>{cm.cardinality}</td>
428
+ </tr>
429
+ <tr>
430
+ <td>Top Categories</td>
431
+ <td>{top_cats}</td>
432
+ </tr>
433
+ """
434
+
435
+ html += """ </table>
436
+ </div>
437
+ """
438
+
439
+ return html
440
+
441
+ def _generate_markdown_template(self, summary: Dict[str, Any]) -> str:
442
+ """Generate Markdown report template."""
443
+ md = f"""# Data Profiling Report
444
+
445
+ ## Dataset: {self.profile.dataset_name}
446
+
447
+ **Generated:** {summary['profiling_timestamp']}
448
+ **Duration:** {summary['profiling_duration_seconds']:.2f} seconds
449
+
450
+ ---
451
+
452
+ ## Executive Summary
453
+
454
+ | Metric | Value |
455
+ |--------|-------|
456
+ | Total Rows | {summary['total_rows']:,} |
457
+ | Total Columns | {summary['total_columns']} |
458
+ | Missing Data | {summary['missing_percentage']}% |
459
+ | Quality Score | **{summary['quality_score']}/100** |
460
+
461
+ ### Column Types
462
+
463
+ | Type | Count |
464
+ |------|-------|
465
+ """
466
+
467
+ for col_type, count in summary['column_types'].items():
468
+ md += f"| {col_type} | {count} |\n"
469
+
470
+ md += "\n---\n\n## Column Details\n\n"
471
+
472
+ # Add column sections
473
+ for col_name, col_profile in self.profile.column_profiles.items():
474
+ md += self._generate_column_section_markdown(col_name, col_profile)
475
+
476
+ return md
477
+
478
+ def _generate_column_section_markdown(self, col_name: str, col_profile) -> str:
479
+ """Generate Markdown section for a single column."""
480
+ metrics = col_profile.universal_metrics
481
+
482
+ md = f"""### {col_name} `({col_profile.configured_type.value})`
483
+
484
+ | Metric | Value |
485
+ |--------|-------|
486
+ | Total Count | {metrics.total_count:,} |
487
+ | Missing Values | {metrics.null_count:,} ({metrics.null_percentage:.1f}%) |
488
+ | Unique Values | {metrics.distinct_count:,} ({metrics.distinct_percentage:.1f}%) |
489
+ """
490
+
491
+ # Add type-specific metrics
492
+ if col_profile.numeric_metrics:
493
+ nm = col_profile.numeric_metrics
494
+ md += f"""| Mean | {nm.mean:.2f} |
495
+ | Std Dev | {nm.std:.2f} |
496
+ | Min / Max | {nm.min_value:.2f} / {nm.max_value:.2f} |
497
+ | Median | {nm.median:.2f} |
498
+ """
499
+
500
+ elif col_profile.categorical_metrics:
501
+ cm = col_profile.categorical_metrics
502
+ top_cats = ', '.join(f"{cat}({count})" for cat, count in cm.top_categories[:5])
503
+ md += f"""| Cardinality | {cm.cardinality} |
504
+ | Top Categories | {top_cats} |
505
+ """
506
+
507
+ md += "\n"
508
+
509
+ return md
510
+
511
+ def _get_quality_color(self, score: int) -> str:
512
+ """Get color based on quality score."""
513
+ if score >= 90:
514
+ return "#28a745" # Green
515
+ elif score >= 70:
516
+ return "#ffc107" # Yellow
517
+ elif score >= 50:
518
+ return "#fd7e14" # Orange
519
+ else:
520
+ return "#dc3545" # Red
@@ -0,0 +1,151 @@
1
+ from dataclasses import asdict, dataclass
2
+ from typing import Any, Dict, Optional
3
+
4
+ from customer_retention.core.compat import pd
5
+ from customer_retention.core.config import DataSourceConfig
6
+
7
+
8
@dataclass
class SCDResult:
    """Result of SCD (Slowly Changing Dimension) analysis for a column.

    The field names mirror the metric keys produced by
    ``SCDAnalyzer._analyze_column``, plus ``column_name``; that method's
    ``total_entities`` key has no counterpart here.
    """
    # Name of the analysed column.
    column_name: str
    # Whether any change was detected for the column.
    changes_detected: bool
    # Number of entities for which a change was detected.
    entities_with_change: int
    # NOTE(review): presumably on a 0-100 scale like the analyzer's
    # "change_percentage" metric -- confirm against the producer of this object.
    change_percentage: float
    # Largest number of changes recorded for a single entity.
    max_changes: int
    # Average number of changes per entity.
    avg_changes_per_entity: float
    # Human-readable SCD type recommendation.
    scd_type_recommendation: str

    def to_dict(self) -> dict:
        """Convert to a plain dictionary keyed by field name (via ``asdict``)."""
        return asdict(self)
22
+
23
+
24
class SCDAnalyzer:
    """Analyzes Slowly Changing Dimension (SCD) patterns in data.

    For each analysed column the analyzer counts how many distinct values
    every entity (identified by ``entity_key``) takes, and derives change
    statistics plus an SCD type recommendation from those counts.
    """

    def __init__(self, entity_key: Optional[str] = None):
        """
        Initialize SCD Analyzer.

        Args:
            entity_key: Column name that identifies unique entities (e.g., customer_id)
        """
        self.entity_key = entity_key

    def analyze(self, df: pd.DataFrame, columns: Optional[list] = None) -> Dict[str, Dict[str, Any]]:
        """
        Analyze SCD patterns in dataframe.

        Args:
            df: DataFrame with multi-row per entity data
            columns: List of columns to analyze (if None, analyze all except entity_key)

        Returns:
            Dictionary mapping column names to SCD metrics

        Raises:
            ValueError: If ``entity_key`` is unset or is not a column of ``df``.
        """
        if self.entity_key is None:
            raise ValueError("entity_key must be set to analyze SCD patterns")

        if self.entity_key not in df.columns:
            raise ValueError(f"entity_key '{self.entity_key}' not found in dataframe")

        # Default to every column other than the entity key itself.
        if columns is None:
            columns = [col for col in df.columns if col != self.entity_key]

        return {column: self._analyze_column(df, column) for column in columns}

    def _analyze_column(self, df: pd.DataFrame, column: str) -> Dict[str, Any]:
        """Analyze SCD pattern for a single column.

        Returns a dict with the keys ``changes_detected``,
        ``entities_with_change``, ``total_entities``, ``change_percentage``,
        ``max_changes``, ``avg_changes_per_entity`` and
        ``scd_type_recommendation``.
        """
        # Distinct (non-null) values seen per entity.
        values_per_entity = df.groupby(self.entity_key)[column].nunique()
        total_entities = len(values_per_entity)

        # More than one distinct value for an entity means the value changed.
        changed = values_per_entity[values_per_entity > 1]
        entities_with_change = len(changed)

        change_percentage = (
            float(entities_with_change / total_entities * 100) if total_entities > 0 else 0.0
        )

        # Number of changes = distinct values - 1. An entity whose values are
        # all null has zero distinct values, so clamp at 0 rather than
        # reporting a negative number of changes.
        max_changes = max(int(values_per_entity.max()) - 1, 0) if total_entities > 0 else 0

        # Average changes per entity (only over entities that changed).
        avg_changes = float(changed.mean() - 1) if entities_with_change > 0 else 0.0

        metrics = {
            # Plain Python types throughout, so the result serialises cleanly.
            "changes_detected": entities_with_change > 0,
            "entities_with_change": int(entities_with_change),
            "total_entities": int(total_entities),
            "change_percentage": round(change_percentage, 2),
            "max_changes": max_changes,
            "avg_changes_per_entity": round(avg_changes, 2),
        }

        # Add SCD type recommendation
        metrics["scd_type_recommendation"] = self.recommend_scd_type(metrics)

        return metrics

    def recommend_scd_type(self, metrics: Dict[str, Any]) -> str:
        """
        Recommend SCD type based on change patterns.

        Args:
            metrics: Mapping with ``changes_detected`` and ``change_percentage``;
                ``avg_changes_per_entity`` is optional and defaults to 0.

        Returns:
            String describing recommended SCD type
        """
        if not metrics["changes_detected"]:
            return "Type 0 (Static - Never changes)"

        change_pct = metrics["change_percentage"]
        avg_changes = metrics.get("avg_changes_per_entity", 0)

        # Type 1: Rare changes, only current value matters
        if change_pct < 10 and avg_changes < 2:
            return "Type 1 (Overwrite - Rare changes, history not important)"

        # Type 2: Frequent changes, history matters
        if change_pct >= 30 or avg_changes >= 3:
            return "Type 2 (Track History - Frequent changes, full history needed)"

        # Type 3: Moderate changes, only previous value matters
        if change_pct < 30 and avg_changes < 3:
            return "Type 3 (Keep Previous - Only previous value matters)"

        # For ordinary numbers one of the branches above always matches; this
        # fallback is only reached when a value does not compare (e.g. NaN).
        return "Type 2 (Track History - Moderate to frequent changes)"

    def analyze_with_config(self, df: pd.DataFrame, config: "DataSourceConfig") -> Dict[str, Dict[str, Any]]:
        """
        Analyze SCD patterns using configuration.

        Note that this overwrites ``self.entity_key`` with the config's
        primary key before analysing.

        Args:
            df: DataFrame to analyze
            config: DataSourceConfig with entity key information

        Returns:
            Dictionary of SCD metrics per column
        """
        # Use primary key as entity key
        self.entity_key = config.primary_key

        # Analyze all configured columns except the primary key
        columns = [col.name for col in config.columns if col.name != config.primary_key]

        return self.analyze(df, columns)

    def to_dataframe(self, results: Dict[str, Dict[str, Any]]) -> pd.DataFrame:
        """Convert SCD analysis results to a summary DataFrame.

        Each analysed column becomes one row: a ``column`` field holding its
        name followed by that column's metrics.
        """
        rows = [{"column": column_name, **metrics} for column_name, metrics in results.items()]
        return pd.DataFrame(rows)