churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,527 @@
1
+ from abc import ABC, abstractmethod
2
+ from datetime import datetime
3
+ from typing import Optional
4
+
5
+ import numpy as np
6
+
7
+ from customer_retention.core.compat import Timestamp, is_bool_dtype, is_datetime64_any_dtype, pd
8
+ from customer_retention.core.config.column_config import ColumnType
9
+
10
+ from .profile_result import (
11
+ BinaryMetrics,
12
+ CategoricalMetrics,
13
+ DatetimeMetrics,
14
+ IdentifierMetrics,
15
+ NumericMetrics,
16
+ TargetMetrics,
17
+ UniversalMetrics,
18
+ )
19
+
20
+
21
class ColumnProfiler(ABC):
    """Abstract base for column profilers.

    Supplies the type-agnostic summary in ``compute_universal_metrics`` and
    requires subclasses to implement the type-specific ``profile`` step.
    """

    def compute_universal_metrics(self, series: pd.Series) -> UniversalMetrics:
        """Summarise row count, nulls, distinct values, mode and memory use.

        Args:
            series: The column to summarise.

        Returns:
            A ``UniversalMetrics`` whose percentages are rounded to two
            decimals. Percentages are 0 for a zero-length series, and the
            most-common value/frequency are ``None`` when ``value_counts()``
            yields nothing.
        """
        n_rows = len(series)
        n_null = int(series.isna().sum())
        n_distinct = int(series.nunique())

        # Avoid dividing by zero on a zero-length column.
        if n_rows > 0:
            null_pct = n_null / n_rows * 100
            distinct_pct = n_distinct / n_rows * 100
        else:
            null_pct = 0
            distinct_pct = 0

        # value_counts() is ordered by frequency, so the first entry is the mode.
        frequencies = series.value_counts()
        if len(frequencies) > 0:
            top_value = frequencies.index[0]
            top_frequency = int(frequencies.iloc[0])
        else:
            top_value = None
            top_frequency = None

        return UniversalMetrics(
            total_count=n_rows,
            null_count=n_null,
            null_percentage=round(null_pct, 2),
            distinct_count=n_distinct,
            distinct_percentage=round(distinct_pct, 2),
            most_common_value=top_value,
            most_common_frequency=top_frequency,
            memory_size_bytes=int(series.memory_usage(deep=True)),
        )

    @abstractmethod
    def profile(self, series: pd.Series) -> dict:
        """Return the type-specific metrics for ``series`` as a dict."""
        pass
50
+
51
+
52
class IdentifierProfiler(ColumnProfiler):
    """Profiler for identifier-like columns: uniqueness, duplicates, format."""

    # Ordered (regex, label) pairs. The first pattern that matches more than
    # _FORMAT_MATCH_THRESHOLD_PCT percent of the values wins, so the specific
    # shapes are listed before the broad catch-alls.
    _FORMAT_PATTERNS = (
        (r'^[A-Z]{3}-\d{5}$', 'AAA-99999'),
        (r'^\d{3}-\d{3}-\d{4}$', '999-999-9999'),
        (r'^[A-Z]{2}\d{6}$', 'AA999999'),
        (r'^\d+$', 'numeric_only'),
        (r'^[A-Za-z]+$', 'alpha_only'),
        (r'^[A-Z][0-9]{4,}$', 'A9999+'),
        (r'^\w+-\d+$', 'text-digits'),
        (r'^[A-Z0-9]+$', 'alphanumeric'),
    )
    _FORMAT_MATCH_THRESHOLD_PCT = 80

    def profile(self, series: pd.Series) -> dict:
        """Compute identifier metrics for ``series``.

        Null values are excluded from both the uniqueness check and the
        duplicate report, so the two always agree with each other.

        Args:
            series: The identifier column to profile.

        Returns:
            ``{"identifier_metrics": IdentifierMetrics(...)}``. At most ten
            duplicated values are listed, and the length fields are ``None``
            when the column has no non-null values.
        """
        non_null = series.dropna()
        is_unique = series.nunique() == len(non_null)

        # keep=False flags every occurrence of a repeated value; unique()
        # then collapses them to one entry per repeated value.
        repeated_values = non_null[non_null.duplicated(keep=False)].unique()
        duplicate_count = len(repeated_values)
        duplicate_values = repeated_values.tolist()[:10]

        str_series = non_null.astype(str)
        lengths = str_series.str.len()
        has_lengths = len(lengths) > 0
        length_modes = lengths.mode()

        format_pattern, format_consistency = self.detect_format_pattern(str_series)

        return {
            "identifier_metrics": IdentifierMetrics(
                is_unique=is_unique,
                duplicate_count=duplicate_count,
                duplicate_values=duplicate_values,
                format_pattern=format_pattern,
                format_consistency=format_consistency,
                length_min=int(lengths.min()) if has_lengths else None,
                length_max=int(lengths.max()) if has_lengths else None,
                length_mode=int(length_modes.iloc[0]) if len(length_modes) > 0 else None,
            )
        }

    def detect_format_pattern(self, str_series: pd.Series) -> tuple[Optional[str], Optional[float]]:
        """Find the first known format that most values follow.

        Args:
            str_series: Non-null identifier values already converted to ``str``.

        Returns:
            ``(None, None)`` for an empty series; ``(label, match_pct)`` for
            the first pattern matching more than 80% of the values, with the
            percentage rounded to two decimals; otherwise ``('mixed', 0.0)``.
        """
        n_values = len(str_series)
        if n_values == 0:
            return None, None

        for pattern, label in self._FORMAT_PATTERNS:
            matches = str_series.str.match(pattern, na=False)
            match_pct = (matches.sum() / n_values) * 100
            if match_pct > self._FORMAT_MATCH_THRESHOLD_PCT:
                return label, round(match_pct, 2)

        return 'mixed', 0.0
100
+
101
+
102
class TargetProfiler(ColumnProfiler):
    """Profiler for the target column: class distribution and imbalance."""

    def profile(self, series: pd.Series) -> dict:
        """Compute class-balance metrics for a target column.

        Args:
            series: The target column to profile.

        Returns:
            ``{"target_metrics": TargetMetrics(...)}``, or
            ``{"target_metrics": None}`` when the column has no non-null
            values (there is no class distribution to describe).
        """
        value_counts = series.value_counts()
        total = len(series.dropna())

        # With no observed labels idxmin() raises on an empty value_counts,
        # and the percentage maths would divide by zero, so bail out early.
        if total == 0:
            return {"target_metrics": None}

        class_distribution = {}
        class_percentages = {}
        for label, count in value_counts.items():
            key = str(label)
            class_distribution[key] = int(count)
            class_percentages[key] = round((count / total * 100), 2)

        minority_class = value_counts.idxmin()
        minority_count = value_counts.min()
        majority_count = value_counts.max()
        minority_percentage = round((minority_count / total * 100), 2)
        # A zero count is still possible here (e.g. value_counts() may report
        # unobserved categories), so keep the division guarded.
        if minority_count > 0:
            imbalance_ratio = round((majority_count / minority_count), 2)
        else:
            imbalance_ratio = float('inf')

        return {
            "target_metrics": TargetMetrics(
                class_distribution=class_distribution,
                class_percentages=class_percentages,
                imbalance_ratio=imbalance_ratio,
                minority_class=minority_class,
                minority_percentage=minority_percentage,
                n_classes=len(value_counts),
            )
        }
126
+
127
+
128
class NumericProfiler(ColumnProfiler):
    """Profiler for numeric columns: moments, quartiles, outliers, histogram."""

    def profile(self, series: pd.Series) -> dict:
        """Compute descriptive statistics for a numeric column.

        Args:
            series: The numeric column to profile.

        Returns:
            ``{"numeric_metrics": NumericMetrics(...)}`` computed over the
            non-null values, or ``{"numeric_metrics": None}`` when there are
            none. Statistics are rounded to four decimals and percentages
            to two.
        """
        values = series.dropna()
        n_values = len(values)
        if n_values == 0:
            return {"numeric_metrics": None}

        def share_pct(count: int) -> float:
            # Percentage of the non-null values, rounded to two decimals.
            return round((count / n_values * 100), 2)

        mean_val = float(values.mean())
        std_val = float(values.std())
        lowest = float(values.min())
        highest = float(values.max())
        median_val = float(values.median())

        q1 = float(values.quantile(0.25))
        q3 = float(values.quantile(0.75))
        iqr = q3 - q1

        # Shape statistics are best-effort: if either fails, report neither.
        try:
            skewness_val = float(values.skew())
            kurtosis_val = float(values.kurtosis())
        except Exception:
            skewness_val = None
            kurtosis_val = None

        zero_count = int((values == 0).sum())
        negative_count = int((values < 0).sum())
        inf_count = int(np.isinf(values).sum())

        # IQR rule: anything beyond 1.5 * IQR outside the quartiles.
        lower_fence = q1 - 1.5 * iqr
        upper_fence = q3 + 1.5 * iqr
        outlier_count_iqr = int(((values < lower_fence) | (values > upper_fence)).sum())

        # Z-score rule: more than 3 standard deviations from the mean.
        # Skipped when the deviation is not strictly positive.
        outlier_count_zscore = 0
        if std_val > 0:
            z_scores = np.abs((values - mean_val) / std_val)
            outlier_count_zscore = int((z_scores > 3).sum())

        # Infinite values are excluded before binning the histogram.
        finite_values = values[np.isfinite(values)]
        histogram_bins = []
        if len(finite_values) > 0:
            counts, edges = np.histogram(finite_values, bins=10)
            histogram_bins = [
                (round(float(left), 4), round(float(right), 4), int(count))
                for left, right, count in zip(edges[:-1], edges[1:], counts)
            ]

        return {
            "numeric_metrics": NumericMetrics(
                mean=round(mean_val, 4),
                std=round(std_val, 4),
                min_value=round(lowest, 4),
                max_value=round(highest, 4),
                range_value=round(highest - lowest, 4),
                median=round(median_val, 4),
                q1=round(q1, 4),
                q3=round(q3, 4),
                iqr=round(iqr, 4),
                skewness=None if skewness_val is None else round(skewness_val, 4),
                kurtosis=None if kurtosis_val is None else round(kurtosis_val, 4),
                zero_count=zero_count,
                zero_percentage=share_pct(zero_count),
                negative_count=negative_count,
                negative_percentage=share_pct(negative_count),
                inf_count=inf_count,
                inf_percentage=share_pct(inf_count),
                outlier_count_iqr=outlier_count_iqr,
                outlier_count_zscore=outlier_count_zscore,
                outlier_percentage=share_pct(outlier_count_iqr),
                histogram_bins=histogram_bins,
            )
        }
208
+
209
+
210
class CategoricalProfiler(ColumnProfiler):
    """Profiler for categorical columns: cardinality, rare levels, hygiene."""

    def profile(self, series: pd.Series) -> dict:
        """Compute categorical metrics for ``series``.

        Args:
            series: The categorical column to profile.

        Returns:
            ``{"categorical_metrics": CategoricalMetrics(...)}`` computed over
            the non-null values, or ``{"categorical_metrics": None}`` when
            there are none.
        """
        observed = series.dropna()
        n_observed = len(observed)
        if n_observed == 0:
            return {"categorical_metrics": None}

        cardinality = int(series.nunique())
        cardinality_ratio = round((cardinality / n_observed), 4)

        value_counts = observed.value_counts()
        value_counts_dict = {str(level): int(count) for level, count in value_counts.items()}
        top_categories = [(str(level), int(count)) for level, count in value_counts.head(10).items()]

        # A level is "rare" when it covers fewer than 1% of the non-null rows.
        rare_threshold = n_observed * 0.01
        rare_items = [(level, count) for level, count in value_counts.items() if count < rare_threshold]
        rare_categories = [str(level) for level, _ in rare_items]
        rare_rows = sum(count for _, count in rare_items)
        rare_category_percentage = round((rare_rows / n_observed * 100), 2)

        # Only the first 100 distinct values are scanned for placeholder labels.
        unknown_values = {"unknown", "other", "n/a", "na", "none", "null", "missing"}
        contains_unknown = False
        for raw_value in observed.unique()[:100]:
            if str(raw_value).lower() in unknown_values:
                contains_unknown = True
                break

        return {
            "categorical_metrics": CategoricalMetrics(
                cardinality=cardinality,
                cardinality_ratio=cardinality_ratio,
                value_counts=value_counts_dict,
                top_categories=top_categories,
                rare_categories=rare_categories[:20],
                rare_category_count=len(rare_categories),
                rare_category_percentage=rare_category_percentage,
                contains_unknown=contains_unknown,
                case_variations=self.detect_case_variations(observed),
                whitespace_issues=self.detect_whitespace_issues(observed),
                encoding_recommendation=self.recommend_encoding(cardinality, rare_category_percentage),
            )
        }

    def detect_case_variations(self, clean_series: pd.Series) -> list[str]:
        """List up to ten ``"A vs a"`` pairs of values differing only by case.

        Only the first two spellings of each lower-cased value are reported.
        """
        spellings_by_lower = {}
        for spelling in clean_series.astype(str).unique():
            spellings_by_lower.setdefault(spelling.lower(), []).append(spelling)

        clashes = [
            f"{spellings[0]} vs {spellings[1]}"
            for spellings in spellings_by_lower.values()
            if len(spellings) > 1
        ]
        return clashes[:10]

    def detect_whitespace_issues(self, clean_series: pd.Series) -> list[str]:
        """Return up to ten values with leading or trailing whitespace.

        Only the first 100 distinct values are inspected.
        """
        candidates = clean_series.astype(str).unique()[:100]
        padded = [text for text in candidates if text != text.strip()]
        return padded[:10]

    def recommend_encoding(self, cardinality: int, rare_pct: float) -> str:
        """Suggest an encoding strategy from the number of distinct levels.

        ``rare_pct`` is accepted but does not influence the result.
        """
        thresholds = (
            (5, "one_hot"),
            (15, "one_hot_or_target"),
            (50, "target_or_embedding"),
        )
        for upper_bound, strategy in thresholds:
            if cardinality <= upper_bound:
                return strategy
        return "hashing_or_embedding"
291
+
292
+
293
class DatetimeProfiler(ColumnProfiler):
    """Profile datetime columns: range, format, future/placeholder dates, weekends."""

    def profile(self, series: pd.Series) -> dict:
        """Build the datetime metrics for *series*.

        Returns a dict with the single key ``"datetime_metrics"``. Its value is
        ``None`` when the column has no non-null values, when conversion to
        datetimes fails, or when no value can be parsed as a datetime.
        """
        clean_series = series.dropna()
        if len(clean_series) == 0:
            return {"datetime_metrics": None}

        format_detected, format_consistency = self.detect_datetime_format(series)

        if not is_datetime64_any_dtype(clean_series):
            sample = clean_series.head(10)
            # Object columns already holding datetime objects are used as-is;
            # anything else is parsed.
            if not all(isinstance(v, (Timestamp, datetime)) for v in sample):
                try:
                    clean_series = pd.to_datetime(clean_series, errors='coerce', format='mixed')
                except Exception:
                    # Best effort: a column that cannot be converted has no metrics.
                    return {"datetime_metrics": None}
                # errors='coerce' turns unparseable entries into NaT. Drop them so
                # they do not inflate the denominators below, and bail out when
                # nothing could be parsed at all.
                clean_series = clean_series.dropna()
                if len(clean_series) == 0:
                    return {"datetime_metrics": None}

        is_dt64 = is_datetime64_any_dtype(clean_series)
        tz = clean_series.dt.tz if is_dt64 else None

        min_date = clean_series.min()
        max_date = clean_series.max()
        date_range_days = (max_date - min_date).days

        # Ordering comparisons between tz-aware data and a tz-naive Timestamp
        # raise TypeError, so "now" is taken in the column's own timezone.
        now = Timestamp.now(tz=tz)
        future_date_count = int((clean_series > now).sum())

        placeholder_dates = [
            Timestamp('1970-01-01'),
            Timestamp('1900-01-01'),
            Timestamp('9999-12-31'),
        ]
        # Compare placeholders on wall-clock time; equality between tz-aware
        # data and naive timestamps would otherwise never match.
        wall_clock = clean_series.dt.tz_localize(None) if tz is not None else clean_series
        placeholder_count = int(sum((wall_clock == pd_date).sum() for pd_date in placeholder_dates))

        if is_dt64:
            weekend_count = int(clean_series.dt.dayofweek.isin([5, 6]).sum())
        else:
            weekend_count = int(sum(1 for v in clean_series if isinstance(v, Timestamp) and v.dayofweek in [5, 6]))
        weekend_percentage = round((weekend_count / len(clean_series) * 100), 2)

        return {
            "datetime_metrics": DatetimeMetrics(
                min_date=str(min_date),
                max_date=str(max_date),
                date_range_days=date_range_days,
                format_detected=format_detected,
                format_consistency=format_consistency,
                future_date_count=future_date_count,
                placeholder_count=placeholder_count,
                # NOTE(review): reported as a constant; no timezone check is performed here.
                timezone_consistent=True,
                weekend_percentage=weekend_percentage
            )
        }

    def detect_datetime_format(self, series: pd.Series) -> tuple[Optional[str], Optional[float]]:
        """Guess the strptime format used by *series*.

        Returns ``(format, match_percentage)``:
        ``('datetime64', 100.0)`` for datetime64 columns, ``(None, None)`` when
        there are no non-null values, the best candidate format when more than
        80% of the sampled values (at most 100) parse with it, and
        ``('mixed', 0.0)`` otherwise.
        """
        if is_datetime64_any_dtype(series):
            return 'datetime64', 100.0

        sample = series.dropna().astype(str).head(min(100, len(series)))
        if len(sample) == 0:
            return None, None

        formats = [
            '%Y-%m-%d',
            '%Y/%m/%d',
            '%d-%m-%Y',
            '%d/%m/%Y',
            '%Y-%m-%d %H:%M:%S',
            '%Y/%m/%d %H:%M:%S',
            '%d-%m-%Y %H:%M:%S',
            '%d/%m/%Y %H:%M:%S',
            '%Y-%m-%dT%H:%M:%S',
            '%m/%d/%Y',
            '%m-%d-%Y',
        ]

        best_format = None
        best_match_pct = 0.0

        for fmt in formats:
            matches = 0
            for val in sample:
                try:
                    datetime.strptime(val, fmt)
                except ValueError:
                    # strptime signals a non-matching string with ValueError.
                    continue
                matches += 1

            match_pct = (matches / len(sample)) * 100
            # Strict '>' keeps the earliest format in the list on ties.
            if match_pct > best_match_pct:
                best_match_pct = match_pct
                best_format = fmt

        if best_format and best_match_pct > 80:
            return best_format, round(best_match_pct, 2)

        return 'mixed', 0.0
388
+
389
+
390
class BinaryProfiler(ColumnProfiler):
    """Profile two-valued columns: true/false counts and class balance."""

    def profile(self, series: pd.Series) -> dict:
        """Build the binary metrics for *series*.

        Returns a dict with the single key ``"binary_metrics"``; its value is
        ``None`` when the column has no non-null entries.
        """
        non_null = series.dropna()
        if len(non_null) == 0:
            return {"binary_metrics": None}

        counts = non_null.value_counts()
        values_found = counts.index.tolist()

        true_values = {1, 1.0, True, "1", "yes", "Yes", "YES", "true", "True", "TRUE", "y", "Y"}
        false_values = {0, 0.0, False, "0", "no", "No", "NO", "false", "False", "FALSE", "n", "N"}

        def tally(labels) -> int:
            # Total rows whose value is one of the recognised labels.
            return int(sum(counts.get(v, 0) for v in values_found if v in labels))

        true_count = tally(true_values)
        false_count = tally(false_values)

        if true_count == 0 and false_count == 0:
            # No recognised labels: treat the most frequent value as "true"
            # and the runner-up as "false".
            true_count = int(counts.iloc[0]) if len(counts) > 0 else 0
            false_count = int(counts.iloc[1]) if len(counts) > 1 else 0

        total = true_count + false_count
        if total > 0:
            true_percentage = round((true_count / total * 100), 2)
        else:
            true_percentage = 0

        minority = min(true_count, false_count)
        majority = max(true_count, false_count)
        if minority > 0:
            balance_ratio = round((majority / minority), 2)
        else:
            # One side is absent, so the ratio is unbounded.
            balance_ratio = float('inf')

        metrics = BinaryMetrics(
            true_count=true_count,
            false_count=false_count,
            true_percentage=true_percentage,
            balance_ratio=balance_ratio,
            values_found=values_found,
            is_boolean=is_bool_dtype(series),
        )
        return {"binary_metrics": metrics}
427
+
428
+
429
class TextProfiler(ColumnProfiler):
    """Profile text columns with PII detection."""

    def profile(self, series: pd.Series) -> dict:
        """Profile a text column.

        Computes length statistics, empty-string share, mean word count, the
        share of values containing digits / special characters, and flags
        values that look like e-mail addresses, US phone numbers, SSNs or
        credit-card numbers.

        Returns a dict with the single key ``"text_metrics"`` holding a
        ``TextMetrics`` instance. All statistics are zero for a column without
        non-null values.
        """
        from .profile_result import TextMetrics

        clean_series = series.dropna()
        # Stringify once; every metric below works on the same text view.
        text = clean_series.astype(str)
        has_text = len(text) > 0

        # Text lengths
        lengths = text.str.len()
        length_min = int(lengths.min()) if has_text else 0
        length_max = int(lengths.max()) if has_text else 0
        length_mean = float(lengths.mean()) if has_text else 0.0
        length_median = float(lengths.median()) if has_text else 0.0

        # Empty text detection (percentage is relative to all rows, nulls included)
        empty_count = int((text == "").sum())
        empty_percentage = (empty_count / len(series) * 100) if len(series) > 0 else 0.0

        # Word count
        word_counts = text.str.split().str.len()
        word_count_mean = float(word_counts.mean()) if has_text else 0.0

        # Contains digits
        contains_digits = text.str.contains(r'\d', regex=True, na=False)
        contains_digits_pct = float(contains_digits.sum() / len(text) * 100) if has_text else 0.0

        # Contains special characters
        contains_special = text.str.contains(r'[!@#$%^&*(),.?":{}|<>]', regex=True, na=False)
        contains_special_pct = float(contains_special.sum() / len(text) * 100) if has_text else 0.0

        # PII detection: (label, pattern) pairs, reported in this order.
        pii_patterns = (
            # The TLD class is [A-Za-z]; a '|' inside a character class is a
            # literal pipe, not alternation.
            ("email", r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'),
            # US-style phone number
            ("phone", r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b'),
            ("ssn", r'\b\d{3}-\d{2}-\d{4}\b'),
            # Basic 16-digit credit-card shape
            ("credit_card", r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b'),
        )
        pii_types = [
            label
            for label, pattern in pii_patterns
            if text.str.contains(pattern, regex=True, na=False).any()
        ]
        pii_detected = bool(pii_types)

        return {
            "text_metrics": TextMetrics(
                length_min=length_min,
                length_max=length_max,
                length_mean=length_mean,
                length_median=length_median,
                empty_count=empty_count,
                empty_percentage=round(empty_percentage, 2),
                word_count_mean=round(word_count_mean, 2),
                contains_digits_pct=round(contains_digits_pct, 2),
                contains_special_pct=round(contains_special_pct, 2),
                pii_detected=pii_detected,
                pii_types=pii_types,
                language_detected=None  # TODO: Can add language detection later
            )
        }
506
+
507
+
508
class ProfilerFactory:
    """Look up and instantiate the profiler registered for a column type."""

    # Column type -> profiler class; several types share one profiler.
    _profilers = {
        ColumnType.IDENTIFIER: IdentifierProfiler,
        ColumnType.TARGET: TargetProfiler,
        ColumnType.FEATURE_TIMESTAMP: DatetimeProfiler,
        ColumnType.LABEL_TIMESTAMP: DatetimeProfiler,
        ColumnType.NUMERIC_CONTINUOUS: NumericProfiler,
        ColumnType.NUMERIC_DISCRETE: NumericProfiler,
        ColumnType.CATEGORICAL_NOMINAL: CategoricalProfiler,
        ColumnType.CATEGORICAL_ORDINAL: CategoricalProfiler,
        ColumnType.CATEGORICAL_CYCLICAL: CategoricalProfiler,
        ColumnType.DATETIME: DatetimeProfiler,
        ColumnType.BINARY: BinaryProfiler,
        ColumnType.TEXT: TextProfiler,
    }

    @classmethod
    def get_profiler(cls, column_type: ColumnType) -> Optional[ColumnProfiler]:
        """Return a new profiler for *column_type*, or ``None`` if none is registered."""
        registered = cls._profilers.get(column_type)
        if not registered:
            return None
        return registered()