churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,193 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any, Union
4
+
5
+ import numpy as np
6
+ import pandas as _pandas
7
+
8
+ from .detection import (
9
+ configure_spark_pandas,
10
+ enable_arrow_optimization,
11
+ get_dbutils,
12
+ get_display_function,
13
+ get_spark_session,
14
+ is_databricks,
15
+ is_notebook,
16
+ is_pandas_api_on_spark,
17
+ is_spark_available,
18
+ set_spark_config,
19
+ )
20
+ from .ops import DataOps, ops
21
+
22
# Choose the DataFrame backend once at import time: pandas-on-Spark when a
# Spark runtime is importable, plain pandas otherwise.
_SPARK_PANDAS_AVAILABLE = is_spark_available()

if not _SPARK_PANDAS_AVAILABLE:
    pd = _pandas
    DataFrame = _pandas.DataFrame
    Series = _pandas.Series
else:
    try:
        import pyspark.pandas as ps
    except ImportError:
        # Older Databricks runtimes ship koalas instead of pyspark.pandas.
        import databricks.koalas as ps
    pd = ps
    # Annotations accept either backend's frame/series types.
    DataFrame = Union[ps.DataFrame, _pandas.DataFrame]
    Series = Union[ps.Series, _pandas.Series]
39
+
40
+
41
def get_pandas() -> Any:
    """Return the genuine pandas module, regardless of the active backend."""
    return _pandas
43
+
44
+
45
def to_pandas(df: Any) -> _pandas.DataFrame:
    """Coerce *df* into a plain in-memory pandas DataFrame.

    pandas frames are returned unchanged; pandas-on-Spark, koalas and native
    Spark frames are collected to the driver; anything else is passed to the
    pandas DataFrame constructor (which may raise for unconvertible input).
    """
    if isinstance(df, _pandas.DataFrame):
        return df
    if _SPARK_PANDAS_AVAILABLE:
        try:
            import pyspark.pandas as ps
            if isinstance(df, ps.DataFrame):
                return df.to_pandas()
        except ImportError:
            # Bug fix: on koalas-only runtimes the original fell through to the
            # pandas constructor; convert koalas frames explicitly instead.
            try:
                import databricks.koalas as ks
                if isinstance(df, ks.DataFrame):
                    return df.to_pandas()
            except ImportError:
                pass
        try:
            from pyspark.sql import DataFrame as NativeSparkDF
            if isinstance(df, NativeSparkDF):
                return df.toPandas()
        except ImportError:
            pass
    return _pandas.DataFrame(df)
62
+
63
+
64
def to_spark_pandas(df: Any) -> Any:
    """Coerce *df* to a pandas-on-Spark frame when Spark is available.

    Without Spark, returns a plain pandas DataFrame instead.
    """
    if not _SPARK_PANDAS_AVAILABLE:
        if isinstance(df, _pandas.DataFrame):
            return df
        return _pandas.DataFrame(df)
    try:
        import pyspark.pandas as ps
        if isinstance(df, ps.DataFrame):
            return df
        # pandas frames go through the dedicated converter; anything else is
        # handed to the pandas-on-Spark constructor.
        converter = ps.from_pandas if isinstance(df, _pandas.DataFrame) else ps.DataFrame
        return converter(df)
    except ImportError:
        return df
76
+
77
+
78
def ensure_pandas_series(series: Any) -> _pandas.Series:
    """Return *series* as a plain pandas Series, collecting from Spark if needed."""
    if isinstance(series, _pandas.Series):
        return series
    if _SPARK_PANDAS_AVAILABLE:
        try:
            import pyspark.pandas as ps
        except ImportError:
            ps = None
        if ps is not None and isinstance(series, ps.Series):
            return series.to_pandas()
    return _pandas.Series(series)
89
+
90
+
91
def concat(objs: list, axis: int = 0, ignore_index: bool = False, **kwargs: Any) -> Any:
    """Backend-aware pd.concat that yields an empty frame for no inputs."""
    if objs:
        return pd.concat(objs, axis=axis, ignore_index=ignore_index, **kwargs)
    return pd.DataFrame()
95
+
96
+
97
def merge(left: Any, right: Any, how: str = "inner", on: Any = None, **kwargs: Any) -> Any:
    """Backend-aware wrapper around pd.merge."""
    merged = pd.merge(left, right, how=how, on=on, **kwargs)
    return merged
99
+
100
+
101
# Re-export the true pandas scalar/dtype symbols so downstream code can use
# them without caring which DataFrame backend is active.
Timestamp = _pandas.Timestamp
Timedelta = _pandas.Timedelta
DatetimeIndex = _pandas.DatetimeIndex
CategoricalDtype = _pandas.CategoricalDtype
NA = _pandas.NA
NaT = _pandas.NaT

# Shortcut to pandas dtype-introspection helpers (is_numeric_dtype, ...).
api_types = _pandas.api.types
109
+
110
+
111
def is_numeric_dtype(arr_or_dtype: Any) -> bool:
    """True when *arr_or_dtype* is (or has) a numeric dtype."""
    types = _pandas.api.types
    return types.is_numeric_dtype(arr_or_dtype)


def is_string_dtype(arr_or_dtype: Any) -> bool:
    """True when *arr_or_dtype* is (or has) a string-like dtype."""
    types = _pandas.api.types
    return types.is_string_dtype(arr_or_dtype)


def is_datetime64_any_dtype(arr_or_dtype: Any) -> bool:
    """True for datetime64 dtypes, timezone-aware or naive."""
    types = _pandas.api.types
    return types.is_datetime64_any_dtype(arr_or_dtype)


def is_bool_dtype(arr_or_dtype: Any) -> bool:
    """True when *arr_or_dtype* is (or has) a boolean dtype."""
    types = _pandas.api.types
    return types.is_bool_dtype(arr_or_dtype)
125
+
126
+
127
def is_categorical_dtype(arr_or_dtype: Any) -> bool:
    """True when *arr_or_dtype* is (or has) a pandas Categorical dtype.

    Avoids ``pandas.api.types.is_categorical_dtype``, which is deprecated in
    pandas >= 2.2, by using the documented replacement: resolve the dtype and
    check it against ``CategoricalDtype``. Accepts arrays/Series, dtype
    objects and dtype strings such as ``"category"``; non-dtype-like input
    returns False instead of raising.
    """
    dtype_like = getattr(arr_or_dtype, "dtype", arr_or_dtype)
    try:
        return isinstance(_pandas.api.types.pandas_dtype(dtype_like),
                          _pandas.CategoricalDtype)
    except TypeError:
        # pandas_dtype raises for things that are not dtype-like at all.
        return False
129
+
130
+
131
def is_integer_dtype(arr_or_dtype: Any) -> bool:
    """True when *arr_or_dtype* is (or has) an integer dtype."""
    types = _pandas.api.types
    return types.is_integer_dtype(arr_or_dtype)


def is_float_dtype(arr_or_dtype: Any) -> bool:
    """True when *arr_or_dtype* is (or has) a float dtype."""
    types = _pandas.api.types
    return types.is_float_dtype(arr_or_dtype)
137
+
138
+
139
class PandasCompat:
    """Thin shims for operations whose signatures differ across backends."""

    @staticmethod
    def value_counts_normalize(series: Any, normalize: bool = False) -> Any:
        """value_counts with optional normalisation to fractions."""
        return series.value_counts(normalize=normalize)

    @staticmethod
    def apply_with_meta(df: Any, func: Any, meta: Any = None, **kwargs: Any) -> Any:
        """DataFrame.apply; *meta* is accepted for Spark/dask parity but unused here."""
        return df.apply(func, **kwargs)

    @staticmethod
    def groupby_apply(grouped: Any, func: Any, **kwargs: Any) -> Any:
        """GroupBy.apply pass-through."""
        return grouped.apply(func, **kwargs)


# Shared singleton used throughout the package.
compat = PandasCompat()
154
+
155
# Public surface of the compat layer; keep in sync with the definitions above
# and with the re-exports from .detection and .ops.
__all__ = [
    "pd",
    "DataFrame",
    "Series",
    "Timestamp",
    "Timedelta",
    "DatetimeIndex",
    "CategoricalDtype",
    "NA",
    "NaT",
    "is_spark_available",
    "is_pandas_api_on_spark",
    "get_pandas",
    "to_pandas",
    "to_spark_pandas",
    "ensure_pandas_series",
    "concat",
    "merge",
    "api_types",
    "is_numeric_dtype",
    "is_string_dtype",
    "is_datetime64_any_dtype",
    "is_bool_dtype",
    "is_categorical_dtype",
    "is_integer_dtype",
    "is_float_dtype",
    "get_spark_session",
    "set_spark_config",
    "enable_arrow_optimization",
    "configure_spark_pandas",
    "compat",
    "PandasCompat",
    "is_databricks",
    "is_notebook",
    "get_display_function",
    "get_dbutils",
    "ops",
    "DataOps",
]
@@ -0,0 +1,99 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from typing import Any, Callable, Optional
5
+
6
# Probe for a Spark-backed pandas implementation exactly once at import time.
# _PANDAS_API_ON_SPARK distinguishes modern pyspark.pandas from legacy koalas.
_SPARK_PANDAS_AVAILABLE = False
_PANDAS_API_ON_SPARK = False

try:
    import pyspark.pandas as ps
    _SPARK_PANDAS_AVAILABLE = True
    _PANDAS_API_ON_SPARK = True
except ImportError:
    # Fall back to koalas, shipped on older Databricks runtimes.
    try:
        import databricks.koalas as ps
        _SPARK_PANDAS_AVAILABLE = True
    except ImportError:
        pass
22
+
23
+
24
def is_spark_available() -> bool:
    """Whether a pandas-on-Spark (or koalas) backend was importable."""
    return bool(_SPARK_PANDAS_AVAILABLE)


def is_pandas_api_on_spark() -> bool:
    """Whether the modern pyspark.pandas API (rather than koalas) is in use."""
    return bool(_PANDAS_API_ON_SPARK)


def is_databricks() -> bool:
    """Detect a Databricks runtime via its version environment variable."""
    runtime = os.environ.get("DATABRICKS_RUNTIME_VERSION")
    return bool(runtime)
34
+
35
+
36
def is_notebook() -> bool:
    """True when running inside a Jupyter/Databricks notebook kernel."""
    # get_ipython is injected into builtins by IPython; plain Python raises NameError.
    try:
        shell_name = type(get_ipython()).__name__  # type: ignore[name-defined]
    except NameError:
        return False
    return shell_name in ("ZMQInteractiveShell", "DatabricksShell", "Shell")
42
+
43
+
44
def get_spark_session() -> Optional[Any]:
    """Return the active SparkSession, or None when Spark is unavailable."""
    if not _SPARK_PANDAS_AVAILABLE:
        return None
    try:
        from pyspark.sql import SparkSession
        return SparkSession.getActiveSession()
    except Exception:
        # Broad catch is deliberate: session lookup can fail in many ways and
        # callers treat "no session" uniformly.
        return None
52
+
53
+
54
def get_display_function() -> Callable[[str], None]:
    """Best HTML renderer for the current environment.

    Preference order: Databricks displayHTML, then Jupyter rich display,
    then plain print.
    """
    if is_databricks():
        try:
            return displayHTML  # type: ignore[name-defined]  # injected by Databricks
        except NameError:
            pass
    if not is_notebook():
        return print
    from IPython.display import HTML, display
    return lambda html: display(HTML(html))
64
+
65
+
66
def get_dbutils() -> Optional[Any]:
    """Return the Databricks dbutils handle, or None outside Databricks."""
    if not is_databricks():
        return None
    # Notebooks expose dbutils as an injected global; jobs may not.
    try:
        return dbutils  # type: ignore[name-defined]
    except NameError:
        pass
    spark = get_spark_session()
    if not spark:
        return None
    try:
        from pyspark.dbutils import DBUtils
    except ImportError:
        return None
    return DBUtils(spark)
80
+
81
+
82
def set_spark_config(key: str, value: Any) -> None:
    """Set a Spark conf key on the active session; no-op without a session."""
    session = get_spark_session()
    if session:
        session.conf.set(key, value)


def enable_arrow_optimization() -> None:
    """Turn on Arrow-based pandas conversion for the active Spark session."""
    set_spark_config("spark.sql.execution.arrow.pyspark.enabled", "true")
90
+
91
+
92
def configure_spark_pandas(compute_max_rows: int = 1000, display_max_rows: int = 100) -> None:
    """Apply row-limit options to the pandas-on-Spark API, if it is in use."""
    if not _PANDAS_API_ON_SPARK:
        return
    try:
        import pyspark.pandas as ps
        ps.set_option("compute.max_rows", compute_max_rows)
        ps.set_option("display.max_rows", display_max_rows)
    except Exception:
        # Option names vary across pyspark versions; configuration is best-effort.
        pass
@@ -0,0 +1,48 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, List, Optional, Union
4
+
5
+ import pandas as pd
6
+
7
+ from . import pandas_backend
8
+ from .detection import is_spark_available
9
+
10
+
11
class DataOps:
    """Facade routing DataFrame operations to the Spark or pandas backend."""

    def __init__(self):
        # Resolved once per instance; Spark availability is fixed for a process.
        self._use_spark = is_spark_available()

    def _get_backend(self) -> Any:
        # Import the Spark backend lazily so pandas-only installs never load pyspark.
        if not self._use_spark:
            return pandas_backend
        from . import spark_backend
        return spark_backend

    def read_csv(self, path: str, **kwargs: Any) -> pd.DataFrame:
        """Read a CSV file via the active backend."""
        return self._get_backend().read_csv(path, **kwargs)

    def read_delta(self, path: str, version: Optional[int] = None) -> pd.DataFrame:
        """Read a Delta table, optionally pinned to a historical *version*."""
        return self._get_backend().read_delta(path, version=version)

    def write_delta(self, df: Union[pd.DataFrame, Any], path: str, mode: str = "overwrite",
                    partition_by: Optional[List[str]] = None) -> None:
        """Write *df* to a Delta table at *path*."""
        self._get_backend().write_delta(df, path, mode=mode, partition_by=partition_by)

    def get_missing_stats(self, df: Union[pd.DataFrame, Any]) -> Dict[str, float]:
        """Fraction of missing values per column."""
        return self._get_backend().get_missing_stats(df)

    def correlation_matrix(self, df: Union[pd.DataFrame, Any],
                           columns: Optional[List[str]] = None) -> pd.DataFrame:
        """Correlation matrix over *columns* (default: all numeric columns)."""
        return self._get_backend().correlation_matrix(df, columns=columns)

    def get_dtype_info(self, df: Union[pd.DataFrame, Any]) -> Dict[str, str]:
        """Column-name to dtype-string mapping."""
        return self._get_backend().get_dtype_info(df)

    def sample(self, df: Union[pd.DataFrame, Any], n: int, random_state: int = 42) -> pd.DataFrame:
        """Deterministic sample of up to *n* rows."""
        return self._get_backend().sample(df, n=n, random_state=random_state)

    def concat(self, dfs: List[Union[pd.DataFrame, Any]], axis: int = 0) -> pd.DataFrame:
        """Concatenate frames along *axis*."""
        return self._get_backend().concat(dfs, axis=axis)


# Module-level singleton used across the package.
ops = DataOps()
@@ -0,0 +1,57 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ import pandas as pd
6
+
7
+ try:
8
+ import deltalake
9
+ DELTA_RS_AVAILABLE = True
10
+ except ImportError:
11
+ DELTA_RS_AVAILABLE = False
12
+
13
+
14
def read_csv(path: str, **kwargs: Any) -> pd.DataFrame:
    """Read a CSV file from *path* into a pandas DataFrame.

    All keyword arguments are passed straight through to ``pd.read_csv``.
    """
    frame = pd.read_csv(path, **kwargs)
    return frame
16
+
17
+
18
def read_delta(path: str, version: Optional[int] = None) -> pd.DataFrame:
    """Load a Delta table at *path* as a pandas DataFrame via delta-rs.

    When *version* is given, the table is read as of that historical
    version (time travel); otherwise the latest snapshot is returned.
    Raises ImportError if the ``deltalake`` package is not installed.
    """
    if not DELTA_RS_AVAILABLE:
        raise ImportError("deltalake package required: pip install deltalake")
    table_kwargs = {} if version is None else {"version": version}
    table = deltalake.DeltaTable(path, **table_kwargs)
    return table.to_pandas()
26
+
27
+
28
def write_delta(df: pd.DataFrame, path: str, mode: str = "overwrite",
                partition_by: Optional[List[str]] = None) -> None:
    """Persist *df* as a Delta table at *path* using delta-rs.

    *mode* and *partition_by* are forwarded to ``write_deltalake``.
    Raises ImportError if the ``deltalake`` package is not installed.
    """
    if DELTA_RS_AVAILABLE:
        from deltalake import write_deltalake
        write_deltalake(path, df, mode=mode, partition_by=partition_by)
        return
    raise ImportError("deltalake package required: pip install deltalake")
34
+
35
+
36
def get_missing_stats(df: pd.DataFrame) -> Dict[str, float]:
    """Return the fraction of missing (null) values for each column.

    For an empty DataFrame every column maps to 0.0 — the previous
    ``sum() / len(df)`` form divided by zero and produced NaN per column.
    """
    if len(df) == 0:
        return {col: 0.0 for col in df.columns}
    # mean() of the boolean null-mask equals sum()/len() for non-empty frames.
    return df.isnull().mean().to_dict()
38
+
39
+
40
def correlation_matrix(df: pd.DataFrame, columns: Optional[List[str]] = None) -> pd.DataFrame:
    """Correlation matrix over *columns*, or over all numeric columns when omitted."""
    subset = df[columns] if columns else df.select_dtypes(include=["number"])
    return subset.corr()
44
+
45
+
46
def get_dtype_info(df: pd.DataFrame) -> Dict[str, str]:
    """Map each column name to the string form of its pandas dtype."""
    info: Dict[str, str] = {}
    for name, kind in df.dtypes.items():
        info[name] = str(kind)
    return info
48
+
49
+
50
def sample(df: pd.DataFrame, n: int, random_state: int = 42) -> pd.DataFrame:
    """Return a reproducible random sample of at most *n* rows.

    Requests larger than the frame are capped at ``len(df)`` so pandas
    never raises for over-sized *n*.
    """
    size = n if n < len(df) else len(df)
    return df.sample(n=size, random_state=random_state)
52
+
53
+
54
def concat(dfs: List[pd.DataFrame], axis: int = 0, ignore_index: bool = True) -> pd.DataFrame:
    """Concatenate DataFrames row-wise (axis=0) or column-wise (axis=1).

    ``ignore_index`` is honoured only for row-wise concatenation; column
    concatenation always keeps the existing row index.
    """
    effective_ignore = False if axis == 1 else ignore_index
    return pd.concat(dfs, axis=axis, ignore_index=effective_ignore)
@@ -0,0 +1,75 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ try:
6
+ import pyspark.pandas as ps
7
+ from pyspark.sql import SparkSession
8
+ SPARK_AVAILABLE = True
9
+ except ImportError:
10
+ SPARK_AVAILABLE = False
11
+
12
+
13
def _get_spark() -> Any:
    """Return the active SparkSession, creating one if none exists."""
    if not SPARK_AVAILABLE:
        raise ImportError("pyspark required for Spark backend")
    session = SparkSession.getActiveSession()
    if session is None:
        session = SparkSession.builder.getOrCreate()
    return session
17
+
18
+
19
def read_csv(path: str, **kwargs: Any) -> Any:
    """Read a CSV into a pandas-on-Spark DataFrame; kwargs go to ``ps.read_csv``."""
    if SPARK_AVAILABLE:
        return ps.read_csv(path, **kwargs)
    raise ImportError("pyspark required")
23
+
24
+
25
def read_delta(path: str, version: Optional[int] = None) -> Any:
    """Load a Delta table (optionally time-travelled to *version*) as pandas-on-Spark."""
    if not SPARK_AVAILABLE:
        raise ImportError("pyspark required")
    reader = _get_spark().read.format("delta")
    if version is not None:
        # Delta time travel: read the table as of the requested version.
        reader = reader.option("versionAsOf", version)
    spark_df = reader.load(path)
    return spark_df.to_pandas_on_spark()
33
+
34
+
35
def write_delta(df: Any, path: str, mode: str = "overwrite",
                partition_by: Optional[List[str]] = None) -> None:
    """Write a pandas-on-Spark or native Spark DataFrame to a Delta table."""
    if not SPARK_AVAILABLE:
        raise ImportError("pyspark required")
    # pandas-on-Spark frames expose to_spark(); native Spark frames pass through.
    if hasattr(df, "to_spark"):
        df = df.to_spark()
    writer = df.write.format("delta").mode(mode)
    if partition_by:
        writer = writer.partitionBy(*partition_by)
    writer.save(path)
44
+
45
+
46
def get_missing_stats(df: Any) -> Dict[str, float]:
    """Return the fraction of missing (null) values per column.

    NOTE(review): this materialises the whole frame on the driver via
    ``to_pandas`` — acceptable for profiling-sized data, potentially
    expensive for very large Spark frames.
    """
    if not SPARK_AVAILABLE:
        raise ImportError("pyspark required")
    pdf = df.to_pandas() if hasattr(df, "to_pandas") else df
    if len(pdf) == 0:
        # Avoid 0/0 -> NaN on empty frames; report 0.0 missing per column.
        return {col: 0.0 for col in pdf.columns}
    return (pdf.isnull().sum() / len(pdf)).to_dict()
51
+
52
+
53
def correlation_matrix(df: Any, columns: Optional[List[str]] = None) -> Any:
    """Correlation matrix computed on the driver after converting to pandas.

    Uses *columns* when given, otherwise all numeric columns.
    """
    if not SPARK_AVAILABLE:
        raise ImportError("pyspark required")
    subset = df[columns] if columns else df.select_dtypes(include=["number"])
    return subset.to_pandas().corr()
59
+
60
+
61
def get_dtype_info(df: Any) -> Dict[str, str]:
    """Map each column name to the string representation of its dtype.

    Works for any frame exposing a pandas-style ``dtypes`` attribute.
    """
    info: Dict[str, str] = {}
    for name, kind in df.dtypes.items():
        info[name] = str(kind)
    return info
63
+
64
+
65
def sample(df: Any, n: int, random_state: int = 42) -> Any:
    """Approximate reproducible sample of up to *n* rows.

    Spark fraction-based sampling is approximate, so the fraction is
    over-requested where needed and the result trimmed with ``head(n)``;
    the output may still contain slightly fewer than *n* rows.
    """
    if not SPARK_AVAILABLE:
        raise ImportError("pyspark required")
    total = len(df)
    if total == 0:
        # Previously n / len(df) raised ZeroDivisionError on an empty frame;
        # an empty frame samples to itself.
        return df
    fraction = min(1.0, n / total)
    return df.sample(frac=fraction, random_state=random_state).head(n)
70
+
71
+
72
def concat(dfs: List[Any], axis: int = 0, ignore_index: bool = True) -> Any:
    """Concatenate a list of pandas-on-Spark DataFrames along *axis*."""
    if SPARK_AVAILABLE:
        return ps.concat(dfs, axis=axis, ignore_index=ignore_index)
    raise ImportError("pyspark required")
@@ -0,0 +1,11 @@
1
# Public API of the orchestration core package: component base types,
# shared enums, the orchestrator itself, and the component registry.
from .base import Component, ComponentResult, ComponentStatus
from .enums import ModelType, Severity
from .orchestrator import Orchestrator, OrchestratorResult
from .registry import ComponentRegistration, ComponentRegistry, get_default_registry

# Names re-exported via `from <package> import *`.
__all__ = [
    "Component", "ComponentResult", "ComponentStatus",
    "ComponentRegistry", "ComponentRegistration", "get_default_registry",
    "Orchestrator", "OrchestratorResult",
    "Severity", "ModelType"
]
@@ -0,0 +1,79 @@
1
+ import time
2
+ from abc import ABC, abstractmethod
3
+ from dataclasses import dataclass, field
4
+ from enum import Enum
5
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional
6
+
7
+ if TYPE_CHECKING:
8
+ from customer_retention.generators.orchestration.context import PipelineContext
9
+
10
+
11
class ComponentStatus(str, Enum):
    """Lifecycle state of a pipeline component run.

    Inherits from ``str`` so values serialize directly (e.g. in JSON/dicts).
    """
    PENDING = "pending"      # Registered but not yet started.
    RUNNING = "running"      # Currently executing.
    COMPLETED = "completed"  # Finished successfully.
    FAILED = "failed"        # Finished with an error.
    SKIPPED = "skipped"      # Not executed for this run.
17
+
18
+
19
@dataclass
class ComponentResult:
    """Outcome of a single component run: status plus artifacts, metrics and diagnostics."""
    success: bool
    status: ComponentStatus
    artifacts: Dict[str, str] = field(default_factory=dict)
    metrics: Dict[str, float] = field(default_factory=dict)
    errors: List[str] = field(default_factory=list)
    warnings: List[str] = field(default_factory=list)
    duration_seconds: float = 0.0
    output_data: Optional[Any] = None

    def get_summary(self) -> str:
        """One-line human-readable summary, e.g. ``COMPLETED in 1.2s``."""
        label = self.status.value.upper()
        return f"{label} in {self.duration_seconds:.1f}s"

    def to_dict(self) -> Dict[str, Any]:
        """Plain-dict view of the result (``output_data`` is not included)."""
        payload: Dict[str, Any] = {
            "success": self.success,
            "status": self.status.value,
        }
        payload["artifacts"] = self.artifacts
        payload["metrics"] = self.metrics
        payload["errors"] = self.errors
        payload["warnings"] = self.warnings
        payload["duration_seconds"] = self.duration_seconds
        return payload
43
+
44
+
45
class Component(ABC):
    """Abstract base for pipeline components.

    Subclasses implement ``validate_inputs`` and ``run``; this base class
    provides a wall-clock timer and uniform ``ComponentResult`` construction.
    """

    def __init__(self, name: str, chapters: List[int]):
        self.name = name          # Human-readable component name.
        self.chapters = chapters  # Chapter numbers this component covers.
        # Set by _start_timer(); None means the timer was never started.
        self._start_time: Optional[float] = None

    @abstractmethod
    def validate_inputs(self, context: "PipelineContext") -> List[str]:
        """Return a list of human-readable error strings; empty means valid."""

    @abstractmethod
    def run(self, context: "PipelineContext") -> ComponentResult:
        """Execute the component against *context* and report the outcome."""

    def should_skip(self, context: "PipelineContext") -> bool:
        """Hook for subclasses; the default never skips."""
        return False

    def create_result(self, success: bool, artifacts: Optional[Dict[str, str]] = None,
                      metrics: Optional[Dict[str, float]] = None, errors: Optional[List[str]] = None,
                      warnings: Optional[List[str]] = None, output_data: Optional[Any] = None) -> ComponentResult:
        """Build a ComponentResult, stamping the elapsed time since ``_start_timer``.

        Status is COMPLETED when *success* is true, FAILED otherwise.
        """
        # Identity check against the None sentinel (not truthiness), so a
        # start time that happens to be 0.0 would still be honoured.
        duration = time.time() - self._start_time if self._start_time is not None else 0.0
        status = ComponentStatus.COMPLETED if success else ComponentStatus.FAILED
        return ComponentResult(
            success=success,
            status=status,
            artifacts=artifacts or {},
            metrics=metrics or {},
            errors=errors or [],
            warnings=warnings or [],
            duration_seconds=duration,
            output_data=output_data
        )

    def _start_timer(self) -> None:
        """Record the wall-clock start used by ``create_result`` for duration."""
        self._start_time = time.time()
@@ -0,0 +1,13 @@
1
# Concrete pipeline components, one per pipeline stage.
from .deployer import Deployer
from .explainer import Explainer
from .feature_eng import FeatureEngineer
from .ingester import Ingester
from .profiler import Profiler
from .trainer import Trainer
from .transformer import Transformer
from .validator import Validator

# Names re-exported via `from <package>.components import *`.
__all__ = [
    "Ingester", "Profiler", "Transformer", "FeatureEngineer",
    "Trainer", "Validator", "Explainer", "Deployer"
]
@@ -0,0 +1,26 @@
1
+ from typing import List
2
+
3
+ from customer_retention.generators.orchestration.context import PipelineContext
4
+
5
+ from ..base import Component, ComponentResult
6
+
7
+
8
class Deployer(Component):
    """Deployment stage (chapter 8); ``run`` is currently a stub reporting one registered model."""

    def __init__(self):
        super().__init__(name="Deployer", chapters=[8])

    def validate_inputs(self, context: PipelineContext) -> List[str]:
        """Deployment requires model results to be present on the context."""
        if context.model_results:
            return []
        return ["No model results available for deployment"]

    def run(self, context: PipelineContext) -> ComponentResult:
        self._start_timer()
        try:
            metrics = {"models_registered": 1}
            return self.create_result(success=True, metrics=metrics)
        except Exception as exc:
            # Surface any failure as a failed result rather than propagating.
            return self.create_result(success=False, errors=[str(exc)])
@@ -0,0 +1,26 @@
1
+ from typing import List
2
+
3
+ from customer_retention.generators.orchestration.context import PipelineContext
4
+
5
+ from ..base import Component, ComponentResult
6
+
7
+
8
class Explainer(Component):
    """Explanation stage (chapter 7); ``run`` is currently a stub reporting one explanation."""

    def __init__(self):
        super().__init__(name="Explainer", chapters=[7])

    def validate_inputs(self, context: PipelineContext) -> List[str]:
        """Explanations require model results to be present on the context."""
        if context.model_results:
            return []
        return ["No model results available for explanation"]

    def run(self, context: PipelineContext) -> ComponentResult:
        self._start_timer()
        try:
            metrics = {"explanations_generated": 1}
            return self.create_result(success=True, metrics=metrics)
        except Exception as exc:
            # Surface any failure as a failed result rather than propagating.
            return self.create_result(success=False, errors=[str(exc)])
@@ -0,0 +1,33 @@
1
+ from typing import List
2
+
3
+ from customer_retention.generators.orchestration.context import PipelineContext
4
+
5
+ from ..base import Component, ComponentResult
6
+
7
+
8
class FeatureEngineer(Component):
    """Feature-engineering stage (chapter 4): runs ``engineer_all`` over the
    current DataFrame and advances the context's stage to "gold"."""

    def __init__(self):
        super().__init__(name="FeatureEngineer", chapters=[4])

    def validate_inputs(self, context: PipelineContext) -> List[str]:
        """A current DataFrame must be present before features can be built."""
        if context.current_df is None:
            return ["No DataFrame available for feature engineering"]
        return []

    def run(self, context: PipelineContext) -> ComponentResult:
        self._start_timer()
        try:
            # Deferred import; resolved only when the component actually runs.
            from customer_retention.stages.features.feature_engineer import FeatureEngineer as FE
            engineered = FE().engineer_all(context.current_df, context.column_configs)
            context.current_df = engineered
            context.current_stage = "gold"
            artifacts = {"gold_data": context.gold_path} if context.gold_path else {}
            return self.create_result(
                success=True,
                artifacts=artifacts,
                metrics={"feature_count": len(engineered.columns)}
            )
        except Exception as exc:
            # Surface any failure as a failed result rather than propagating.
            return self.create_result(success=False, errors=[str(exc)])