churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,488 @@
1
+ from dataclasses import dataclass
2
+ from typing import Dict, List, Optional
3
+
4
+ import pandas as pd
5
+
6
+ from .window_recommendation import WINDOW_DAYS_MAP
7
+
8
# Window labels evaluated by analyze_temporal_coverage when the caller does
# not pass its own candidate_windows.
DEFAULT_CANDIDATE_WINDOWS = [
    "7d",
    "30d",
    "90d",
    "180d",
    "365d",
    "all_time",
]

# Gap detection flags a period when its event count is below
# max(1, median positive volume / GAP_THRESHOLD_MULTIPLIER).
GAP_THRESHOLD_MULTIPLIER = 3.0

# Relative change between the mean volume of the first and second half of the
# series above which the volume trend is compared as growing.
VOLUME_CHANGE_GROWING = 0.25

# Counterpart bound for a declining trend.
# NOTE(review): its use is outside the visible part of the file — confirm.
VOLUME_CHANGE_DECLINING = -0.25
12
+
13
+
14
@dataclass
class FeatureAvailability:
    """Availability record for a single column.

    The code that fills this record is not part of the visible excerpt, so
    the field notes below are inferred from the field names and types only.
    NOTE(review): confirm each against the producer before relying on it.
    """

    # Name of the column the record describes.
    column: str
    # Presumably the earliest / latest timestamps at which the column holds a
    # valid value; None is permitted by the type.
    first_valid_date: Optional[pd.Timestamp]
    last_valid_date: Optional[pd.Timestamp]
    # Presumably the number of valid rows and the number of rows inspected.
    valid_count: int
    total_count: int
    # Presumably valid_count relative to total_count — scale (0-1 vs 0-100)
    # is not established here; verify.
    coverage_pct: float
    # Category label for the availability pattern; the allowed values are
    # defined by the producer.
    availability_type: str
    # Presumably day offsets of the first/last valid dates from the overall
    # data start/end; None is permitted by the type.
    days_from_start: Optional[int]
    days_before_end: Optional[int]
25
+
26
+
27
@dataclass
class FeatureAvailabilityResult:
    """Dataset-level container bundling per-column availability records.

    The producer is outside the visible excerpt; notes on the list fields are
    inferred from their names.
    NOTE(review): confirm against the code that builds this result.
    """

    # Bounds of the analysed data and the span between them in days.
    data_start: pd.Timestamp
    data_end: pd.Timestamp
    time_span_days: int
    # One FeatureAvailability record per analysed column.
    features: List[FeatureAvailability]
    # Column names grouped by availability pattern — presumably columns that
    # start late, stop early, or both; verify.
    new_tracking: List[str]
    retired_tracking: List[str]
    partial_window: List[str]
    # Free-form recommendation dictionaries; schema is defined by the producer.
    recommendations: List[Dict]
37
+
38
+
39
@dataclass
class TemporalGap:
    """A contiguous run of low-volume periods found by gap detection."""

    # Timestamp of the first low-volume period in the run.
    start: pd.Timestamp
    # Timestamp at which the run ends (the next normal-volume period, or the
    # last period of the series when the run is still open at the end).
    end: pd.Timestamp
    # Whole days between start and end, stored as a float.
    duration_days: float
    # "minor", "moderate" or "major", derived from duration_days.
    severity: str
45
+
46
+
47
@dataclass
class EntityWindowCoverage:
    """Share of entities with at least one event inside a lookback window."""

    # Window label, e.g. "30d" or "all_time".
    window: str
    # Window length in days; None when the label has no day count, in which
    # case the window is treated as covering every entity.
    window_days: Optional[float]
    # Number of distinct entities with an event inside the window.
    active_entities: int
    # active_entities divided by the total number of distinct entities (0-1).
    coverage_pct: float
53
+
54
+
55
@dataclass
class DriftImplication:
    """Drift-risk summary derived from a temporal coverage result."""

    # Overall rating: "low", "moderate" or "high".
    risk_level: str
    # "growing", "declining" or "none", mirroring the volume trend.
    volume_drift_risk: str
    # Score in [0, 1]; higher means steadier arrival of new entities.
    population_stability: float
    # Number of data regimes: one more than the number of major gaps.
    regime_count: int
    # End timestamps of the major gaps, in detection order.
    regime_boundaries: List[pd.Timestamp]
    # End of the last major gap, or None when there are no major gaps.
    recommended_training_start: Optional[pd.Timestamp]
    # Human-readable explanations backing the rating.
    rationale: List[str]
64
+
65
+
66
@dataclass
class TemporalCoverageResult:
    """Everything analyze_temporal_coverage reports about an event dataset."""

    # Whole days between the first and last event (never negative).
    time_span_days: int
    # Earliest and latest event timestamps.
    first_event: pd.Timestamp
    last_event: pd.Timestamp
    # Low-volume runs detected in the event-count series.
    gaps: List[TemporalGap]
    # One coverage entry per candidate lookback window.
    entity_window_coverage: List[EntityWindowCoverage]
    # Trend label and the relative change between the two halves of the
    # event-count series.
    volume_trend: str
    volume_change_pct: float
    # Human-readable recommendations built from the findings above.
    recommendations: List[str]
    # Event counts per period, named "event_count".
    events_over_time: pd.Series
    # Count of entities first seen in each period, named "new_entities".
    new_entities_over_time: pd.Series
78
+
79
+
80
def analyze_temporal_coverage(
    df: pd.DataFrame, entity_column: str, time_column: str,
    candidate_windows: Optional[List[str]] = None,
    reference_date: Optional[pd.Timestamp] = None,
) -> TemporalCoverageResult:
    """Summarise how events and entities are spread over time.

    Args:
        df: Event-level frame with one row per event.
        entity_column: Column identifying the entity of each event.
        time_column: Column holding event times; parsed with pd.to_datetime.
        candidate_windows: Lookback window labels to evaluate. Defaults to
            DEFAULT_CANDIDATE_WINDOWS when None.
        reference_date: Anchor for the lookback windows. Defaults to the
            latest event time when None.

    Returns:
        A TemporalCoverageResult with the overall span, per-period event and
        new-entity counts, detected gaps, per-window entity coverage, the
        volume trend and the resulting recommendations.
    """
    event_times = pd.to_datetime(df[time_column])
    earliest = event_times.min()
    latest = event_times.max()
    span_days = max(0, (latest - earliest).days)

    anchor = latest if reference_date is None else reference_date
    window_labels = DEFAULT_CANDIDATE_WINDOWS if candidate_windows is None else candidate_windows

    # Period size depends on how long the data spans.
    bucket_freq, gap_freq = _choose_freq(span_days)

    # Events per period.
    timeline = pd.DataFrame({"_t": event_times, "_e": df[entity_column].values})
    timeline = timeline.set_index("_t").sort_index()
    event_counts = timeline.resample(bucket_freq).size()
    event_counts.name = "event_count"

    # Entities per period, counted at each entity's first event.
    first_seen = df.assign(_t=event_times).groupby(entity_column)["_t"].min()
    arrivals = pd.DataFrame({"_count": 1}, index=first_seen.values)
    arrivals.index.name = "_t"
    arrival_counts = arrivals.resample(bucket_freq)["_count"].sum().fillna(0).astype(int)
    arrival_counts.name = "new_entities"

    detected_gaps = _detect_gaps(event_counts, gap_freq)
    window_coverage = _compute_entity_window_coverage(
        df, entity_column, event_times, anchor, window_labels,
    )
    trend_label, trend_change = _assess_volume_trend(event_counts)
    advice = _build_recommendations(
        detected_gaps, trend_label, trend_change, span_days, window_coverage,
    )

    return TemporalCoverageResult(
        time_span_days=span_days,
        first_event=earliest,
        last_event=latest,
        gaps=detected_gaps,
        entity_window_coverage=window_coverage,
        volume_trend=trend_label,
        volume_change_pct=trend_change,
        recommendations=advice,
        events_over_time=event_counts,
        new_entities_over_time=arrival_counts,
    )
117
+
118
+
119
def derive_drift_implications(result: TemporalCoverageResult) -> DriftImplication:
    """Translate a temporal coverage result into a drift-risk assessment.

    Each major gap is treated as a regime boundary at the gap's end; the last
    such boundary, if any, becomes the recommended training start.

    Args:
        result: Output of analyze_temporal_coverage.

    Returns:
        A DriftImplication carrying the overall risk level, its component
        signals and the rationale strings that explain them.
    """
    severe_gaps = [gap for gap in result.gaps if gap.severity == "major"]
    boundaries = [gap.end for gap in severe_gaps]
    regimes = len(boundaries) + 1
    training_start = None if not boundaries else boundaries[-1]

    volume_risk = _volume_to_drift_risk(result.volume_trend)
    stability = _compute_population_stability(result.new_entities_over_time)

    overall = _assess_overall_drift_risk(
        volume_risk, stability, regimes, result.time_span_days,
    )
    reasons = _build_drift_rationale(
        volume_risk, result.volume_change_pct, stability,
        regimes, result.time_span_days, severe_gaps,
    )

    return DriftImplication(
        risk_level=overall,
        volume_drift_risk=volume_risk,
        population_stability=stability,
        regime_count=regimes,
        regime_boundaries=boundaries,
        recommended_training_start=training_start,
        rationale=reasons,
    )
139
+
140
+
141
+ def _volume_to_drift_risk(volume_trend: str) -> str:
142
+ if volume_trend == "growing":
143
+ return "growing"
144
+ if volume_trend == "declining":
145
+ return "declining"
146
+ return "none"
147
+
148
+
149
+ def _compute_population_stability(new_entities: pd.Series) -> float:
150
+ if len(new_entities) < 4:
151
+ return 0.5
152
+ total_new = new_entities.sum()
153
+ if total_new == 0:
154
+ return 1.0
155
+ mid = len(new_entities) // 2
156
+ second_half_new = new_entities.iloc[mid:].sum()
157
+ fresh_fraction = second_half_new / total_new
158
+ positive = new_entities[new_entities > 0]
159
+ burstiness = min(1.0, (positive.std() / positive.mean()) / 2.0) if len(positive) >= 2 and positive.mean() > 0 else 0.5
160
+ return round(max(0.0, min(1.0, 1.0 - fresh_fraction * 0.6 - burstiness * 0.4)), 4)
161
+
162
+
163
+ def _assess_overall_drift_risk(
164
+ volume_drift_risk: str, population_stability: float,
165
+ regime_count: int, time_span_days: int,
166
+ ) -> str:
167
+ risk_score = 0.0
168
+ if volume_drift_risk != "none":
169
+ risk_score += 0.3 if volume_drift_risk == "growing" else 0.4
170
+ if population_stability < 0.5:
171
+ risk_score += 0.3
172
+ elif population_stability < 0.7:
173
+ risk_score += 0.15
174
+ if regime_count > 1:
175
+ risk_score += 0.2 * min(regime_count - 1, 3)
176
+ if time_span_days < 90:
177
+ risk_score += 0.3
178
+ if risk_score < 0.25:
179
+ return "low"
180
+ if risk_score < 0.5:
181
+ return "moderate"
182
+ return "high"
183
+
184
+
185
def _build_drift_rationale(
    volume_drift_risk: str, volume_change_pct: float,
    population_stability: float, regime_count: int,
    time_span_days: int, major_gaps: List[TemporalGap],
) -> List[str]:
    """Explain, in plain sentences, which signals drive the drift risk.

    Messages are emitted in a fixed order: volume trend, regime breaks,
    population stability, observation span. When none of those apply a
    single "all clear" message is returned instead.

    Args:
        volume_drift_risk: "growing", "declining" or anything else for none.
        volume_change_pct: Relative volume change, formatted as a percentage.
        population_stability: Stability score; below 0.5 is reported as low,
            below 0.7 as moderate.
        regime_count: Number of data regimes; more than one is reported.
        time_span_days: Observation span; shorter than 90 days is reported.
        major_gaps: Gaps whose durations are summed for the regime message.

    Returns:
        A non-empty list of rationale strings.
    """
    reasons = []

    # Volume trend.
    if volume_drift_risk == "declining":
        message = (
            f"Volume declining ({volume_change_pct:+.0%}) — feature distributions "
            "computed over recent windows will differ from historical baselines"
        )
        reasons.append(message)
    elif volume_drift_risk == "growing":
        message = (
            f"Volume growing ({volume_change_pct:+.0%}) — earlier periods have sparser "
            "data; model trained on full history may underweight recent patterns"
        )
        reasons.append(message)

    # Regime breaks caused by major gaps.
    if regime_count > 1:
        gap_days_total = sum(gap.duration_days for gap in major_gaps)
        message = (
            f"{regime_count} distinct data regimes separated by {len(major_gaps)} major "
            f"gap(s) ({gap_days_total:.0f}d total) — training across regime boundaries "
            "mixes incompatible distributions"
        )
        reasons.append(message)

    # Population stability.
    if population_stability < 0.5:
        message = (
            f"Low population stability ({population_stability:.2f}) — entity influx is "
            "highly uneven, indicating population composition drift"
        )
        reasons.append(message)
    elif population_stability < 0.7:
        message = (
            f"Moderate population stability ({population_stability:.2f}) — some variation "
            "in entity influx rate suggests gradual population shift"
        )
        reasons.append(message)

    # Observation span.
    if time_span_days < 90:
        message = (
            f"Short observation span ({time_span_days}d) — insufficient history to "
            "establish stable baselines for drift detection"
        )
        reasons.append(message)

    if len(reasons) == 0:
        reasons.append("Stable volume, consistent population influx, no regime breaks detected")
    return reasons
226
+
227
+
228
+ def _choose_freq(time_span_days: int) -> tuple:
229
+ if time_span_days <= 90:
230
+ return "D", "D"
231
+ if time_span_days <= 730:
232
+ return "W-MON", "W-MON"
233
+ return "ME", "ME"
234
+
235
+
236
def _detect_gaps(events_over_time: pd.Series, freq: str) -> List[TemporalGap]:
    """Find runs of consecutive low-volume periods in an event-count series.

    A period is low-volume when its count is below
    max(1, median of the positive counts / GAP_THRESHOLD_MULTIPLIER).
    A run is reported only when at least three whole days separate its first
    period from the period that ends it.

    Args:
        events_over_time: Event counts indexed by period timestamp.
        freq: Frequency alias of the series. Accepted for interface
            compatibility; the detection itself does not read it.

    Returns:
        Gaps in chronological order. Empty when the series has fewer than
        three periods or contains no positive counts.
    """
    if len(events_over_time) < 3:
        return []

    volumes = events_over_time.copy()
    busy_periods = volumes[volumes > 0]
    typical_volume = 0 if busy_periods.empty else busy_periods.median()
    if typical_volume == 0:
        return []
    low_volume_cutoff = max(1, typical_volume / GAP_THRESHOLD_MULTIPLIER)

    found: List[TemporalGap] = []

    def _record(run_begin, run_end):
        # Keep the run only when it spans three or more whole days.
        span_days = (run_end - run_begin).days
        if span_days >= 3:
            found.append(TemporalGap(
                start=run_begin, end=run_end,
                duration_days=float(span_days),
                severity=_classify_gap_severity(span_days),
            ))

    run_start = None
    for period_ts, count in volumes.items():
        if count < low_volume_cutoff:
            if run_start is None:
                run_start = period_ts
            continue
        # A normal-volume period closes any open run.
        if run_start is not None:
            _record(run_start, period_ts)
            run_start = None

    # A run still open at the end is closed at the last period of the series.
    if run_start is not None:
        _record(run_start, volumes.index[-1])
    return found
271
+
272
+
273
+ def _classify_gap_severity(duration_days: float) -> str:
274
+ if duration_days < 7:
275
+ return "minor"
276
+ if duration_days < 30:
277
+ return "moderate"
278
+ return "major"
279
+
280
+
281
def _compute_entity_window_coverage(
    df: pd.DataFrame, entity_column: str, times: pd.Series,
    reference_date: pd.Timestamp, windows: List[str],
) -> List[EntityWindowCoverage]:
    """Count how many distinct entities have rows inside each lookback window.

    Each window label is resolved through ``WINDOW_DAYS_MAP``. A label with
    no entry there is reported as covering every entity (``window_days=None``,
    ``coverage_pct=1.0``). Otherwise the window spans
    ``[reference_date - window_days, reference_date]`` inclusive, and
    coverage is the share of distinct entities with at least one row whose
    ``times`` value falls in that span.

    Args:
        df: Event rows.
        entity_column: Column of ``df`` identifying the entity.
        times: Per-row timestamps, used to build a boolean row mask on ``df``.
        reference_date: Upper bound of every window.
        windows: Window labels to evaluate, in output order.

    Returns:
        One ``EntityWindowCoverage`` per requested window, in input order.
    """
    entity_total = df[entity_column].nunique()

    def _coverage_for(label: str) -> EntityWindowCoverage:
        span_days = WINDOW_DAYS_MAP.get(label)
        if span_days is None:
            # Unmapped label: treated as unbounded, so every entity counts.
            return EntityWindowCoverage(
                window=label, window_days=None,
                active_entities=entity_total, coverage_pct=1.0,
            )
        window_start = reference_date - pd.Timedelta(days=span_days)
        in_window = (times >= window_start) & (times <= reference_date)
        active_count = df.loc[in_window, entity_column].nunique()
        if entity_total > 0:
            share = active_count / entity_total
        else:
            share = 0.0
        return EntityWindowCoverage(
            window=label, window_days=span_days,
            active_entities=active_count, coverage_pct=share,
        )

    return [_coverage_for(label) for label in windows]
303
+
304
+
305
+ def _assess_volume_trend(events_over_time: pd.Series) -> tuple:
306
+ if len(events_over_time) < 4:
307
+ return "stable", 0.0
308
+ mid = len(events_over_time) // 2
309
+ first_half = events_over_time.iloc[:mid].mean()
310
+ second_half = events_over_time.iloc[mid:].mean()
311
+ if first_half == 0:
312
+ change_pct = 1.0 if second_half > 0 else 0.0
313
+ else:
314
+ change_pct = (second_half - first_half) / first_half
315
+ if change_pct > VOLUME_CHANGE_GROWING:
316
+ return "growing", round(float(change_pct), 4)
317
+ if change_pct < VOLUME_CHANGE_DECLINING:
318
+ return "declining", round(float(change_pct), 4)
319
+ return "stable", round(float(change_pct), 4)
320
+
321
+
322
def _build_recommendations(
    gaps: List[TemporalGap], volume_trend: str, volume_change: float,
    time_span_days: int, coverage: List[EntityWindowCoverage],
) -> List[str]:
    """Turn temporal-coverage findings into human-readable advice strings.

    Messages are emitted, in this order, for: gaps with severity "major";
    a "declining" or "growing" volume trend; an observation span under
    90 days; and bounded windows (``window_days`` not None) whose
    ``coverage_pct`` is below 0.10.

    Returns:
        The list of messages; empty when none of the conditions hold.
    """
    advice: List[str] = []

    severe = [gap for gap in gaps if gap.severity == "major"]
    if severe:
        lost_days = sum(gap.duration_days for gap in severe)
        advice.append(
            f"Data has {len(severe)} major gap(s) totaling {lost_days:.0f} days "
            f"— consider excluding gap periods or treating them as separate epochs"
        )

    # The trend label is a single value, so at most one branch can fire.
    if volume_trend == "declining":
        advice.append(
            f"Volume declining ({volume_change:+.0%}) — recent data may underrepresent entity activity; "
            f"verify data pipeline completeness"
        )
    elif volume_trend == "growing":
        advice.append(
            f"Volume growing ({volume_change:+.0%}) — earlier periods have sparser data; "
            f"longer windows may mix density regimes"
        )

    if time_span_days < 90:
        advice.append(
            f"Limited time span ({time_span_days}d) — only short aggregation windows (7d, 30d) are reliable"
        )

    sparse_windows = [
        entry.window
        for entry in coverage
        if entry.window_days is not None and entry.coverage_pct < 0.10
    ]
    if sparse_windows:
        windows_str = ", ".join(sparse_windows)
        advice.append(f"Very few entities active in windows [{windows_str}] — these may produce mostly zeros")

    return advice
353
+
354
+
355
def analyze_feature_availability(
    df: pd.DataFrame,
    time_column: str,
    exclude_columns: Optional[List[str]] = None,
    late_start_threshold_pct: float = 10.0,
    early_end_threshold_pct: float = 10.0,
) -> FeatureAvailabilityResult:
    """Classify each column by when its non-null values appear in time.

    For every column other than ``time_column`` and ``exclude_columns``, the
    first and last timestamps carrying a non-null value are compared with
    the overall data range. A column is:

    * "empty"          — no non-null values at all;
    * "partial_window" — starts late and ends early;
    * "new_tracking"   — starts late only;
    * "retired"        — ends early only;
    * "full"           — neither.

    "Late"/"early" mean the offset in whole days exceeds the corresponding
    percentage of the total span (span is floored at 1 day).

    Args:
        df: Input rows.
        time_column: Column parsed with ``pd.to_datetime`` to obtain row times.
        exclude_columns: Extra columns to skip.
        late_start_threshold_pct: Percent of the span tolerated before the
            first valid value.
        early_end_threshold_pct: Percent of the span tolerated after the
            last valid value.

    Returns:
        A ``FeatureAvailabilityResult`` with per-column details, the three
        problem-column lists, and the generated recommendations.
    """
    timestamps = pd.to_datetime(df[time_column])
    data_start = timestamps.min()
    data_end = timestamps.max()
    time_span_days = max(1, (data_end - data_start).days)
    late_cutoff_days = time_span_days * late_start_threshold_pct / 100
    early_cutoff_days = time_span_days * early_end_threshold_pct / 100

    skipped = set(exclude_columns or []) | {time_column}
    candidates = [name for name in df.columns if name not in skipped]
    row_total = len(df)

    new_tracking: List[str] = []
    retired_tracking: List[str] = []
    partial_window: List[str] = []
    # (starts_late, ends_early) -> (label, list collecting such columns)
    problem_kinds = {
        (True, True): ("partial_window", partial_window),
        (True, False): ("new_tracking", new_tracking),
        (False, True): ("retired", retired_tracking),
    }

    features = []
    for col in candidates:
        present = df[col].notna()
        present_count = present.sum()

        if present_count == 0:
            features.append(FeatureAvailability(
                column=col, first_valid_date=None, last_valid_date=None,
                valid_count=0, total_count=row_total, coverage_pct=0,
                availability_type="empty", days_from_start=None, days_before_end=None
            ))
            continue

        coverage_pct = present_count / row_total * 100 if row_total > 0 else 0

        present_times = timestamps[present]
        first_valid = present_times.min()
        last_valid = present_times.max()
        days_from_start = (first_valid - data_start).days
        days_before_end = (data_end - last_valid).days

        starts_late = bool(days_from_start > late_cutoff_days)
        ends_early = bool(days_before_end > early_cutoff_days)
        availability_type, bucket = problem_kinds.get(
            (starts_late, ends_early), ("full", None)
        )
        if bucket is not None:
            bucket.append(col)

        features.append(FeatureAvailability(
            column=col, first_valid_date=first_valid, last_valid_date=last_valid,
            valid_count=present_count, total_count=row_total, coverage_pct=coverage_pct,
            availability_type=availability_type, days_from_start=days_from_start,
            days_before_end=days_before_end
        ))

    recommendations = _build_availability_recommendations(
        features, new_tracking, retired_tracking, partial_window, time_span_days
    )

    return FeatureAvailabilityResult(
        data_start=data_start, data_end=data_end, time_span_days=time_span_days,
        features=features, new_tracking=new_tracking, retired_tracking=retired_tracking,
        partial_window=partial_window, recommendations=recommendations
    )
418
+
419
+
420
def _find_feature(features: List[FeatureAvailability], col: str) -> Optional[FeatureAvailability]:
    """Return the first entry whose ``column`` equals ``col``, or None."""
    for candidate in features:
        if candidate.column == col:
            return candidate
    return None
422
+
423
+
424
def _build_new_tracking_rec(feat: FeatureAvailability, col: str) -> Dict:
    """Build the recommendation dict for a column whose data starts late.

    Reads ``days_from_start``, ``coverage_pct`` and ``first_valid_date``
    from ``feat``; ``first_valid_date`` must expose ``.date()``.
    """
    reason = (
        f"Tracking started {feat.days_from_start}d after data start "
        f"({feat.coverage_pct:.0f}% coverage)"
    )
    options = [
        f"Filter training data to start from {feat.first_valid_date.date()}",
        f"Create '{col}_available' indicator for models",
        "Exclude from features if coverage too low",
    ]
    return {
        "column": col,
        "issue": "new_tracking",
        "priority": "high",
        "reason": reason,
        "options": options,
    }
434
+
435
+
436
def _build_retired_rec(feat: FeatureAvailability, col: str) -> Dict:
    """Build the recommendation dict for a column whose data stops early.

    Reads ``days_before_end``, ``coverage_pct`` and ``last_valid_date``
    from ``feat``; ``last_valid_date`` must expose ``.date()``.
    """
    reason = (
        f"Tracking stopped {feat.days_before_end}d before data end "
        f"({feat.coverage_pct:.0f}% coverage)"
    )
    options = [
        f"Filter data to end at {feat.last_valid_date.date()} for this feature",
        f"Create '{col}_available' indicator",
        "Exclude if feature won't be available for scoring",
    ]
    return {
        "column": col,
        "issue": "retired",
        "priority": "high",
        "reason": reason,
        "options": options,
    }
446
+
447
+
448
def _build_partial_window_rec(feat: FeatureAvailability, col: str) -> Dict:
    """Build the recommendation dict for a column present only mid-range.

    Reads ``first_valid_date``, ``last_valid_date`` and ``coverage_pct``
    from ``feat``; both dates must expose ``.date()``.
    """
    window_start = feat.first_valid_date.date()
    window_end = feat.last_valid_date.date()
    reason = (
        f"Only available {window_start} to {window_end} "
        f"({feat.coverage_pct:.0f}% coverage)"
    )
    options = [
        "Use only within available window",
        "Consider excluding - limited applicability",
        f"Create '{col}_available' indicator if keeping",
    ]
    return {
        "column": col,
        "issue": "partial_window",
        "priority": "high",
        "reason": reason,
        "options": options,
    }
458
+
459
+
460
def _build_availability_recommendations(
    features: List[FeatureAvailability], new_tracking: List[str],
    retired_tracking: List[str], partial_window: List[str], time_span_days: int,
) -> List[Dict]:
    """Assemble per-column and general availability recommendations.

    Emits one recommendation per column in ``new_tracking``, then
    ``retired_tracking``, then ``partial_window`` (columns with no matching
    entry in ``features`` are skipped). If any such column exists, a final
    "_general_" entry about train/test splitting is appended.

    ``time_span_days`` is accepted but not used here.
    """
    # Flatten to (column, builder) pairs, keeping the category order.
    work_items = [
        (col, builder)
        for columns, builder in (
            (new_tracking, _build_new_tracking_rec),
            (retired_tracking, _build_retired_rec),
            (partial_window, _build_partial_window_rec),
        )
        for col in columns
    ]

    recs = []
    for col, builder in work_items:
        feat = _find_feature(features, col)
        if feat is None:
            continue
        recs.append(builder(feat, col))

    problem_cols = new_tracking + retired_tracking + partial_window
    if not problem_cols:
        return recs

    general_options = [
        "Ensure train/test split doesn't cross availability boundaries",
        "Use time-based split after latest tracking start date",
        "Document which features are unavailable for which periods",
    ]
    recs.append({
        "column": "_general_",
        "issue": "train_test_split",
        "priority": "high",
        "reason": f"{len(problem_cols)} columns have availability boundaries",
        "options": general_options,
    })
    return recs