churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,513 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import Any, Dict, List, Optional, Tuple
3
+
4
+ import numpy as np
5
+ import pandas as pd
6
+
7
+ from customer_retention.core.compat import DataFrame
8
+
9
+
10
@dataclass
class PatternAnalysisConfig:
    """Settings that drive temporal pattern analysis.

    The window-derived fields (``velocity_window_days``, ``rolling_window`` and
    the two momentum windows) start from fixed defaults.  They are only
    recomputed from ``aggregation_windows`` when ``_derive_window_settings``
    runs, which ``from_findings`` does automatically; plain construction keeps
    whatever values were passed in.
    """

    entity_column: str
    time_column: str
    target_column: Optional[str] = None
    # Window labels such as "7d", "2w", "3m", or a bare day count like "30".
    aggregation_windows: List[str] = field(default_factory=list)
    velocity_window_days: int = 7
    short_momentum_window: int = 7
    long_momentum_window: int = 30
    rolling_window: int = 7
    sparkline_columns: List[str] = field(default_factory=list)
    sparkline_freq: str = "W"
    sparkline_agg: str = "mean"
    has_target: bool = False
    is_event_level: bool = True

    @classmethod
    def from_findings(cls, findings: Any, target_column: Optional[str] = None,
                      window_override: Optional[List[str]] = None) -> "PatternAnalysisConfig":
        """Build a config from an exploration ``findings`` object.

        Args:
            findings: Object exposing ``time_series_metadata`` (with
                ``entity_column``, ``time_column`` and
                ``suggested_aggregations``) and ``target_column``.
            target_column: Overrides ``findings.target_column`` when truthy.
            window_override: Overrides the suggested aggregation windows when
                truthy.

        Returns:
            A config whose window settings were derived from the chosen windows.

        Raises:
            ValueError: If ``findings.time_series_metadata`` is ``None``.
        """
        ts_meta = findings.time_series_metadata
        if ts_meta is None:
            raise ValueError("Findings do not contain time series metadata. Run notebook 01a first.")

        # Precedence: explicit override, then the findings' suggestion, then a fixed fallback.
        windows = window_override or ts_meta.suggested_aggregations or ["7d", "30d", "90d"]
        target_col = target_column or findings.target_column

        config = cls(
            entity_column=ts_meta.entity_column, time_column=ts_meta.time_column,
            target_column=target_col, aggregation_windows=windows,
            has_target=target_col is not None, is_event_level=True)
        config._derive_window_settings()
        return config

    def _sorted_window_days(self) -> List[int]:
        """Return the distinct, strictly positive window lengths in days, ascending.

        Unparseable labels, zero and negative lengths are dropped, and labels
        that resolve to the same length (e.g. "7d" and "1w") are counted once.
        """
        parsed = {self._parse_window_to_days(w) for w in self.aggregation_windows}
        return sorted(days for days in parsed if days is not None and days > 0)

    def _derive_window_settings(self):
        """Recompute velocity, rolling and momentum windows from ``aggregation_windows``.

        The shortest window feeds the velocity, rolling and short-momentum
        settings.  The long-momentum window is the next distinct window, or
        four times the shortest when only one distinct window exists.  Nothing
        changes when no window can be parsed.
        """
        window_days = self._sorted_window_days()
        if not window_days:
            return
        shortest = window_days[0]
        self.velocity_window_days = shortest
        self.rolling_window = shortest
        self.short_momentum_window = shortest
        self.long_momentum_window = window_days[1] if len(window_days) >= 2 else shortest * 4

    def _parse_window_to_days(self, window: str) -> Optional[int]:
        """Convert a window label to a number of days.

        Accepts ``<n>d``, ``<n>w`` (7 days each) and ``<n>m`` (30 days each),
        case-insensitively, as well as a bare integer day count.

        Returns:
            The length in days, or ``None`` when the label is empty or cannot
            be parsed.
        """
        if not window:
            return None
        # str() lets a bare int such as 7 be parsed instead of raising AttributeError.
        w = str(window).lower().strip()
        multipliers = {"d": 1, "w": 7, "m": 30}
        for suffix, mult in multipliers.items():
            if w.endswith(suffix):
                try:
                    return int(w[:-1]) * mult
                except ValueError:
                    return None
        try:
            return int(w)
        except ValueError:
            return None

    def get_momentum_pairs(self) -> List[Tuple[int, int]]:
        """Return ``(short, long)`` day pairs for momentum features.

        Consecutive distinct windows are paired in ascending order.  When fewer
        than two distinct windows are available, the single configured
        ``(short_momentum_window, long_momentum_window)`` pair is returned.
        """
        window_days = self._sorted_window_days()
        pairs = list(zip(window_days, window_days[1:]))
        return pairs if pairs else [(self.short_momentum_window, self.long_momentum_window)]

    def format_config(self) -> str:
        """Render the configuration as a multi-line, human-readable report."""
        lines = [
            "=" * 70,
            "PATTERN ANALYSIS CONFIGURATION",
            "=" * 70,
            f"\nCore Columns:\n Entity: {self.entity_column}\n Time: {self.time_column}",
            f" Target: {self.target_column or '(none)'}",
            f"\nAggregation Windows (from findings):\n {self.aggregation_windows}",
            f"\nDerived Settings:\n Velocity window: {self.velocity_window_days} days",
            f" Rolling window: {self.rolling_window} days\n Momentum pairs: {self.get_momentum_pairs()}",
        ]
        if self.sparkline_columns:
            lines.append(f"\nSparkline Config:\n Columns: {self.sparkline_columns}")
            lines.append(f" Frequency: {self.sparkline_freq}\n Aggregation: {self.sparkline_agg}")
        return "\n".join(lines)

    def print_config(self):
        """Print the report produced by ``format_config``."""
        print(self.format_config())

    def configure_sparklines(self, df: DataFrame, columns: Optional[List[str]] = None, max_columns: int = 5):
        """Choose which columns get sparklines.

        Args:
            df: Frame used to discover numeric columns when ``columns`` is not
                given.
            columns: Explicit column names; truncated to ``max_columns``.
            max_columns: Upper bound on the number of sparkline columns.
        """
        if columns:
            self.sparkline_columns = columns[:max_columns]
            return
        # Entity, time and target columns are never candidates; target may be None.
        exclude = {self.entity_column, self.time_column, self.target_column} - {None}
        candidates = [c for c in df.select_dtypes(include=[np.number]).columns if c not in exclude]
        self.sparkline_columns = candidates[:max_columns]
104
+
105
+
106
@dataclass
class PatternAnalysisResult:
    """Flags and measurements describing which temporal patterns were found."""

    trend_detected: bool = False
    trend_direction: Optional[str] = None
    trend_strength: float = 0.0
    seasonality_detected: bool = False
    seasonality_periods: List[str] = field(default_factory=list)
    recency_effect: bool = False
    recency_correlation: float = 0.0
    cohort_effect: bool = False
    cohort_trend: Optional[str] = None
    velocity_features_recommended: List[str] = field(default_factory=list)
    momentum_features_recommended: List[str] = field(default_factory=list)

    def _describe_patterns(self):
        """Yield one description per detected pattern, in display order."""
        if self.trend_detected:
            yield f"Trend: {self.trend_direction} (strength: {self.trend_strength:.2f})"
        if self.seasonality_detected:
            yield f"Seasonality: {', '.join(self.seasonality_periods)}"
        if self.recency_effect:
            yield f"Recency effect: r={self.recency_correlation:.2f}"
        if self.cohort_effect:
            yield f"Cohort effect: {self.cohort_trend}"

    def format_summary(self) -> str:
        """Return a multi-line text summary of detected patterns and feature hints."""
        rule = "=" * 70
        report = ["\n" + rule + "\nPATTERN ANALYSIS SUMMARY\n" + rule]

        found = list(self._describe_patterns())
        if not found:
            report.append("\n No significant patterns detected")
        else:
            report.append("\nDetected Patterns:")
            report.extend(f" - {description}" for description in found)

        if self.velocity_features_recommended:
            report.append(f"\nRecommended velocity features: {self.velocity_features_recommended}")
        if self.momentum_features_recommended:
            report.append(f"Recommended momentum features: {self.momentum_features_recommended}")
        return "\n".join(report)

    def print_summary(self):
        """Print the text produced by ``format_summary``."""
        print(self.format_summary())
145
+
146
+
147
def get_sparkline_frequency(time_span_days: int) -> str:
    """Pick a frequency alias for sparklines from the data's time span in days.

    Spans of up to 60 days map to "D", up to 365 days to "W", and anything
    longer to "ME".
    """
    for upper_bound, alias in ((60, "D"), (365, "W")):
        if time_span_days <= upper_bound:
            return alias
    return "ME"
151
+
152
+
153
def _variation_score(series):
    """Return std / |mean| of the non-null values, or None when it cannot be scored.

    A column is unscorable when it has no non-null values, zero standard
    deviation, or a mean whose magnitude is below 1e-10.  A NaN ratio is
    reported as 0.
    """
    observed = series.dropna()
    if len(observed) == 0:
        return None
    spread = observed.std()
    center = abs(observed.mean())
    if spread == 0 or center < 1e-10:
        return None
    ratio = spread / center
    return 0 if np.isnan(ratio) else ratio


def select_columns_by_variance(df: DataFrame, numeric_cols: List[str], max_cols: int = 6) -> List[str]:
    """Rank columns by coefficient of variation and keep the top ``max_cols``.

    Args:
        df: Frame holding the candidate columns.
        numeric_cols: Column names to consider; names absent from ``df`` are
            ignored.
        max_cols: Maximum number of column names to return.

    Returns:
        Column names ordered from highest to lowest score.
    """
    ranking = {}
    for name in numeric_cols:
        if name in df.columns:
            score = _variation_score(df[name])
            if score is not None:
                ranking[name] = score
    ordered = sorted(ranking, key=ranking.get, reverse=True)
    return ordered[:max_cols]
167
+
168
+
169
def validate_not_event_level(
    df: DataFrame, entity_column: str, target_column: Optional[str]
) -> None:
    """Reject target comparisons on frames that hold several rows per entity.

    Does nothing when ``target_column`` is ``None``.

    Raises:
        ValueError: If ``df`` has more rows than distinct values in
            ``entity_column``.
    """
    if target_column is not None:
        entity_count = df[entity_column].nunique()
        row_count = len(df)
        if entity_count < row_count:
            message = (
                f"Target comparisons not allowed on event-level data. "
                f"Found {row_count:,} rows but only {entity_count:,} entities. "
                f"Aggregate to entity level first using TimeWindowAggregator, "
                f"or use select_columns_by_variance() for column selection."
            )
            raise ValueError(message)
182
+
183
+
184
def get_analysis_frequency(time_span_days: int) -> Tuple[str, str]:
    """Return a ``(frequency alias, display label)`` pair for a time span in days.

    Spans of up to 90 days are analysed daily, up to 365 days weekly, and
    longer spans monthly.
    """
    if time_span_days <= 90:
        alias, label = "D", "Daily"
    elif time_span_days <= 365:
        alias, label = "W", "Weekly"
    else:
        alias, label = "ME", "Monthly"
    return alias, label
188
+
189
+
190
@dataclass
class SparklineData:
    """Per-period series for one column, optionally split by target value."""

    column: str
    weeks: List
    retained_values: List[float]
    churned_values: Optional[List[float]] = None
    has_target_split: bool = False

    @property
    def divergence_score(self) -> float:
        """Gap between the retained and churned means, scaled by the larger std.

        Returns 0.0 when there is no target split, no churned series, or when
        either series has no usable (non-None, non-NaN) values.  The divisor is
        floored at 0.001 so flat series do not divide by zero.
        """
        import numpy as np

        if self.churned_values is None or not self.has_target_split:
            return 0.0

        def usable(values):
            return np.array([v for v in values if v is not None and not np.isnan(v)])

        kept = usable(self.retained_values)
        lost = usable(self.churned_values)
        if len(kept) == 0 or len(lost) == 0:
            return 0.0

        gap = abs(kept.mean() - lost.mean())
        scale = max(kept.std(), lost.std(), 0.001)
        return gap / scale
208
+
209
+
210
class SparklineDataBuilder:
    """Build per-period mean series ("sparklines") for selected columns.

    Rows are bucketed by ``time_column`` at frequency ``freq``.  When
    ``target_column`` is configured and present in the frame, each column is
    summarised separately for target == 1 ("retained") and target == 0
    ("churned"); otherwise a single overall series is produced.
    """

    # ``Series.dt.to_period`` only understands period aliases ("M", "Q", "Y").
    # Callers may hand over the offset-style aliases ("ME", "QE", "YE") used
    # for resampling, so translate those before bucketing.
    _OFFSET_TO_PERIOD_ALIAS = {"ME": "M", "QE": "Q", "YE": "Y"}

    def __init__(self, entity_column: str, time_column: str,
                 target_column: Optional[str] = None, freq: str = "W"):
        self.entity_column = entity_column
        self.time_column = time_column
        self.target_column = target_column
        self.freq = freq

    def _period_freq(self) -> str:
        """Return ``self.freq`` as an alias accepted by ``to_period``."""
        return self._OFFSET_TO_PERIOD_ALIAS.get(self.freq, self.freq)

    def build(self, df: DataFrame, columns: List[str]) -> Tuple[List["SparklineData"], bool]:
        """Compute sparkline data for every requested column found in ``df``.

        Args:
            df: Event-level frame containing the entity and time columns.
            columns: Column names to summarise; names absent from the frame
                are skipped silently.

        Returns:
            ``(sparklines, has_target)`` where ``has_target`` tells whether the
            retained/churned split was applied.

        The input frame is not modified; helper columns are added to a
        working copy only.
        """
        import pandas as pd
        has_target = self.target_column is not None and self.target_column in df.columns
        if has_target:
            # Defined elsewhere in this module; presumably rejects a target
            # that varies between events of one entity -- confirm there.
            validate_not_event_level(df, self.entity_column, self.target_column)
        df_work = self._prepare_working_df(df, has_target)
        timestamps = pd.to_datetime(df_work[self.time_column])
        df_work['_period'] = timestamps.dt.to_period(self._period_freq()).dt.start_time
        results = [self._build_sparkline_for_column(df_work, col, has_target)
                   for col in columns if col in df_work.columns]
        return results, has_target

    def _prepare_working_df(self, df: DataFrame, has_target: bool) -> DataFrame:
        """Return a new frame carrying a ``_target`` column.

        With a target, the first target value per entity is merged back onto
        every row of that entity.  Without one, ``_target`` is the constant 1.
        """
        if has_target:
            entity_target = df.groupby(self.entity_column)[self.target_column].first()
            return df.merge(
                entity_target.reset_index().rename(columns={self.target_column: '_target'}),
                on=self.entity_column)
        df_work = df.copy()
        df_work['_target'] = 1
        return df_work

    def _build_sparkline_for_column(self, df_work: DataFrame, col: str, has_target: bool) -> "SparklineData":
        """Aggregate ``col`` to per-period means, split by ``_target`` if requested."""
        import numpy as np
        if has_target:
            retained = df_work[df_work['_target'] == 1].groupby('_period')[col].mean()
            churned = df_work[df_work['_target'] == 0].groupby('_period')[col].mean()
            # Align both series on the union of periods; gaps become NaN.
            all_periods = sorted(set(retained.index) | set(churned.index))
            retained_vals = [retained.get(p, np.nan) for p in all_periods]
            churned_vals = [churned.get(p, np.nan) for p in all_periods]
        else:
            overall = df_work.groupby('_period')[col].mean()
            all_periods, retained_vals, churned_vals = sorted(overall.index), overall.tolist(), None
        return SparklineData(column=col, weeks=all_periods, retained_values=retained_vals,
                             churned_values=churned_vals, has_target_split=has_target)

    def format_summary(self, sparkline_data: List["SparklineData"], has_target: bool) -> str:
        """Render a text banner plus one divergence line per target-split column."""
        lines = ["=" * 70]
        if has_target:
            lines.append("SPARKLINE COMPARISON: Retained vs Churned Trends\n" + "=" * 70)
            lines.append("\n Retained (target=1) | Churned (target=0)\n")
        else:
            lines.append("SPARKLINE TRENDS: Overall Patterns\n" + "=" * 70)
        for data in sparkline_data:
            if data.has_target_split:
                lines.append(f" {data.column}: divergence={data.divergence_score:.2f}")
        return "\n".join(lines)

    def print_summary(self, sparkline_data: List["SparklineData"], has_target: bool):
        """Print the text produced by :meth:`format_summary`."""
        print(self.format_summary(sparkline_data, has_target))
267
+
268
+
269
@dataclass
class FindingsValidationResult:
    """Outcome of a findings check: validity flag, missing sections, warnings."""

    valid: bool
    missing_sections: List[str] = field(default_factory=list)
    warnings: List[str] = field(default_factory=list)

    def format_summary(self) -> str:
        """Return a newline-joined report of missing sections and warnings.

        Missing sections are listed only when the result is not valid;
        warnings are always listed.
        """
        report: List[str] = []
        if not self.valid:
            report.append("MISSING REQUIRED ANALYSIS:")
            report.extend(f" - {name}" for name in self.missing_sections)
        report.extend(f" Warning: {note}" for note in self.warnings)
        return "\n".join(report)

    def print_summary(self):
        """Print the text produced by :meth:`format_summary`."""
        print(self.format_summary())
287
+
288
+
289
def validate_temporal_findings(findings: Any) -> FindingsValidationResult:
    """Check that ``findings`` carries the temporal analysis results needed downstream.

    A missing ``time_series_metadata`` or an empty ``temporal_patterns`` entry
    in ``findings.metadata`` makes the result invalid.  Absent aggregation
    suggestions and absent trend/recency/momentum sections only add warnings.
    """
    missing_sections: List[str] = []
    notes: List[str] = []

    ts_meta = findings.time_series_metadata
    if ts_meta is None:
        missing_sections.append("time_series_metadata (run 01a first)")
    elif not ts_meta.suggested_aggregations:
        notes.append("No aggregation windows defined - defaults will be used")

    metadata = findings.metadata
    patterns = metadata.get("temporal_patterns", {}) if metadata else {}
    if patterns:
        notes.extend(
            f"No {section} analysis found in 01c"
            for section in ("trend", "recency", "momentum")
            if section not in patterns
        )
    else:
        missing_sections.append("temporal_patterns (run 01c first)")

    return FindingsValidationResult(
        valid=not missing_sections,
        missing_sections=missing_sections,
        warnings=notes,
    )
311
+
312
+
313
@dataclass
class AggregationFeatureConfig:
    """Feature names and flags gathered from the ``temporal_patterns`` findings.

    Each ``*_features`` list holds the feature names recommended by the
    section of the same name.  ``priority_features`` holds the de-duplicated
    names flagged as high priority, in first-seen order.
    """

    trend_features: List[str] = field(default_factory=list)
    seasonality_features: List[str] = field(default_factory=list)
    cohort_features: List[str] = field(default_factory=list)
    recency_features: List[str] = field(default_factory=list)
    categorical_features: List[str] = field(default_factory=list)
    velocity_features: List[str] = field(default_factory=list)
    momentum_features: List[str] = field(default_factory=list)
    lag_features: List[str] = field(default_factory=list)
    sparkline_features: List[str] = field(default_factory=list)
    priority_features: List[str] = field(default_factory=list)
    text_pca_columns: List[str] = field(default_factory=list)
    scaling_recommendations: List[Dict[str, Any]] = field(default_factory=list)
    divergent_columns: List[str] = field(default_factory=list)
    feature_flags: Dict[str, Any] = field(default_factory=dict)

    @staticmethod
    def _recommendations(pattern_meta: Dict[str, Any], section: str) -> List[Dict[str, Any]]:
        """Return the recommendation list of ``section``; empty if absent or None."""
        return (pattern_meta.get(section) or {}).get("recommendations") or []

    @classmethod
    def _section_features(cls, pattern_meta: Dict[str, Any], section: str) -> List[str]:
        """Concatenate the ``features`` lists of every recommendation in ``section``."""
        names: List[str] = []
        for rec in cls._recommendations(pattern_meta, section):
            names.extend(rec.get("features") or [])
        return names

    @classmethod
    def _collect_priority_features(cls, pattern_meta: Dict[str, Any]) -> List[str]:
        """Return unique priority feature names in deterministic first-seen order.

        A recommendation contributes when its ``priority`` is ``"high"`` or
        ``1``, when an ``effect_size`` entry has action ``prioritize_feature``,
        or when a ``predictive_power`` entry has action ``include_feature``.
        """
        # A dict keeps insertion order, so the result is stable across runs
        # (a plain set would reorder strings under hash randomisation).
        ordered: Dict[str, None] = {}
        for section in ("effect_size", "predictive_power", "velocity", "momentum"):
            for rec in cls._recommendations(pattern_meta, section):
                priority = rec.get("priority")
                if priority == "high" or priority == 1:
                    for name in rec.get("features") or []:
                        ordered.setdefault(name)
                    if rec.get("feature"):
                        ordered.setdefault(rec["feature"])
        for section, action in (("effect_size", "prioritize_feature"),
                                ("predictive_power", "include_feature")):
            for rec in cls._recommendations(pattern_meta, section):
                if rec.get("action") == action and rec.get("feature"):
                    ordered.setdefault(rec["feature"])
        return list(ordered)

    @classmethod
    def from_findings(cls, findings: Any) -> "AggregationFeatureConfig":
        """Build a config from ``findings.metadata["temporal_patterns"]``.

        Missing metadata, sections, or recommendation lists are treated as
        empty.  Text PCA columns come from ``_extract_text_pca_columns``.
        """
        metadata = findings.metadata
        pattern_meta = (metadata.get("temporal_patterns") or {}) if metadata else {}

        # Cohort recommendations can explicitly opt out of feature creation.
        cohort_features = [
            name
            for rec in cls._recommendations(pattern_meta, "cohort")
            if rec.get("action") != "skip_cohort_features"
            for name in rec.get("features") or []
        ]
        scaling_recs = [
            rec for rec in cls._recommendations(pattern_meta, "sparkline")
            if rec.get("action") in ("robust_scale", "normalize")
        ]

        return cls(
            trend_features=cls._section_features(pattern_meta, "trend"),
            seasonality_features=cls._section_features(pattern_meta, "seasonality"),
            cohort_features=cohort_features,
            recency_features=cls._section_features(pattern_meta, "recency"),
            categorical_features=cls._section_features(pattern_meta, "categorical"),
            velocity_features=cls._section_features(pattern_meta, "velocity"),
            momentum_features=cls._section_features(pattern_meta, "momentum"),
            lag_features=cls._section_features(pattern_meta, "lag"),
            sparkline_features=cls._section_features(pattern_meta, "sparkline"),
            text_pca_columns=_extract_text_pca_columns(findings),
            priority_features=cls._collect_priority_features(pattern_meta),
            scaling_recommendations=scaling_recs,
            divergent_columns=(pattern_meta.get("momentum") or {}).get("_divergent_columns") or [],
            feature_flags=pattern_meta.get("feature_flags") or {},
        )

    def get_all_features(self) -> List[str]:
        """Return every section's features plus text PCA columns, de-duplicated in order.

        ``priority_features`` is not part of this list.
        """
        all_feats = (
            self.trend_features + self.seasonality_features + self.cohort_features
            + self.recency_features + self.categorical_features + self.velocity_features
            + self.momentum_features + self.lag_features + self.sparkline_features
            + self.text_pca_columns
        )
        return list(dict.fromkeys(all_feats))

    def get_priority_features(self) -> List[str]:
        """Return the priority feature list."""
        return self.priority_features

    def format_summary(self) -> str:
        """Return a multi-line listing of every non-empty part of the config."""
        lines = ["=" * 70, "AGGREGATION FEATURE CONFIG", "=" * 70]
        leading = [
            ("\nTrend features", self.trend_features),
            ("Seasonality features", self.seasonality_features),
            ("Cohort features", self.cohort_features),
            ("Recency features", self.recency_features),
            ("Categorical features", self.categorical_features),
            ("Velocity features", self.velocity_features),
            ("Momentum features", self.momentum_features),
            ("Lag features", self.lag_features),
            ("Sparkline features", self.sparkline_features),
            ("\nPriority features (from effect size/IV)", self.priority_features),
        ]
        trailing = [
            ("\nDivergent columns", self.divergent_columns),
            ("Text PCA columns", self.text_pca_columns),
            ("\nFeature flags", self.feature_flags),
        ]
        lines.extend(f"{label}: {value}" for label, value in leading if value)
        if self.scaling_recommendations:
            lines.append(f"Scaling recommendations: {len(self.scaling_recommendations)} features")
        lines.extend(f"{label}: {value}" for label, value in trailing if value)
        return "\n".join(lines)

    def format_recommendation_summary(self) -> str:
        """Return a per-section table of feature counts with a total row."""
        # NOTE: the config keeps one combined priority list, so it is reported
        # under both "effect_size" and "predictive_power" and is counted twice
        # in the total.
        sections = [
            ("trend", self.trend_features),
            ("seasonality", self.seasonality_features),
            ("recency", self.recency_features),
            ("cohort", self.cohort_features),
            ("velocity", self.velocity_features),
            ("momentum", self.momentum_features),
            ("lag", self.lag_features),
            ("sparkline", self.sparkline_features),
            ("effect_size", self.priority_features),
            ("predictive_power", self.priority_features),
            ("text_pca", self.text_pca_columns),
        ]
        lines = ["RECOMMENDATION APPLICATION SUMMARY", "=" * 50]
        lines.append(f"{'Section':<20} {'Features':>8}")
        lines.append("-" * 30)
        total = 0
        for name, features in sections:
            n = len(features)
            total += n
            lines.append(f"{name:<20} {n:>8}")
        lines.append("-" * 30)
        lines.append(f"{'Total':<20} {total:>8}")
        if self.feature_flags:
            lines.append(f"\nFeature flags: {self.feature_flags}")
        if self.scaling_recommendations:
            lines.append(f"Scaling recs: {len(self.scaling_recommendations)}")
        return "\n".join(lines)

    def print_recommendation_summary(self):
        """Print the text produced by :meth:`format_recommendation_summary`."""
        print(self.format_recommendation_summary())

    def print_summary(self):
        """Print the text produced by :meth:`format_summary`."""
        print(self.format_summary())
460
+
461
+
462
def _extract_text_pca_columns(findings: Any) -> List[str]:
    """Collect ``component_columns`` from every entry of ``findings.text_processing``.

    Returns an empty list when ``text_processing`` is missing or empty;
    entries without ``component_columns`` contribute nothing.
    """
    processing = getattr(findings, "text_processing", None)
    if not processing:
        return []
    return [
        name
        for entry in processing.values()
        for name in (getattr(entry, "component_columns", None) or [])
    ]
471
+
472
+
473
def get_duplicate_event_count(findings: Any) -> int:
    """Read ``metadata["temporal_quality"]["issues"]["duplicate_events"]`` from ``findings``.

    Any missing or falsy level along that path yields 0.
    """
    meta = getattr(findings, "metadata", None)
    quality = (meta or {}).get("temporal_quality")
    issue_counts = (quality or {}).get("issues")
    return (issue_counts or {}).get("duplicate_events", 0)
477
+
478
+
479
def deduplicate_events(df: DataFrame, entity_column: str, time_column: str, duplicate_count: int = 0) -> Tuple[DataFrame, int]:
    """Drop rows sharing the same entity and timestamp, keeping the first.

    Nothing is done (and the input frame itself is returned) when
    ``duplicate_count`` is zero or negative.

    Returns:
        ``(frame, removed)`` where ``removed`` is the number of rows dropped.
    """
    if duplicate_count <= 0:
        return df, 0
    key_columns = [entity_column, time_column]
    unique_rows = df.drop_duplicates(subset=key_columns, keep="first")
    removed = len(df) - len(unique_rows)
    return unique_rows, removed
485
+
486
+
487
def create_recency_bucket_feature(df: DataFrame, recency_column: str = "days_since_last_event") -> DataFrame:
    """Add a ``recency_bucket`` label column derived from ``recency_column``.

    Buckets are 0-7, 8-30, 31-90, 91-180 and >180 days.  The input frame is
    returned untouched when ``recency_column`` is absent; otherwise a copy
    with the new object-dtype column is returned.  Rows with a missing
    recency value get a missing bucket.
    """
    if recency_column not in df.columns:
        return df
    bucket_edges = [0, 7, 30, 90, 180, float("inf")]
    bucket_names = ["0-7d", "8-30d", "31-90d", "91-180d", ">180d"]
    result = df.copy()
    buckets = pd.cut(
        result[recency_column],
        bins=bucket_edges,
        labels=bucket_names,
        include_lowest=True,
    )
    result["recency_bucket"] = buckets.astype("object")
    # Make sure missing recency stays missing after the object conversion.
    missing_recency = result[recency_column].isna()
    result.loc[missing_recency, "recency_bucket"] = np.nan
    return result
496
+
497
+
498
def create_momentum_ratio_features(df: DataFrame, momentum_recs: List[Dict[str, Any]]) -> DataFrame:
    """Add short-window / long-window mean ratios described by ``momentum_recs``.

    For each recommendation with a ``source_column`` and both window sizes,
    the ratio ``{source}_mean_{short}d / {source}_mean_{long}d`` is stored as
    ``{source}_momentum_{short}_{long}``.  Recommendations whose input columns
    are absent are skipped.  A zero denominator or any missing ratio becomes
    1.0.  Works on a copy; the input frame is left unchanged.
    """
    result = df.copy()
    for rec in momentum_recs:
        windows = rec.get("params", {})
        short_w = windows.get("short_window")
        long_w = windows.get("long_window")
        source = rec.get("source_column", "")
        if not short_w or not long_w or not source:
            continue
        numerator = f"{source}_mean_{short_w}d"
        denominator = f"{source}_mean_{long_w}d"
        if numerator in result.columns and denominator in result.columns:
            # Zero denominators are turned into NaN so the division cannot
            # produce inf; the NaN is then replaced by the fill value 1.0.
            ratio = result[numerator] / result[denominator].replace(0, np.nan)
            result[f"{source}_momentum_{short_w}_{long_w}"] = ratio.fillna(1.0)
    return result