churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,470 @@
1
+ """
2
+ Exploration Manager for managing multiple dataset explorations.
3
+
4
+ Provides functionality for:
5
+ - Discovering and loading exploration findings
6
+ - Managing dataset inclusion/exclusion
7
+ - Detecting relationships between datasets
8
+ - Planning aggregations for multi-dataset analysis
9
+ """
10
+ from dataclasses import dataclass, field
11
+ from pathlib import Path
12
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional
13
+
14
+ import yaml
15
+
16
+ from customer_retention.core.config.column_config import DatasetGranularity
17
+
18
+ from .findings import ExplorationFindings
19
+
20
+ if TYPE_CHECKING:
21
+ from .layered_recommendations import RecommendationRegistry
22
+
23
+
24
@dataclass
class DatasetInfo:
    """Summary record for one discovered dataset.

    Only the first six fields are required; the column hints and the
    exclusion flag default to "unknown" / "included".
    """
    # Identification and file locations.
    name: str
    findings_path: str
    source_path: str
    # Granularity enum member; serialized elsewhere in this file via ``.value``.
    granularity: DatasetGranularity
    # Size of the dataset as recorded in the findings.
    row_count: int
    column_count: int
    # Optional column hints; ``None`` means the column was not identified.
    entity_column: Optional[str] = None
    time_column: Optional[str] = None
    target_column: Optional[str] = None
    # True once the dataset has been excluded from the pipeline.
    excluded: bool = False
37
+
38
+
39
@dataclass
class DatasetRelationshipInfo:
    """Describes how two datasets link to each other through a column pair."""
    # Names of the datasets on each side of the link.
    left_dataset: str
    right_dataset: str
    # Column on each side that carries the link.
    left_column: str
    right_column: str
    # Cardinality label: one_to_one, one_to_many, many_to_many
    relationship_type: str
    # Defaults describe a manually declared, fully trusted relationship.
    confidence: float = 1.0
    auto_detected: bool = False
49
+
50
+
51
@dataclass
class AggregationPlanItem:
    """Aggregation settings for a single event dataset."""
    # Dataset the plan applies to.
    dataset_name: str
    # Columns used to group rows and to place them in time.
    entity_column: str
    time_column: str
    # Window labels, the columns to aggregate, and the functions to apply.
    windows: List[str]
    value_columns: List[str]
    agg_funcs: List[str]
60
+
61
+
62
@dataclass
class MultiDatasetFindings:
    """Findings that span several related datasets.

    Holds the discovered datasets, the relationships declared between them,
    the name of the primary entity dataset, the event-level datasets, and the
    user's exclusions. Instances round-trip through YAML via ``save`` and
    ``load``.
    """
    datasets: Dict[str, DatasetInfo] = field(default_factory=dict)
    relationships: List[DatasetRelationshipInfo] = field(default_factory=list)
    primary_entity_dataset: Optional[str] = None
    event_datasets: List[str] = field(default_factory=list)
    excluded_datasets: List[str] = field(default_factory=list)
    aggregation_windows: List[str] = field(default_factory=lambda: ["24h", "7d", "30d", "90d", "180d", "365d", "all_time"])
    notes: Dict[str, Any] = field(default_factory=dict)

    @property
    def selected_datasets(self) -> Dict[str, DatasetInfo]:
        """Return only datasets that are not excluded.

        A dataset counts as excluded when its name is listed in
        ``excluded_datasets`` or its own ``excluded`` flag is set.
        """
        return {name: info for name, info in self.datasets.items()
                if name not in self.excluded_datasets and not info.excluded}

    def exclude_dataset(self, name: str) -> None:
        """Exclude a known dataset from the pipeline; unknown names are ignored."""
        if name not in self.datasets:
            return
        if name not in self.excluded_datasets:
            self.excluded_datasets.append(name)
        self.datasets[name].excluded = True

    def select_dataset(self, name: str) -> None:
        """Re-include a previously excluded dataset."""
        if name in self.excluded_datasets:
            self.excluded_datasets.remove(name)
        if name in self.datasets:
            self.datasets[name].excluded = False

    def add_relationship(
        self,
        left_dataset: str,
        right_dataset: str,
        left_column: str,
        right_column: str,
        relationship_type: str = "one_to_many",
        confidence: float = 1.0,
    ) -> None:
        """Record a manually declared relationship between two datasets.

        The relationship is appended with ``auto_detected=False``; dataset and
        column names are stored as given, without validation.
        """
        self.relationships.append(
            DatasetRelationshipInfo(
                left_dataset=left_dataset,
                right_dataset=right_dataset,
                left_column=left_column,
                right_column=right_column,
                relationship_type=relationship_type,
                confidence=confidence,
                auto_detected=False,
            )
        )

    def get_aggregation_plan(self) -> Dict[str, AggregationPlanItem]:
        """Generate an aggregation plan for every usable event dataset.

        An event dataset gets a plan entry only when it is selected (same
        exclusion rule as ``selected_datasets``) and has both an entity column
        and a time column. ``value_columns`` is left empty for the caller to
        fill in.

        Returns:
            Mapping of dataset name to its plan item, in ``event_datasets`` order.
        """
        # Use the same exclusion rule as selected_datasets so a dataset whose
        # own ``excluded`` flag is set is skipped even if it is missing from
        # the excluded_datasets list.
        selected = self.selected_datasets
        plan: Dict[str, AggregationPlanItem] = {}

        for dataset_name in self.event_datasets:
            dataset_info = selected.get(dataset_name)
            if dataset_info is None:
                continue
            if dataset_info.entity_column and dataset_info.time_column:
                plan[dataset_name] = AggregationPlanItem(
                    dataset_name=dataset_name,
                    entity_column=dataset_info.entity_column,
                    time_column=dataset_info.time_column,
                    windows=self.aggregation_windows.copy(),
                    value_columns=[],  # To be filled by user
                    agg_funcs=["sum", "mean", "count"],
                )

        return plan

    def to_recommendation_registry(self) -> "RecommendationRegistry":
        """Create a RecommendationRegistry from the selected datasets.

        Every selected dataset is added as a source. When the primary entity
        dataset is selected, the silver layer is initialised from its entity
        column (falling back to ``"id"``) and time column, and the gold layer
        from its target column when one is set. Relationships whose two sides
        are both selected are added as silver joins once a silver layer exists.
        """
        # Imported here rather than at module level (the module only imports
        # it under TYPE_CHECKING).
        from .layered_recommendations import RecommendationRegistry

        registry = RecommendationRegistry()
        # The property rebuilds its dict on every access; compute it once.
        selected = self.selected_datasets

        for name, info in selected.items():
            registry.add_source(name, info.source_path)

        if self.primary_entity_dataset and self.primary_entity_dataset in selected:
            primary_info = self.datasets[self.primary_entity_dataset]
            entity_col = primary_info.entity_column or "id"
            time_col = primary_info.time_column
            registry.init_silver(entity_col, time_col)

            if primary_info.target_column:
                registry.init_gold(primary_info.target_column)

        for rel in self.relationships:
            if (rel.left_dataset in selected and
                    rel.right_dataset in selected and
                    registry.silver):
                # NOTE(review): only the left column is forwarded as the join
                # key; right_column is not passed on - confirm this is intended.
                registry.add_silver_join(
                    rel.left_dataset, rel.right_dataset,
                    [rel.left_column], rel.relationship_type,
                    f"Join {rel.left_dataset} with {rel.right_dataset}"
                )

        return registry

    def save(self, path: str) -> None:
        """Save multi-dataset findings to YAML at ``path`` (overwrites the file)."""
        data = {
            "datasets": {
                name: {
                    "name": info.name,
                    "findings_path": info.findings_path,
                    "source_path": info.source_path,
                    "granularity": info.granularity.value,
                    "row_count": info.row_count,
                    "column_count": info.column_count,
                    "entity_column": info.entity_column,
                    "time_column": info.time_column,
                    "target_column": info.target_column,
                    "excluded": info.excluded,
                }
                for name, info in self.datasets.items()
            },
            "relationships": [
                {
                    "left_dataset": rel.left_dataset,
                    "right_dataset": rel.right_dataset,
                    "left_column": rel.left_column,
                    "right_column": rel.right_column,
                    "relationship_type": rel.relationship_type,
                    "confidence": rel.confidence,
                    "auto_detected": rel.auto_detected,
                }
                for rel in self.relationships
            ],
            "primary_entity_dataset": self.primary_entity_dataset,
            "event_datasets": self.event_datasets,
            "excluded_datasets": self.excluded_datasets,
            "aggregation_windows": self.aggregation_windows,
            "notes": self.notes,
        }

        # Explicit encoding so the file does not depend on the platform locale.
        with open(path, "w", encoding="utf-8") as f:
            yaml.dump(data, f, default_flow_style=False, sort_keys=False)

    @classmethod
    def load(cls, path: str) -> "MultiDatasetFindings":
        """Load multi-dataset findings from a YAML file written by ``save``.

        Missing or null top-level keys fall back to the dataclass defaults, and
        an empty file yields an empty instance.

        Raises:
            KeyError: If a dataset or relationship entry lacks a required key.
        """
        with open(path, "r", encoding="utf-8") as f:
            # safe_load returns None for an empty document.
            data = yaml.safe_load(f) or {}

        datasets = {}
        for name, info in (data.get("datasets") or {}).items():
            datasets[name] = DatasetInfo(
                name=info["name"],
                findings_path=info["findings_path"],
                source_path=info["source_path"],
                granularity=DatasetGranularity(info["granularity"]),
                row_count=info["row_count"],
                column_count=info["column_count"],
                entity_column=info.get("entity_column"),
                time_column=info.get("time_column"),
                target_column=info.get("target_column"),
                excluded=info.get("excluded", False),
            )

        relationships = [
            DatasetRelationshipInfo(
                left_dataset=rel["left_dataset"],
                right_dataset=rel["right_dataset"],
                left_column=rel["left_column"],
                right_column=rel["right_column"],
                relationship_type=rel["relationship_type"],
                confidence=rel.get("confidence", 1.0),
                auto_detected=rel.get("auto_detected", False),
            )
            for rel in (data.get("relationships") or [])
        ]

        # Only pass aggregation_windows when the file provides it, so the
        # default list lives in one place (the field's default_factory).
        optional_kwargs = {}
        windows = data.get("aggregation_windows")
        if windows is not None:
            optional_kwargs["aggregation_windows"] = windows

        return cls(
            datasets=datasets,
            relationships=relationships,
            primary_entity_dataset=data.get("primary_entity_dataset"),
            event_datasets=data.get("event_datasets") or [],
            excluded_datasets=data.get("excluded_datasets") or [],
            notes=data.get("notes") or {},
            **optional_kwargs,
        )
248
+
249
+
250
+ class ExplorationManager:
251
+ """Manages multiple exploration findings."""
252
+
253
def __init__(self, explorations_dir: Path):
    """Set up a manager rooted at ``explorations_dir``.

    Args:
        explorations_dir: Directory that holds the findings files; it is
            coerced to ``Path``. Nothing is read from disk here.
    """
    root = Path(explorations_dir)
    self.explorations_dir = root
    # Both containers start empty.
    self._findings_cache: Dict[str, ExplorationFindings] = {}
    self._excluded_datasets: set = set()
257
+
258
def discover_findings(self, prefer_aggregated: bool = True) -> List[Path]:
    """Discover all findings files in the explorations directory.

    Matches ``*_findings.yaml`` directly inside ``explorations_dir`` and skips
    any file whose name contains ``multi_dataset`` (multi_dataset_findings.yaml
    has a different structure).

    Args:
        prefer_aggregated: If True, when both event-level and aggregated findings
            exist for the same dataset, only return the aggregated one.

    Returns:
        The matching paths; an empty list when the directory does not exist.
    """
    if not self.explorations_dir.exists():
        return []

    # glob yields entries in filesystem enumeration order, which differs
    # between platforms and runs; sort so callers see a stable order.
    all_files = sorted(
        f for f in self.explorations_dir.glob("*_findings.yaml")
        if "multi_dataset" not in f.name
    )

    if not prefer_aggregated:
        return all_files

    return self._filter_prefer_aggregated(all_files)
279
+
280
+ def _filter_prefer_aggregated(self, files: List[Path]) -> List[Path]:
281
+ """Filter findings files to prefer aggregated over event-level.
282
+
283
+ Groups files by base dataset name and returns aggregated version when available.
284
+ """
285
+ aggregated = {f for f in files if "_aggregated" in f.name}
286
+ non_aggregated = [f for f in files if "_aggregated" not in f.name]
287
+
288
+ if not aggregated:
289
+ return files
290
+
291
+ result = list(aggregated)
292
+ aggregated_base_names = {self._get_base_name(f) for f in aggregated}
293
+
294
+ for f in non_aggregated:
295
+ base_name = self._get_base_name(f)
296
+ if base_name not in aggregated_base_names:
297
+ result.append(f)
298
+
299
+ return result
300
+
301
+ def _get_base_name(self, path: Path) -> str:
302
+ """Extract base dataset name without hash or aggregated suffix."""
303
+ stem = path.stem.replace("_findings", "")
304
+ if "_aggregated" in stem:
305
+ stem = stem.rsplit("_aggregated", 1)[0]
306
+ parts = stem.rsplit("_", 1)
307
+ return parts[0] if len(parts) == 2 and len(parts[1]) == 6 else stem
308
+
309
def get_skipped_event_findings(self) -> List[Path]:
    """List event-level findings that are superseded by an aggregated file.

    Returns:
        Non-aggregated findings paths whose base name matches the base name
        of some aggregated findings file. Empty when the directory does not
        exist or holds no aggregated findings.
    """
    root = self.explorations_dir
    if not root.exists():
        return []

    event_level = []
    rolled_up = []
    for candidate in root.glob("*_findings.yaml"):
        if "multi_dataset" in candidate.name:
            continue
        bucket = rolled_up if "_aggregated" in candidate.name else event_level
        bucket.append(candidate)

    if not rolled_up:
        return []

    covered = {self._get_base_name(p) for p in rolled_up}
    return [p for p in event_level if self._get_base_name(p) in covered]
326
+
327
def load_findings(self, name_pattern: str) -> Optional[ExplorationFindings]:
    """Load the first discovered findings whose file stem contains the pattern.

    The match is a case-insensitive substring test. Loaded findings are
    kept in ``self._findings_cache`` keyed by the path string, so a repeated
    lookup does not load the file again.

    Returns:
        The findings for the first matching file, or None when nothing matches.
    """
    for findings_file in self.discover_findings():
        if name_pattern.lower() not in findings_file.stem.lower():
            continue
        cache_key = str(findings_file)
        if cache_key not in self._findings_cache:
            self._findings_cache[cache_key] = ExplorationFindings.load(cache_key)
        return self._findings_cache[cache_key]
    return None
335
+
336
def list_datasets(self, include_excluded: bool = False) -> List[DatasetInfo]:
    """Build a ``DatasetInfo`` for every discovered findings file.

    Each findings file is loaded on every call; the cache used by
    ``load_findings`` is not consulted here.

    Args:
        include_excluded: When False, datasets whose name is in
            ``self._excluded_datasets`` are left out of the result.

    Returns:
        One ``DatasetInfo`` per retained findings file, in discovery order.
    """
    collected = []

    for findings_file in self.discover_findings():
        findings = ExplorationFindings.load(str(findings_file))

        # Time-series findings carry their own granularity and key columns;
        # everything else is treated as entity-level.
        ts_meta = findings.time_series_metadata if findings.is_time_series else None
        if ts_meta:
            granularity = ts_meta.granularity
            entity_col = ts_meta.entity_column
            time_col = ts_meta.time_column
        else:
            granularity = DatasetGranularity.ENTITY_LEVEL
            entity_col = time_col = None

        dataset_name = self._extract_dataset_name(findings_file)
        is_excluded = dataset_name in self._excluded_datasets
        if is_excluded and not include_excluded:
            continue

        info = DatasetInfo(
            name=dataset_name,
            findings_path=str(findings_file),
            source_path=findings.source_path,
            granularity=granularity,
            row_count=findings.row_count,
            column_count=findings.column_count,
            entity_column=entity_col,
            time_column=time_col,
            target_column=findings.target_column,
            excluded=is_excluded,
        )
        collected.append(info)

    return collected
374
+
375
def create_multi_dataset_findings(
    self, dataset_names: Optional[List[str]] = None
) -> MultiDatasetFindings:
    """Create a MultiDatasetFindings from discovered datasets.

    Excluded datasets are included in the result and additionally listed
    under ``excluded_datasets``. No relationships are populated.

    The primary entity dataset is the first entity-level dataset that has a
    target column; if none has one, it is the entity-level dataset with the
    largest row count (the first such dataset on a tie). It is None when
    there is no entity-level dataset.

    Args:
        dataset_names: Optional list of dataset names to include.
            If None (or empty), all discovered datasets are included.
            Otherwise only datasets whose name is in the list are included.

    Returns:
        A MultiDatasetFindings describing the selected datasets.
    """
    datasets_info = self.list_datasets(include_excluded=True)

    # Filter to specified datasets if provided
    if dataset_names:
        datasets_info = [d for d in datasets_info if d.name in dataset_names]

    datasets = {d.name: d for d in datasets_info}
    event_datasets = [d.name for d in datasets_info
                      if d.granularity == DatasetGranularity.EVENT_LEVEL]
    excluded = [d.name for d in datasets_info if d.excluded]

    entity_level = [d for d in datasets_info
                    if d.granularity == DatasetGranularity.ENTITY_LEVEL]
    primary = None
    if entity_level:
        chosen = next((d for d in entity_level if d.target_column), None)
        if chosen is None:
            # Discovery order is arbitrary, so fall back to the largest
            # entity-level dataset rather than whichever came first.
            # `or 0` guards against a missing row count.
            chosen = max(entity_level, key=lambda d: d.row_count or 0)
        primary = chosen.name

    return MultiDatasetFindings(
        datasets=datasets,
        relationships=[],
        primary_entity_dataset=primary,
        event_datasets=event_datasets,
        excluded_datasets=excluded,
    )
413
+
414
def exclude_dataset(self, name_pattern: str) -> None:
    """Exclude a dataset from multi-dataset analysis.

    The first discovered dataset whose name contains ``name_pattern``
    (case-insensitive) is added to ``self._excluded_datasets``. Nothing
    happens when no dataset matches.

    Dataset names are derived from the findings file names only, so no
    findings file has to be parsed to perform the exclusion.
    """
    needle = name_pattern.lower()
    for findings_file in self.discover_findings():
        dataset_name = self._extract_dataset_name(findings_file)
        if needle in dataset_name.lower():
            self._excluded_datasets.add(dataset_name)
            return
420
+
421
def include_dataset(self, name_pattern: str) -> None:
    """Undo an earlier exclusion.

    Removes one excluded dataset name that contains ``name_pattern``
    (case-insensitive). Does nothing when no excluded name matches.
    """
    match = next(
        (
            excluded_name
            for excluded_name in self._excluded_datasets
            if name_pattern.lower() in excluded_name.lower()
        ),
        None,
    )
    if match:
        self._excluded_datasets.remove(match)
430
+
431
def get_aggregated_path(self, original_findings_path: str) -> Optional[str]:
    """Look up the aggregated findings path recorded for a findings file.

    The findings at ``original_findings_path`` are loaded. When they report
    ``has_aggregated_output``, the ``aggregated_findings_path`` stored in
    their time-series metadata is returned.

    Returns:
        The aggregated findings path, or None when the loaded findings do
        not report aggregated output.
    """
    loaded = ExplorationFindings.load(original_findings_path)
    if loaded.has_aggregated_output:
        return loaded.time_series_metadata.aggregated_findings_path
    return None
446
+
447
+ def _extract_dataset_name(self, path: Path) -> str:
448
+ """Extract dataset name from findings path."""
449
+ # Pattern: {name}_{hash}_findings.yaml or {name}_{hash}_aggregated_findings.yaml
450
+ stem = path.stem # e.g., "customers_abc123_findings" or "customers_abc123_aggregated_findings"
451
+
452
+ # Remove _findings suffix
453
+ stem = stem.replace("_findings", "")
454
+
455
+ # Check for _aggregated suffix
456
+ if "_aggregated" in stem:
457
+ # Keep "aggregated" as part of name to distinguish from original
458
+ parts = stem.rsplit("_aggregated", 1)
459
+ base_name = parts[0]
460
+ # Remove hash from base_name
461
+ name_parts = base_name.rsplit("_", 1)
462
+ if len(name_parts) == 2:
463
+ return f"{name_parts[0]}_aggregated"
464
+ return f"{base_name}_aggregated"
465
+
466
+ # Regular findings - remove hash
467
+ parts = stem.rsplit("_", 1)
468
+ if len(parts) == 2:
469
+ return parts[0] # Return name without hash
470
+ return stem