churnkit 0.75.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0
customer_retention/stages/profiling/type_detector.py
@@ -0,0 +1,382 @@
from typing import Optional

from customer_retention.core.compat import DataFrame, is_datetime64_any_dtype, is_numeric_dtype, is_string_dtype, pd
from customer_retention.core.config.column_config import ColumnType, DatasetGranularity

from .profile_result import GranularityResult, TypeConfidence, TypeInference


class TypeDetector:
    IDENTIFIER_PATTERNS = ["id", "key", "code", "uuid", "guid"]
    TARGET_PATTERNS_PRIMARY = ["churned", "retained", "churn", "retention", "attrition"]
    TARGET_PATTERNS_SECONDARY = [
        "unsubscribe", "unsubscribed", "terminate", "terminated", "cancel", "cancelled",
        "close", "closed", "discontinue", "discontinued", "exit", "exited", "leave", "left",
    ]
    TARGET_PATTERNS_GENERIC = ["target", "label", "outcome", "class", "flag"]
    CYCLICAL_DAY_PATTERNS = ["mon", "tue", "wed", "thu", "fri", "sat", "sun", "monday", "tuesday", "wednesday"]
    CYCLICAL_MONTH_PATTERNS = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]

    def __init__(self):
        self.evidence = []

    def detect_type(self, series: pd.Series, column_name: str) -> TypeInference:
        self.evidence = []

        if self.is_identifier(series, column_name):
            return TypeInference(
                inferred_type=ColumnType.IDENTIFIER,
                confidence=TypeConfidence.HIGH,
                evidence=self.evidence.copy()
            )

        if self.is_target(series, column_name):
            return TypeInference(
                inferred_type=ColumnType.TARGET,
                confidence=TypeConfidence.HIGH,
                evidence=self.evidence.copy()
            )

        if self.is_binary(series):
            return TypeInference(
                inferred_type=ColumnType.BINARY,
                confidence=TypeConfidence.HIGH,
                evidence=self.evidence.copy()
            )

        if self.is_datetime(series):
            return TypeInference(
                inferred_type=ColumnType.DATETIME,
                confidence=TypeConfidence.HIGH,
                evidence=self.evidence.copy()
            )

        if is_numeric_dtype(series):
            return self.detect_numeric_type(series)

        if is_string_dtype(series) or series.dtype == object:
            return self.detect_categorical_type(series)

        return TypeInference(
            inferred_type=ColumnType.UNKNOWN,
            confidence=TypeConfidence.LOW,
            evidence=["Could not determine type"]
        )

    def is_identifier(self, series: pd.Series, column_name: str) -> bool:
        column_lower = column_name.lower()
        if any(pattern in column_lower for pattern in self.IDENTIFIER_PATTERNS):
            self.evidence.append("Column name contains identifier pattern")
            return True

        if len(series) == 0:
            return False

        if is_datetime64_any_dtype(series):
            return False

        if is_numeric_dtype(series):
            return False

        distinct_count = series.nunique()
        distinct_ratio = distinct_count / len(series)

        if distinct_ratio == 1.0 and distinct_count <= 100:
            if series.dtype == object:
                sample = series.dropna().head(100)
                if len(sample) > 0:
                    parseable_count = 0
                    for value in sample:
                        try:
                            pd.to_datetime(value, format='mixed')
                            parseable_count += 1
                        except (ValueError, TypeError):
                            pass

                    if parseable_count / len(sample) > 0.8:
                        return False

            self.evidence.append("All values are unique (100%)")
            return True

        return False

    def is_target(self, series: pd.Series, column_name: str) -> bool:
        column_lower = column_name.lower()
        distinct_count = series.nunique()
        if distinct_count > 10:
            return False

        for pattern in self.TARGET_PATTERNS_PRIMARY:
            if pattern in column_lower:
                self.evidence.append(f"Column name contains primary target pattern '{pattern}' with {distinct_count} classes")
                return True

        for pattern in self.TARGET_PATTERNS_SECONDARY:
            if pattern in column_lower:
                self.evidence.append(f"Column name contains secondary target pattern '{pattern}' with {distinct_count} classes")
                return True

        for pattern in self.TARGET_PATTERNS_GENERIC:
            if pattern in column_lower:
                self.evidence.append(f"Column name contains generic target pattern '{pattern}' with {distinct_count} classes")
                return True

        return False

    def is_binary(self, series: pd.Series) -> bool:
        distinct_count = series.nunique()
        if distinct_count != 2:
            return False

        unique_values = set(series.dropna().unique())

        binary_sets = [
            {0, 1}, {0.0, 1.0},
            {True, False},
            {"0", "1"},
            {"yes", "no"}, {"Yes", "No"}, {"YES", "NO"},
            {"true", "false"}, {"True", "False"}, {"TRUE", "FALSE"},
            {"y", "n"}, {"Y", "N"}
        ]

        for binary_set in binary_sets:
            if unique_values == binary_set or unique_values.issubset(binary_set):
                self.evidence.append(f"Exactly 2 unique values: {unique_values}")
                return True

        if distinct_count == 2:
            self.evidence.append(f"Exactly 2 unique values (non-standard): {unique_values}")
            return True

        return False

    def is_datetime(self, series: pd.Series) -> bool:
        if is_datetime64_any_dtype(series):
            self.evidence.append("Column is datetime dtype")
            return True

        if series.dtype == object:
            sample = series.dropna().head(100)
            if len(sample) == 0:
                return False

            parseable_count = 0
            for value in sample:
                try:
                    pd.to_datetime(value, format='mixed')
                    parseable_count += 1
                except (ValueError, TypeError):
                    pass

            if parseable_count / len(sample) > 0.8:
                self.evidence.append(f"{parseable_count}/{len(sample)} values parseable as datetime")
                return True

        return False

    def detect_numeric_type(self, series: pd.Series) -> TypeInference:
        distinct_count = series.nunique()

        if distinct_count <= 20:
            self.evidence.append(f"Numeric with {distinct_count} unique values (≤20)")
            return TypeInference(
                inferred_type=ColumnType.NUMERIC_DISCRETE,
                confidence=TypeConfidence.MEDIUM,
                evidence=self.evidence.copy(),
                alternatives=[ColumnType.NUMERIC_CONTINUOUS]
            )

        self.evidence.append(f"Numeric with {distinct_count} unique values (>20)")
        return TypeInference(
            inferred_type=ColumnType.NUMERIC_CONTINUOUS,
            confidence=TypeConfidence.HIGH,
            evidence=self.evidence.copy()
        )

    def detect_categorical_type(self, series: pd.Series) -> TypeInference:
        if len(series) == 0 or series.dropna().empty:
            return TypeInference(
                inferred_type=ColumnType.UNKNOWN,
                confidence=TypeConfidence.LOW,
                evidence=["Empty or all-null series"]
            )

        distinct_count = series.nunique()

        if self.is_cyclical_pattern(series):
            return TypeInference(
                inferred_type=ColumnType.CATEGORICAL_CYCLICAL,
                confidence=TypeConfidence.MEDIUM,
                evidence=self.evidence.copy()
            )

        if distinct_count <= 10:
            self.evidence.append(f"String with {distinct_count} unique values (≤10)")
            return TypeInference(
                inferred_type=ColumnType.CATEGORICAL_NOMINAL,
                confidence=TypeConfidence.HIGH,
                evidence=self.evidence.copy()
            )

        if distinct_count <= 100:
            self.evidence.append(f"String with {distinct_count} unique values (≤100)")
            return TypeInference(
                inferred_type=ColumnType.CATEGORICAL_NOMINAL,
                confidence=TypeConfidence.MEDIUM,
                evidence=self.evidence.copy()
            )

        self.evidence.append(f"String with {distinct_count} unique values (>100)")
        return TypeInference(
            inferred_type=ColumnType.TEXT,
            confidence=TypeConfidence.MEDIUM,
            evidence=self.evidence.copy(),
            alternatives=[ColumnType.CATEGORICAL_NOMINAL]
        )

    def is_cyclical_pattern(self, series: pd.Series) -> bool:
        sample_values = [str(v).lower() for v in series.dropna().unique()[:20]]

        if len(sample_values) == 0:
            return False

        day_matches = sum(1 for v in sample_values if any(day in v for day in self.CYCLICAL_DAY_PATTERNS))
        if day_matches >= min(3, len(sample_values)):
            self.evidence.append("Contains day name patterns (cyclical)")
            return True

        month_matches = sum(1 for v in sample_values if any(month in v for month in self.CYCLICAL_MONTH_PATTERNS))
        if month_matches >= min(3, len(sample_values)):
            self.evidence.append("Contains month name patterns (cyclical)")
            return True

        return False

    def detect_granularity(self, df: DataFrame) -> GranularityResult:
        """Detect whether dataset is entity-level or event-level (time series)."""
        evidence = []

        if df is None or len(df) == 0 or len(df.columns) == 0:
            return GranularityResult(
                granularity=DatasetGranularity.UNKNOWN,
                evidence=["Empty or invalid DataFrame"]
            )

        entity_column = self._detect_entity_column(df)
        time_column = self._detect_time_column(df)

        if entity_column is None:
            evidence.append("No clear entity/ID column detected")
            return GranularityResult(
                granularity=DatasetGranularity.UNKNOWN,
                evidence=evidence
            )

        unique_entities = df[entity_column].nunique()
        total_rows = len(df)
        avg_events = total_rows / unique_entities if unique_entities > 0 else 0

        if unique_entities == total_rows:
            evidence.append(f"Each {entity_column} appears exactly once")
            return GranularityResult(
                granularity=DatasetGranularity.ENTITY_LEVEL,
                entity_column=entity_column,
                time_column=time_column,
                unique_entities=unique_entities,
                total_rows=total_rows,
                avg_events_per_entity=1.0,
                evidence=evidence
            )

        if avg_events > 1.5 and time_column is not None:
            evidence.append(f"Multiple rows per {entity_column} (avg {avg_events:.1f})")
            evidence.append(f"Temporal column detected: {time_column}")
            return GranularityResult(
                granularity=DatasetGranularity.EVENT_LEVEL,
                entity_column=entity_column,
                time_column=time_column,
                unique_entities=unique_entities,
                total_rows=total_rows,
                avg_events_per_entity=round(avg_events, 2),
                evidence=evidence
            )

        if avg_events > 1.5:
            evidence.append(f"Multiple rows per {entity_column} but no datetime column")
            return GranularityResult(
                granularity=DatasetGranularity.EVENT_LEVEL,
                entity_column=entity_column,
                time_column=None,
                unique_entities=unique_entities,
                total_rows=total_rows,
                avg_events_per_entity=round(avg_events, 2),
                evidence=evidence
            )

        evidence.append("Could not determine granularity with confidence")
        return GranularityResult(
            granularity=DatasetGranularity.UNKNOWN,
            entity_column=entity_column,
            time_column=time_column,
            evidence=evidence
        )

    def _detect_entity_column(self, df: DataFrame) -> Optional[str]:
        """Find the most likely entity/ID column."""
        candidates = []

        for col in df.columns:
            col_lower = col.lower()

            if any(pattern in col_lower for pattern in self.IDENTIFIER_PATTERNS):
                unique_ratio = df[col].nunique() / len(df)
                if 0.01 < unique_ratio < 1.0:
                    candidates.append((col, unique_ratio, "name_match"))
                elif unique_ratio == 1.0:
                    candidates.append((col, unique_ratio, "unique_id"))

        if not candidates:
            for col in df.columns:
                if df[col].dtype == object or str(df[col].dtype).startswith("str"):
                    unique_ratio = df[col].nunique() / len(df)
                    if 0.01 < unique_ratio < 0.5:
                        candidates.append((col, unique_ratio, "string_repeating"))

        if not candidates:
            return None

        for col, ratio, match_type in candidates:
            if match_type == "name_match" and ratio < 1.0:
                return col

        for col, ratio, match_type in candidates:
            if match_type == "unique_id":
                return col

        return candidates[0][0] if candidates else None

    def _detect_time_column(self, df: DataFrame) -> Optional[str]:
        """Find the most likely datetime/timestamp column."""
        for col in df.columns:
            if is_datetime64_any_dtype(df[col]):
                return col

        datetime_patterns = ["date", "time", "timestamp", "created", "updated", "sent", "event"]
        for col in df.columns:
            col_lower = col.lower()
            if any(pattern in col_lower for pattern in datetime_patterns):
                if df[col].dtype == object:
                    sample = df[col].dropna().head(20)
                    if len(sample) > 0:
                        parseable = 0
                        for val in sample:
                            try:
                                pd.to_datetime(val, format='mixed')
                                parseable += 1
                            except (ValueError, TypeError):
                                pass
                        if parseable / len(sample) > 0.8:
                            return col

        return None
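For orientation, a minimal usage sketch for the detector above. The DataFrame, column names, and values are invented for illustration (they are not taken from the package's tests), and it assumes the compat layer resolves to plain pandas (pandas >= 2.0, since the detector passes format='mixed' to pd.to_datetime):

# Hedged sketch: illustrative data only; assumes the compat layer resolves to pandas.
import pandas as pd

from customer_retention.stages.profiling.type_detector import TypeDetector

df = pd.DataFrame({
    "customer_id": ["c1", "c2", "c3", "c4"],            # matches the "id" identifier pattern
    "signup_date": ["2024-01-02", "2024-02-10",
                    "2024-03-05", "2024-04-20"],         # object dtype, parseable as datetime
    "plan": ["basic", "pro", "trial", "pro"],            # 3 distinct strings -> categorical nominal
    "churned": [0, 1, 0, 1],                             # matches the primary target pattern "churn"
})

detector = TypeDetector()
for col in df.columns:
    inference = detector.detect_type(df[col], col)
    print(col, inference.inferred_type, inference.confidence, inference.evidence)

# One row per customer_id, so this should come back ENTITY_LEVEL,
# with "signup_date" picked up as the time column via the name-pattern scan.
granularity = detector.detect_granularity(df)
print(granularity.granularity, granularity.entity_column, granularity.time_column)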
customer_retention/stages/profiling/window_recommendation.py
@@ -0,0 +1,288 @@
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

import pandas as pd

from .temporal_pattern_analyzer import SeasonalityPeriod
from .time_series_profiler import ActivitySegmentResult, LifecycleQuadrantResult

WINDOW_DAYS_MAP: Dict[str, Optional[float]] = {
    "24h": 1.0, "7d": 7.0, "14d": 14.0, "30d": 30.0,
    "90d": 90.0, "180d": 180.0, "365d": 365.0, "all_time": None,
}

SEASONALITY_WINDOW_MAP: Dict[int, str] = {
    1: "24h", 7: "7d", 14: "14d", 30: "30d", 90: "90d", 365: "365d",
}

TIMING_TOLERANCE = 0.5


@dataclass
class TemporalHeterogeneityResult:
    eta_squared_intensity: float
    eta_squared_event_count: float
    heterogeneity_level: str
    segmentation_advisory: str
    advisory_rationale: List[str]
    coverage_table: pd.DataFrame


@dataclass
class WindowUnionResult:
    windows: List[str]
    explanation: pd.DataFrame
    heterogeneity: TemporalHeterogeneityResult
    coverage_threshold: float
    feature_count_estimate: int


class WindowRecommendationCollector:
    ALL_CANDIDATE_WINDOWS = ["24h", "7d", "14d", "30d", "90d", "180d", "365d", "all_time"]

    def __init__(self, coverage_threshold: float = 0.10, always_include: Optional[List[str]] = None):
        self._coverage_threshold = coverage_threshold
        self._always_include = always_include if always_include is not None else ["all_time"]
        self._segment_lifecycles: Optional[pd.DataFrame] = None
        self._quadrant_lifecycles: Optional[pd.DataFrame] = None
        self._seasonality_periods: List[SeasonalityPeriod] = []
        self._inter_event_median: Optional[float] = None
        self._inter_event_mean: Optional[float] = None

    def add_segment_context(self, result: ActivitySegmentResult) -> None:
        self._segment_lifecycles = result.lifecycles

    def add_quadrant_context(self, result: LifecycleQuadrantResult) -> None:
        self._quadrant_lifecycles = result.lifecycles

    def add_seasonality_context(self, periods: List[SeasonalityPeriod]) -> None:
        self._seasonality_periods = periods

    def add_inter_event_context(self, median_days: float, mean_days: float) -> None:
        self._inter_event_median = median_days
        self._inter_event_mean = mean_days

    def compute_union(
        self, lifecycles: pd.DataFrame, time_span_days: int,
        min_coverage_ratio: float = 2.0,
        value_columns: int = 0, agg_funcs: int = 4,
    ) -> WindowUnionResult:
        rows = self._compute_coverage_rows(lifecycles, time_span_days, min_coverage_ratio)
        self._annotate_context(rows, lifecycles)
        selected = [r["window"] for r in rows if r["included"]]
        explanation = pd.DataFrame(rows)
        heterogeneity = self._compute_heterogeneity(lifecycles, selected)
        feature_count = value_columns * agg_funcs * len(selected) + len(selected) if value_columns > 0 else len(selected)
        return WindowUnionResult(
            windows=selected, explanation=explanation,
            heterogeneity=heterogeneity, coverage_threshold=self._coverage_threshold,
            feature_count_estimate=feature_count,
        )

    def _compute_coverage_rows(
        self, lifecycles: pd.DataFrame, time_span_days: int, min_coverage_ratio: float,
    ) -> List[Dict]:
        duration = lifecycles["duration_days"].astype(float)
        event_count = lifecycles["event_count"].astype(float)
        n = len(lifecycles)
        rows = []
        for window in self.ALL_CANDIDATE_WINDOWS:
            window_days = WINDOW_DAYS_MAP[window]
            if window_days is None:
                rows.append(self._all_time_row(n))
                continue
            has_span = duration >= window_days
            expected_events = event_count * (window_days / duration.clip(lower=1))
            has_density = expected_events >= 2
            beneficial = has_span & has_density
            coverage_pct = beneficial.mean()
            meaningful_pct = has_density[has_span].mean() if has_span.any() else 0.0
            beneficial_count = int(beneficial.sum())
            hard_excluded = time_span_days < window_days * min_coverage_ratio
            included, exclusion_reason = self._determine_inclusion(
                window, coverage_pct, hard_excluded,
            )
            rows.append({
                "window": window, "window_days": window_days,
                "coverage_pct": round(coverage_pct, 4),
                "meaningful_pct": round(meaningful_pct, 4),
                "beneficial_entities": beneficial_count,
                "primary_segments": [], "included": included,
                "exclusion_reason": exclusion_reason, "note": "",
            })
        return rows

    def _all_time_row(self, n: int) -> Dict:
        return {
            "window": "all_time", "window_days": None,
            "coverage_pct": 1.0, "meaningful_pct": 1.0,
            "beneficial_entities": n, "primary_segments": [],
            "included": True, "exclusion_reason": "", "note": "",
        }

    def _determine_inclusion(self, window: str, coverage_pct: float, hard_excluded: bool) -> Tuple[bool, str]:
        if hard_excluded:
            if window in self._always_include:
                return True, ""
            return False, f"Excluded: span < {WINDOW_DAYS_MAP[window] * 2:.0f}d required"
        if window in self._always_include:
            return True, ""
        if coverage_pct >= self._coverage_threshold:
            return True, ""
        return False, f"Coverage {coverage_pct:.1%} < threshold {self._coverage_threshold:.1%}"

    def _annotate_context(self, rows: List[Dict], lifecycles: pd.DataFrame) -> None:
        self._annotate_segments(rows, lifecycles)
        self._annotate_seasonality(rows)
        self._annotate_timing(rows)

    def _annotate_segments(self, rows: List[Dict], lifecycles: pd.DataFrame) -> None:
        context_lc = self._segment_lifecycles if self._segment_lifecycles is not None else self._quadrant_lifecycles
        if context_lc is None:
            return
        group_col = "activity_segment" if "activity_segment" in (context_lc.columns if self._segment_lifecycles is not None else []) else None
        if group_col is None and self._quadrant_lifecycles is not None and "lifecycle_quadrant" in self._quadrant_lifecycles.columns:
            group_col = "lifecycle_quadrant"
            context_lc = self._quadrant_lifecycles
        if group_col is None:
            return
        duration = context_lc["duration_days"].astype(float)
        event_count = context_lc["event_count"].astype(float)
        groups = context_lc[group_col]
        for row in rows:
            window_days = row["window_days"]
            if window_days is None:
                row["primary_segments"] = sorted(groups.unique().tolist())
                continue
            has_span = duration >= window_days
            expected_events = event_count * (window_days / duration.clip(lower=1))
            beneficial = has_span & (expected_events >= 2)
            if not beneficial.any():
                continue
            group_coverage = groups[beneficial].value_counts(normalize=True)
            top = group_coverage[group_coverage >= 0.15].index.tolist()
            row["primary_segments"] = sorted(top[:3])

    def _annotate_seasonality(self, rows: List[Dict]) -> None:
        if not self._seasonality_periods:
            return
        detected_windows = set()
        for sp in self._seasonality_periods:
            if sp.period in SEASONALITY_WINDOW_MAP:
                detected_windows.add(SEASONALITY_WINDOW_MAP[sp.period])
        for row in rows:
            if row["window"] in detected_windows:
                period_name = next(
                    (sp.period_name or f"{sp.period}d" for sp in self._seasonality_periods
                     if SEASONALITY_WINDOW_MAP.get(sp.period) == row["window"]), ""
                )
                row["note"] = f"Seasonality detected ({period_name})"

    def _annotate_timing(self, rows: List[Dict]) -> None:
        if self._inter_event_median is None:
            return
        for row in rows:
            window_days = row["window_days"]
            if window_days is None:
                continue
            ratio = self._inter_event_median / window_days if window_days > 0 else 0
            if TIMING_TOLERANCE <= ratio <= (1.0 / TIMING_TOLERANCE):
                existing = row["note"]
                timing_note = "Timing-aligned (median inter-event)"
                row["note"] = f"{existing}; {timing_note}" if existing else timing_note

    def _compute_heterogeneity(self, lifecycles: pd.DataFrame, selected_windows: List[str]) -> TemporalHeterogeneityResult:
        eta_intensity, eta_event = self._compute_eta_squared(lifecycles)
        level = self._classify_heterogeneity(max(eta_intensity, eta_event))
        cold_start_frac = self._cold_start_fraction(lifecycles)
        advisory, rationale = self._build_advisory(level, cold_start_frac, selected_windows, lifecycles)
        coverage_table = self._build_coverage_table(lifecycles, selected_windows)
        return TemporalHeterogeneityResult(
            eta_squared_intensity=eta_intensity,
            eta_squared_event_count=eta_event,
            heterogeneity_level=level,
            segmentation_advisory=advisory,
            advisory_rationale=rationale,
            coverage_table=coverage_table,
        )

    def _compute_eta_squared(self, lifecycles: pd.DataFrame) -> Tuple[float, float]:
        group_col = "lifecycle_quadrant" if "lifecycle_quadrant" in lifecycles.columns else None
        if group_col is None:
            return 0.0, 0.0
        groups = lifecycles[group_col]
        if groups.nunique() < 2:
            return 0.0, 0.0
        eta_intensity = self._eta_squared_for_variable(lifecycles, "intensity", groups)
        eta_event = self._eta_squared_for_variable(lifecycles, "event_count", groups)
        return eta_intensity, eta_event

    def _eta_squared_for_variable(self, df: pd.DataFrame, var: str, groups: pd.Series) -> float:
        if var not in df.columns:
            return 0.0
        values = df[var].astype(float)
        grand_mean = values.mean()
        ss_total = ((values - grand_mean) ** 2).sum()
        if ss_total == 0:
            return 0.0
        ss_between = 0.0
        for _, group_vals in values.groupby(groups):
            n_k = len(group_vals)
            mean_k = group_vals.mean()
            ss_between += n_k * (mean_k - grand_mean) ** 2
        return float(ss_between / ss_total)

    def _classify_heterogeneity(self, eta_max: float) -> str:
        if eta_max < 0.06:
            return "low"
        if eta_max < 0.14:
            return "moderate"
        return "high"

    def _cold_start_fraction(self, lifecycles: pd.DataFrame) -> float:
        cold_labels = {"One-shot", "One-time"}
        cold_count = 0
        for col in ("lifecycle_quadrant", "activity_segment"):
            if col in lifecycles.columns:
                cold_count = max(cold_count, lifecycles[col].isin(cold_labels).sum())
        return cold_count / len(lifecycles) if len(lifecycles) > 0 else 0.0

    def _build_advisory(
        self, level: str, cold_start_frac: float, selected_windows: List[str], lifecycles: pd.DataFrame,
    ) -> Tuple[str, List[str]]:
        rationale: List[str] = []
        if level == "low":
            rationale.append("Low temporal diversity across quadrants")
            rationale.append("Union strategy loses minimal signal")
            return "single_model", rationale
        if level == "high" and cold_start_frac > 0.30:
            rationale.append("High temporal diversity across quadrants")
            rationale.append(f"Large cold-start population ({cold_start_frac:.0%} One-time/One-shot)")
            rationale.append("Consider separate handling for entities with vs without history")
            return "consider_separate_models", rationale
        rationale.append(f"{level.capitalize()} temporal diversity across quadrants")
        rationale.append("Union windows still pragmatic for feature engineering")
        rationale.append("Model may benefit from knowing entity's engagement pattern")
        return "consider_segment_feature", rationale

    def _build_coverage_table(self, lifecycles: pd.DataFrame, selected_windows: List[str]) -> pd.DataFrame:
        duration = lifecycles["duration_days"].astype(float)
        event_count = lifecycles["event_count"].astype(float)
        rows = []
        for window in selected_windows:
            window_days = WINDOW_DAYS_MAP.get(window)
            if window_days is None:
                rows.append({"window": "all_time", "coverage_pct": 1.0, "meaningful_pct": 1.0, "zero_risk_pct": 0.0})
                continue
            has_span = duration >= window_days
            coverage = has_span.mean()
            expected_events = event_count * (window_days / duration.clip(lower=1))
            meaningful = (has_span & (expected_events >= 2)).mean()
            zero_risk = 1.0 - meaningful
            rows.append({
                "window": window,
                "coverage_pct": round(float(coverage), 4),
                "meaningful_pct": round(float(meaningful), 4),
                "zero_risk_pct": round(float(zero_risk), 4),
            })
        return pd.DataFrame(rows)
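Similarly, a hedged sketch of how the collector above might be driven end to end. The lifecycles frame is synthetic (real input comes from the upstream profilers that produce duration_days, event_count, and lifecycle_quadrant), and the printed values are only indicative. Note that _eta_squared_for_variable computes the classic ANOVA effect size η² = SS_between / SS_total, and the 0.06 / 0.14 cutoffs in _classify_heterogeneity appear to follow Cohen's conventional benchmarks for medium and large effects:

# Hedged sketch: synthetic lifecycle data; real input comes from the profiling stages.
import pandas as pd

from customer_retention.stages.profiling.window_recommendation import WindowRecommendationCollector

lifecycles = pd.DataFrame({
    "duration_days": [400.0, 35.0, 90.0, 1.0, 200.0, 10.0],
    "event_count": [120.0, 6.0, 30.0, 1.0, 40.0, 2.0],
    "lifecycle_quadrant": ["Power", "Casual", "Casual", "One-shot", "Power", "One-shot"],
})

collector = WindowRecommendationCollector(coverage_threshold=0.10)
result = collector.compute_union(lifecycles, time_span_days=365, value_columns=5, agg_funcs=4)

print(result.windows)                 # selected windows; "all_time" is always included by default
print(result.feature_count_estimate)  # value_columns * agg_funcs * n_windows + n_windows
print(result.heterogeneity.heterogeneity_level, result.heterogeneity.segmentation_advisory)
print(result.explanation[["window", "coverage_pct", "included", "exclusion_reason"]])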