churnkit-0.75.0a1-py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
- churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
- churnkit-0.75.0a1.dist-info/METADATA +229 -0
- churnkit-0.75.0a1.dist-info/RECORD +302 -0
- churnkit-0.75.0a1.dist-info/WHEEL +4 -0
- churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
- churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
- customer_retention/__init__.py +37 -0
- customer_retention/analysis/__init__.py +0 -0
- customer_retention/analysis/auto_explorer/__init__.py +62 -0
- customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
- customer_retention/analysis/auto_explorer/explorer.py +258 -0
- customer_retention/analysis/auto_explorer/findings.py +291 -0
- customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
- customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
- customer_retention/analysis/auto_explorer/recommendations.py +418 -0
- customer_retention/analysis/business/__init__.py +26 -0
- customer_retention/analysis/business/ab_test_designer.py +144 -0
- customer_retention/analysis/business/fairness_analyzer.py +166 -0
- customer_retention/analysis/business/intervention_matcher.py +121 -0
- customer_retention/analysis/business/report_generator.py +222 -0
- customer_retention/analysis/business/risk_profile.py +199 -0
- customer_retention/analysis/business/roi_analyzer.py +139 -0
- customer_retention/analysis/diagnostics/__init__.py +20 -0
- customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
- customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
- customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
- customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
- customer_retention/analysis/diagnostics/noise_tester.py +140 -0
- customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
- customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
- customer_retention/analysis/discovery/__init__.py +8 -0
- customer_retention/analysis/discovery/config_generator.py +49 -0
- customer_retention/analysis/discovery/discovery_flow.py +19 -0
- customer_retention/analysis/discovery/type_inferencer.py +147 -0
- customer_retention/analysis/interpretability/__init__.py +13 -0
- customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
- customer_retention/analysis/interpretability/counterfactual.py +175 -0
- customer_retention/analysis/interpretability/individual_explainer.py +141 -0
- customer_retention/analysis/interpretability/pdp_generator.py +103 -0
- customer_retention/analysis/interpretability/shap_explainer.py +106 -0
- customer_retention/analysis/jupyter_save_hook.py +28 -0
- customer_retention/analysis/notebook_html_exporter.py +136 -0
- customer_retention/analysis/notebook_progress.py +60 -0
- customer_retention/analysis/plotly_preprocessor.py +154 -0
- customer_retention/analysis/recommendations/__init__.py +54 -0
- customer_retention/analysis/recommendations/base.py +158 -0
- customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
- customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
- customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
- customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
- customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
- customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
- customer_retention/analysis/recommendations/datetime/extract.py +149 -0
- customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
- customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
- customer_retention/analysis/recommendations/pipeline.py +74 -0
- customer_retention/analysis/recommendations/registry.py +76 -0
- customer_retention/analysis/recommendations/selection/__init__.py +3 -0
- customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
- customer_retention/analysis/recommendations/transform/__init__.py +4 -0
- customer_retention/analysis/recommendations/transform/power.py +94 -0
- customer_retention/analysis/recommendations/transform/scale.py +112 -0
- customer_retention/analysis/visualization/__init__.py +15 -0
- customer_retention/analysis/visualization/chart_builder.py +2619 -0
- customer_retention/analysis/visualization/console.py +122 -0
- customer_retention/analysis/visualization/display.py +171 -0
- customer_retention/analysis/visualization/number_formatter.py +36 -0
- customer_retention/artifacts/__init__.py +3 -0
- customer_retention/artifacts/fit_artifact_registry.py +146 -0
- customer_retention/cli.py +93 -0
- customer_retention/core/__init__.py +0 -0
- customer_retention/core/compat/__init__.py +193 -0
- customer_retention/core/compat/detection.py +99 -0
- customer_retention/core/compat/ops.py +48 -0
- customer_retention/core/compat/pandas_backend.py +57 -0
- customer_retention/core/compat/spark_backend.py +75 -0
- customer_retention/core/components/__init__.py +11 -0
- customer_retention/core/components/base.py +79 -0
- customer_retention/core/components/components/__init__.py +13 -0
- customer_retention/core/components/components/deployer.py +26 -0
- customer_retention/core/components/components/explainer.py +26 -0
- customer_retention/core/components/components/feature_eng.py +33 -0
- customer_retention/core/components/components/ingester.py +34 -0
- customer_retention/core/components/components/profiler.py +34 -0
- customer_retention/core/components/components/trainer.py +38 -0
- customer_retention/core/components/components/transformer.py +36 -0
- customer_retention/core/components/components/validator.py +37 -0
- customer_retention/core/components/enums.py +33 -0
- customer_retention/core/components/orchestrator.py +94 -0
- customer_retention/core/components/registry.py +59 -0
- customer_retention/core/config/__init__.py +39 -0
- customer_retention/core/config/column_config.py +95 -0
- customer_retention/core/config/experiments.py +71 -0
- customer_retention/core/config/pipeline_config.py +117 -0
- customer_retention/core/config/source_config.py +83 -0
- customer_retention/core/utils/__init__.py +28 -0
- customer_retention/core/utils/leakage.py +85 -0
- customer_retention/core/utils/severity.py +53 -0
- customer_retention/core/utils/statistics.py +90 -0
- customer_retention/generators/__init__.py +0 -0
- customer_retention/generators/notebook_generator/__init__.py +167 -0
- customer_retention/generators/notebook_generator/base.py +55 -0
- customer_retention/generators/notebook_generator/cell_builder.py +49 -0
- customer_retention/generators/notebook_generator/config.py +47 -0
- customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
- customer_retention/generators/notebook_generator/local_generator.py +48 -0
- customer_retention/generators/notebook_generator/project_init.py +174 -0
- customer_retention/generators/notebook_generator/runner.py +150 -0
- customer_retention/generators/notebook_generator/script_generator.py +110 -0
- customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
- customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
- customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
- customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
- customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
- customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
- customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
- customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
- customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
- customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
- customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
- customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
- customer_retention/generators/orchestration/__init__.py +23 -0
- customer_retention/generators/orchestration/code_generator.py +196 -0
- customer_retention/generators/orchestration/context.py +147 -0
- customer_retention/generators/orchestration/data_materializer.py +188 -0
- customer_retention/generators/orchestration/databricks_exporter.py +411 -0
- customer_retention/generators/orchestration/doc_generator.py +311 -0
- customer_retention/generators/pipeline_generator/__init__.py +26 -0
- customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
- customer_retention/generators/pipeline_generator/generator.py +142 -0
- customer_retention/generators/pipeline_generator/models.py +166 -0
- customer_retention/generators/pipeline_generator/renderer.py +2125 -0
- customer_retention/generators/spec_generator/__init__.py +37 -0
- customer_retention/generators/spec_generator/databricks_generator.py +433 -0
- customer_retention/generators/spec_generator/generic_generator.py +373 -0
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
- customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
- customer_retention/integrations/__init__.py +0 -0
- customer_retention/integrations/adapters/__init__.py +13 -0
- customer_retention/integrations/adapters/base.py +10 -0
- customer_retention/integrations/adapters/factory.py +25 -0
- customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
- customer_retention/integrations/adapters/feature_store/base.py +57 -0
- customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
- customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
- customer_retention/integrations/adapters/feature_store/local.py +75 -0
- customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
- customer_retention/integrations/adapters/mlflow/base.py +32 -0
- customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
- customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
- customer_retention/integrations/adapters/mlflow/local.py +50 -0
- customer_retention/integrations/adapters/storage/__init__.py +5 -0
- customer_retention/integrations/adapters/storage/base.py +33 -0
- customer_retention/integrations/adapters/storage/databricks.py +76 -0
- customer_retention/integrations/adapters/storage/local.py +59 -0
- customer_retention/integrations/feature_store/__init__.py +47 -0
- customer_retention/integrations/feature_store/definitions.py +215 -0
- customer_retention/integrations/feature_store/manager.py +744 -0
- customer_retention/integrations/feature_store/registry.py +412 -0
- customer_retention/integrations/iteration/__init__.py +28 -0
- customer_retention/integrations/iteration/context.py +212 -0
- customer_retention/integrations/iteration/feedback_collector.py +184 -0
- customer_retention/integrations/iteration/orchestrator.py +168 -0
- customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
- customer_retention/integrations/iteration/signals.py +212 -0
- customer_retention/integrations/llm_context/__init__.py +4 -0
- customer_retention/integrations/llm_context/context_builder.py +201 -0
- customer_retention/integrations/llm_context/prompts.py +100 -0
- customer_retention/integrations/streaming/__init__.py +103 -0
- customer_retention/integrations/streaming/batch_integration.py +149 -0
- customer_retention/integrations/streaming/early_warning_model.py +227 -0
- customer_retention/integrations/streaming/event_schema.py +214 -0
- customer_retention/integrations/streaming/online_store_writer.py +249 -0
- customer_retention/integrations/streaming/realtime_scorer.py +261 -0
- customer_retention/integrations/streaming/trigger_engine.py +293 -0
- customer_retention/integrations/streaming/window_aggregator.py +393 -0
- customer_retention/stages/__init__.py +0 -0
- customer_retention/stages/cleaning/__init__.py +9 -0
- customer_retention/stages/cleaning/base.py +28 -0
- customer_retention/stages/cleaning/missing_handler.py +160 -0
- customer_retention/stages/cleaning/outlier_handler.py +204 -0
- customer_retention/stages/deployment/__init__.py +28 -0
- customer_retention/stages/deployment/batch_scorer.py +106 -0
- customer_retention/stages/deployment/champion_challenger.py +299 -0
- customer_retention/stages/deployment/model_registry.py +182 -0
- customer_retention/stages/deployment/retraining_trigger.py +245 -0
- customer_retention/stages/features/__init__.py +73 -0
- customer_retention/stages/features/behavioral_features.py +266 -0
- customer_retention/stages/features/customer_segmentation.py +505 -0
- customer_retention/stages/features/feature_definitions.py +265 -0
- customer_retention/stages/features/feature_engineer.py +551 -0
- customer_retention/stages/features/feature_manifest.py +340 -0
- customer_retention/stages/features/feature_selector.py +239 -0
- customer_retention/stages/features/interaction_features.py +160 -0
- customer_retention/stages/features/temporal_features.py +243 -0
- customer_retention/stages/ingestion/__init__.py +9 -0
- customer_retention/stages/ingestion/load_result.py +32 -0
- customer_retention/stages/ingestion/loaders.py +195 -0
- customer_retention/stages/ingestion/source_registry.py +130 -0
- customer_retention/stages/modeling/__init__.py +31 -0
- customer_retention/stages/modeling/baseline_trainer.py +139 -0
- customer_retention/stages/modeling/cross_validator.py +125 -0
- customer_retention/stages/modeling/data_splitter.py +205 -0
- customer_retention/stages/modeling/feature_scaler.py +99 -0
- customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
- customer_retention/stages/modeling/imbalance_handler.py +282 -0
- customer_retention/stages/modeling/mlflow_logger.py +95 -0
- customer_retention/stages/modeling/model_comparator.py +149 -0
- customer_retention/stages/modeling/model_evaluator.py +138 -0
- customer_retention/stages/modeling/threshold_optimizer.py +131 -0
- customer_retention/stages/monitoring/__init__.py +37 -0
- customer_retention/stages/monitoring/alert_manager.py +328 -0
- customer_retention/stages/monitoring/drift_detector.py +201 -0
- customer_retention/stages/monitoring/performance_monitor.py +242 -0
- customer_retention/stages/preprocessing/__init__.py +5 -0
- customer_retention/stages/preprocessing/transformer_manager.py +284 -0
- customer_retention/stages/profiling/__init__.py +256 -0
- customer_retention/stages/profiling/categorical_distribution.py +269 -0
- customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
- customer_retention/stages/profiling/column_profiler.py +527 -0
- customer_retention/stages/profiling/distribution_analysis.py +483 -0
- customer_retention/stages/profiling/drift_detector.py +310 -0
- customer_retention/stages/profiling/feature_capacity.py +507 -0
- customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
- customer_retention/stages/profiling/profile_result.py +212 -0
- customer_retention/stages/profiling/quality_checks.py +1632 -0
- customer_retention/stages/profiling/relationship_detector.py +256 -0
- customer_retention/stages/profiling/relationship_recommender.py +454 -0
- customer_retention/stages/profiling/report_generator.py +520 -0
- customer_retention/stages/profiling/scd_analyzer.py +151 -0
- customer_retention/stages/profiling/segment_analyzer.py +632 -0
- customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
- customer_retention/stages/profiling/target_level_analyzer.py +217 -0
- customer_retention/stages/profiling/temporal_analyzer.py +388 -0
- customer_retention/stages/profiling/temporal_coverage.py +488 -0
- customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
- customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
- customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
- customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
- customer_retention/stages/profiling/text_embedder.py +87 -0
- customer_retention/stages/profiling/text_processor.py +115 -0
- customer_retention/stages/profiling/text_reducer.py +60 -0
- customer_retention/stages/profiling/time_series_profiler.py +303 -0
- customer_retention/stages/profiling/time_window_aggregator.py +376 -0
- customer_retention/stages/profiling/type_detector.py +382 -0
- customer_retention/stages/profiling/window_recommendation.py +288 -0
- customer_retention/stages/temporal/__init__.py +166 -0
- customer_retention/stages/temporal/access_guard.py +180 -0
- customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
- customer_retention/stages/temporal/data_preparer.py +178 -0
- customer_retention/stages/temporal/point_in_time_join.py +134 -0
- customer_retention/stages/temporal/point_in_time_registry.py +148 -0
- customer_retention/stages/temporal/scenario_detector.py +163 -0
- customer_retention/stages/temporal/snapshot_manager.py +259 -0
- customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
- customer_retention/stages/temporal/timestamp_discovery.py +531 -0
- customer_retention/stages/temporal/timestamp_manager.py +255 -0
- customer_retention/stages/transformation/__init__.py +13 -0
- customer_retention/stages/transformation/binary_handler.py +85 -0
- customer_retention/stages/transformation/categorical_encoder.py +245 -0
- customer_retention/stages/transformation/datetime_transformer.py +97 -0
- customer_retention/stages/transformation/numeric_transformer.py +181 -0
- customer_retention/stages/transformation/pipeline.py +257 -0
- customer_retention/stages/validation/__init__.py +60 -0
- customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
- customer_retention/stages/validation/business_sense_gate.py +173 -0
- customer_retention/stages/validation/data_quality_gate.py +235 -0
- customer_retention/stages/validation/data_validators.py +511 -0
- customer_retention/stages/validation/feature_quality_gate.py +183 -0
- customer_retention/stages/validation/gates.py +117 -0
- customer_retention/stages/validation/leakage_gate.py +352 -0
- customer_retention/stages/validation/model_validity_gate.py +213 -0
- customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
- customer_retention/stages/validation/quality_scorer.py +544 -0
- customer_retention/stages/validation/rule_generator.py +57 -0
- customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
- customer_retention/stages/validation/timeseries_detector.py +769 -0
- customer_retention/transforms/__init__.py +47 -0
- customer_retention/transforms/artifact_store.py +50 -0
- customer_retention/transforms/executor.py +157 -0
- customer_retention/transforms/fitted.py +92 -0
- customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,769 @@
"""
Time series detection and validation for exploratory data analysis.

This module provides detection of time series data patterns and
quality validation specific to temporal datasets.
"""

import warnings
from dataclasses import dataclass, field
from datetime import timedelta
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple

from customer_retention.core.compat import DataFrame, pd


class DatasetType(Enum):
    """Classification of dataset structure."""
    SNAPSHOT = "snapshot"  # Single row per entity (point-in-time)
    TIME_SERIES = "time_series"  # Multiple rows per entity over time
    EVENT_LOG = "event_log"  # Irregular events per entity
    UNKNOWN = "unknown"


class TimeSeriesFrequency(Enum):
    """Detected frequency of time series."""
    DAILY = "daily"
    WEEKLY = "weekly"
    MONTHLY = "monthly"
    QUARTERLY = "quarterly"
    YEARLY = "yearly"
    HOURLY = "hourly"
    IRREGULAR = "irregular"
    UNKNOWN = "unknown"


@dataclass
class TimeSeriesCharacteristics:
    """Characteristics of detected time series data."""
    is_time_series: bool
    dataset_type: DatasetType
    entity_column: Optional[str] = None
    timestamp_column: Optional[str] = None

    # Entity statistics
    total_entities: int = 0
    min_observations_per_entity: int = 0
    max_observations_per_entity: int = 0
    avg_observations_per_entity: float = 0.0
    median_observations_per_entity: float = 0.0

    # Temporal statistics
    time_span_days: float = 0.0
    detected_frequency: TimeSeriesFrequency = TimeSeriesFrequency.UNKNOWN
    median_interval_hours: float = 0.0

    # Quality indicators
    entities_with_single_observation: int = 0
    entities_with_gaps: int = 0
    duplicate_timestamps_count: int = 0

    confidence: float = 0.0  # 0-1 confidence in detection
    evidence: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "is_time_series": self.is_time_series,
            "dataset_type": self.dataset_type.value,
            "entity_column": self.entity_column,
            "timestamp_column": self.timestamp_column,
            "total_entities": self.total_entities,
            "avg_observations_per_entity": round(self.avg_observations_per_entity, 2),
            "time_span_days": round(self.time_span_days, 1),
            "detected_frequency": self.detected_frequency.value,
            "confidence": round(self.confidence, 2),
            "evidence": self.evidence
        }


@dataclass
class TimeSeriesValidationResult:
    """Result of time series quality validation."""
    # Temporal coverage
    total_expected_periods: int = 0
    total_actual_periods: int = 0
    coverage_percentage: float = 100.0

    # Gap analysis
    entities_with_gaps: int = 0
    total_gaps: int = 0
    max_gap_periods: int = 0
    gap_examples: List[Dict[str, Any]] = field(default_factory=list)

    # Duplicate timestamps
    entities_with_duplicate_timestamps: int = 0
    total_duplicate_timestamps: int = 0
    duplicate_examples: List[Dict[str, Any]] = field(default_factory=list)

    # Temporal ordering
    entities_with_ordering_issues: int = 0
    ordering_issue_examples: List[Dict[str, Any]] = field(default_factory=list)

    # Frequency consistency
    frequency_consistent: bool = True
    frequency_deviation_percentage: float = 0.0

    # Overall quality score for time series aspects
    temporal_quality_score: float = 100.0
    issues: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "coverage_percentage": round(self.coverage_percentage, 2),
            "entities_with_gaps": self.entities_with_gaps,
            "total_gaps": self.total_gaps,
            "entities_with_duplicate_timestamps": self.entities_with_duplicate_timestamps,
            "total_duplicate_timestamps": self.total_duplicate_timestamps,
            "frequency_consistent": self.frequency_consistent,
            "temporal_quality_score": round(self.temporal_quality_score, 1),
            "issues": self.issues
        }


class TimeSeriesDetector:
    """
    Detect time series patterns in datasets.

    Analyzes a dataset to determine if it represents:
    - Snapshot data (single observation per entity)
    - Time series data (multiple observations per entity over time)
    - Event log data (irregular events per entity)

    Example
    -------
    >>> detector = TimeSeriesDetector()
    >>> result = detector.detect(df, entity_column='customer_id')
    >>> if result.is_time_series:
    ...     print(f"Time series detected with {result.avg_observations_per_entity:.1f} obs/entity")
    """

    # Common timestamp column name patterns
    TIMESTAMP_PATTERNS = [
        'date', 'time', 'timestamp', 'datetime', 'created', 'updated',
        'event_date', 'transaction_date', 'order_date', 'period',
        'month', 'year', 'week', 'day', 'ts', 'dt'
    ]

    # Common entity/ID column name patterns
    ENTITY_PATTERNS = [
        'id', 'customer_id', 'user_id', 'account_id', 'entity_id',
        'custid', 'userid', 'client_id', 'member_id', 'subscriber_id'
    ]

    def detect(
        self,
        df: DataFrame,
        entity_column: Optional[str] = None,
        timestamp_column: Optional[str] = None,
        min_observations_threshold: int = 2
    ) -> TimeSeriesCharacteristics:
        """
        Detect if dataset contains time series data.

        Parameters
        ----------
        df : DataFrame
            Data to analyze
        entity_column : str, optional
            Column identifying entities (e.g., customer_id).
            If not provided, will attempt to auto-detect.
        timestamp_column : str, optional
            Column containing timestamps.
            If not provided, will attempt to auto-detect.
        min_observations_threshold : int
            Minimum average observations per entity to classify as time series

        Returns
        -------
        TimeSeriesCharacteristics
            Detected characteristics of the dataset
        """
        evidence = []

        # Auto-detect entity column if not provided
        if entity_column is None:
            entity_column = self._detect_entity_column(df)
            if entity_column:
                evidence.append(f"Auto-detected entity column: {entity_column}")

        # Auto-detect timestamp column if not provided
        if timestamp_column is None:
            timestamp_column = self._detect_timestamp_column(df)
            if timestamp_column:
                evidence.append(f"Auto-detected timestamp column: {timestamp_column}")

        # If we can't detect both, return as unknown
        if entity_column is None or entity_column not in df.columns:
            return TimeSeriesCharacteristics(
                is_time_series=False,
                dataset_type=DatasetType.UNKNOWN,
                confidence=0.0,
                evidence=["Could not detect entity column"]
            )

        # Calculate entity statistics
        entity_counts = df[entity_column].value_counts()
        total_entities = len(entity_counts)

        # Handle empty dataframe
        if total_entities == 0:
            return TimeSeriesCharacteristics(
                is_time_series=False,
                dataset_type=DatasetType.SNAPSHOT,
                entity_column=entity_column,
                timestamp_column=timestamp_column,
                total_entities=0,
                confidence=0.0,
                evidence=["Empty dataset - no entities found"]
            )

        min_obs = int(entity_counts.min())
        max_obs = int(entity_counts.max())
        avg_obs = float(entity_counts.mean())
        median_obs = float(entity_counts.median())
        single_obs_entities = int((entity_counts == 1).sum())

        evidence.append(f"Found {total_entities:,} unique entities")
        evidence.append(f"Observations per entity: min={min_obs}, max={max_obs}, avg={avg_obs:.1f}")

        # Determine dataset type based on observations per entity
        if avg_obs < min_observations_threshold:
            # Mostly single observations - likely snapshot data
            return TimeSeriesCharacteristics(
                is_time_series=False,
                dataset_type=DatasetType.SNAPSHOT,
                entity_column=entity_column,
                timestamp_column=timestamp_column,
                total_entities=total_entities,
                min_observations_per_entity=min_obs,
                max_observations_per_entity=max_obs,
                avg_observations_per_entity=avg_obs,
                median_observations_per_entity=median_obs,
                entities_with_single_observation=single_obs_entities,
                confidence=0.8 if avg_obs < 1.5 else 0.6,
                evidence=evidence + ["Dataset appears to be snapshot (single observation per entity)"]
            )

        # Multiple observations per entity - analyze temporal aspects
        time_span_days = 0.0
        detected_frequency = TimeSeriesFrequency.UNKNOWN
        median_interval_hours = 0.0
        duplicate_timestamps = 0

        if timestamp_column and timestamp_column in df.columns:
            # Convert to datetime if needed
            ts_series = pd.to_datetime(
                df[timestamp_column], errors='coerce', format='mixed'
            )
            valid_ts = ts_series.notna()

            if valid_ts.sum() > 0:
                time_span = ts_series.max() - ts_series.min()
                time_span_days = time_span.total_seconds() / 86400
                evidence.append(f"Time span: {time_span_days:.1f} days")

                # Detect frequency
                detected_frequency, median_interval_hours = self._detect_frequency(
                    df, entity_column, timestamp_column
                )
                evidence.append(f"Detected frequency: {detected_frequency.value}")

            # Check for duplicate timestamps per entity
            dup_check = df.groupby([entity_column, timestamp_column]).size()
            duplicate_timestamps = int((dup_check > 1).sum())
            if duplicate_timestamps > 0:
                evidence.append(f"Found {duplicate_timestamps} duplicate timestamps")

        # Determine if this is time series or event log
        if detected_frequency == TimeSeriesFrequency.IRREGULAR:
            dataset_type = DatasetType.EVENT_LOG
            evidence.append("Irregular intervals suggest event log data")
        else:
            dataset_type = DatasetType.TIME_SERIES
            evidence.append("Regular intervals suggest time series data")

        # Calculate confidence
        confidence = self._calculate_confidence(
            avg_obs, timestamp_column is not None,
            detected_frequency != TimeSeriesFrequency.UNKNOWN
        )

        return TimeSeriesCharacteristics(
            is_time_series=True,
            dataset_type=dataset_type,
            entity_column=entity_column,
            timestamp_column=timestamp_column,
            total_entities=total_entities,
            min_observations_per_entity=min_obs,
            max_observations_per_entity=max_obs,
            avg_observations_per_entity=avg_obs,
            median_observations_per_entity=median_obs,
            time_span_days=time_span_days,
            detected_frequency=detected_frequency,
            median_interval_hours=median_interval_hours,
            entities_with_single_observation=single_obs_entities,
            duplicate_timestamps_count=duplicate_timestamps,
            confidence=confidence,
            evidence=evidence
        )

    def _detect_entity_column(self, df: DataFrame) -> Optional[str]:
        """Auto-detect the entity/ID column."""
        # First, look for columns matching common patterns
        for col in df.columns:
            col_lower = col.lower()
            for pattern in self.ENTITY_PATTERNS:
                if pattern in col_lower:
                    return col

        # Look for columns that might be identifiers based on characteristics
        for col in df.columns:
            if df[col].dtype == 'object' or df[col].dtype.name.startswith('int'):
                # High cardinality but not unique (multiple rows per entity)
                distinct_ratio = df[col].nunique() / len(df)
                if 0.01 < distinct_ratio < 0.9:  # Not constant, not unique
                    # Check if values repeat
                    if df[col].value_counts().max() > 1:
                        return col

        return None

    def _detect_timestamp_column(self, df: DataFrame) -> Optional[str]:
        """Auto-detect the timestamp column."""
        candidates = []

        for col in df.columns:
            col_lower = col.lower()

            # Check if column name matches timestamp patterns
            name_match = any(pattern in col_lower for pattern in self.TIMESTAMP_PATTERNS)

            # Check if column is datetime type
            is_datetime = pd.api.types.is_datetime64_any_dtype(df[col])

            # Try to parse as datetime
            can_parse = False
            if not is_datetime and df[col].dtype == 'object':
                try:
                    with warnings.catch_warnings():
                        warnings.filterwarnings('ignore', category=FutureWarning)
                        parsed = pd.to_datetime(
                            df[col].head(100), errors='coerce', format='mixed'
                        )
                    can_parse = parsed.notna().mean() > 0.8
                except Exception:
                    pass

            if is_datetime:
                candidates.append((col, 3))  # Highest priority
            elif name_match and can_parse:
                candidates.append((col, 2))
            elif name_match:
                candidates.append((col, 1))
            elif can_parse:
                candidates.append((col, 1))

        if candidates:
            # Return highest priority candidate
            candidates.sort(key=lambda x: x[1], reverse=True)
            return candidates[0][0]

        return None

    def _detect_frequency(
        self,
        df: DataFrame,
        entity_column: str,
        timestamp_column: str
    ) -> Tuple[TimeSeriesFrequency, float]:
        """Detect the frequency of the time series."""
        # Sample entities for efficiency
        sample_entities = df[entity_column].unique()[:100]

        intervals = []
        for entity in sample_entities:
            entity_data = df[df[entity_column] == entity].copy()
            if len(entity_data) < 2:
                continue

            ts = pd.to_datetime(
                entity_data[timestamp_column], errors='coerce', format='mixed'
            )
            ts = ts.dropna().sort_values()

            if len(ts) < 2:
                continue

            diffs = ts.diff().dropna()
            intervals.extend([d.total_seconds() / 3600 for d in diffs])  # Hours

        if not intervals:
            return TimeSeriesFrequency.UNKNOWN, 0.0

        median_hours = float(pd.Series(intervals).median())

        # Classify frequency based on median interval
        if median_hours < 2:
            freq = TimeSeriesFrequency.HOURLY
        elif 20 <= median_hours <= 28:
            freq = TimeSeriesFrequency.DAILY
        elif 144 <= median_hours <= 192:  # 6-8 days
            freq = TimeSeriesFrequency.WEEKLY
        elif 672 <= median_hours <= 768:  # 28-32 days
            freq = TimeSeriesFrequency.MONTHLY
        elif 2016 <= median_hours <= 2208:  # ~84-92 days
            freq = TimeSeriesFrequency.QUARTERLY
        elif 8400 <= median_hours <= 8880:  # ~350-370 days
            freq = TimeSeriesFrequency.YEARLY
        else:
            # Check variance to determine if irregular
            std_hours = float(pd.Series(intervals).std())
            cv = std_hours / median_hours if median_hours > 0 else 1
            if cv > 0.5:  # High coefficient of variation
                freq = TimeSeriesFrequency.IRREGULAR
            else:
                freq = TimeSeriesFrequency.IRREGULAR

        return freq, median_hours

    def _calculate_confidence(
        self,
        avg_observations: float,
        has_timestamp: bool,
        has_frequency: bool
    ) -> float:
        """Calculate confidence score for time series detection."""
        confidence = 0.5  # Base confidence

        # More observations per entity = higher confidence
        if avg_observations >= 10:
            confidence += 0.3
        elif avg_observations >= 5:
            confidence += 0.2
        elif avg_observations >= 2:
            confidence += 0.1

        # Having a timestamp column increases confidence
        if has_timestamp:
            confidence += 0.1

        # Having detected frequency increases confidence
        if has_frequency:
            confidence += 0.1

        return min(1.0, confidence)


class TimeSeriesValidator:
    """
    Validate time series data quality.

    Performs quality checks specific to time series data:
    - Temporal coverage and gaps
    - Duplicate timestamps
    - Temporal ordering
    - Frequency consistency

    Example
    -------
    >>> validator = TimeSeriesValidator()
    >>> result = validator.validate(
    ...     df,
    ...     entity_column='customer_id',
    ...     timestamp_column='date',
    ...     expected_frequency='daily'
    ... )
    >>> print(f"Temporal quality: {result.temporal_quality_score:.1f}/100")
    """

    def validate(
        self,
        df: DataFrame,
        entity_column: str,
        timestamp_column: str,
        expected_frequency: Optional[str] = None,
        max_allowed_gap_periods: int = 3
    ) -> TimeSeriesValidationResult:
        """
        Validate time series data quality.

        Parameters
        ----------
        df : DataFrame
            Time series data to validate
        entity_column : str
            Column identifying entities
        timestamp_column : str
            Column containing timestamps
        expected_frequency : str, optional
            Expected frequency ('daily', 'weekly', 'monthly', etc.)
        max_allowed_gap_periods : int
            Maximum gap periods before flagging as issue

        Returns
        -------
        TimeSeriesValidationResult
            Validation results with quality metrics
        """
        issues = []

        # Validate inputs
        if entity_column not in df.columns:
            return TimeSeriesValidationResult(
                temporal_quality_score=0,
                issues=[f"Entity column '{entity_column}' not found"]
            )

        if timestamp_column not in df.columns:
            return TimeSeriesValidationResult(
                temporal_quality_score=0,
                issues=[f"Timestamp column '{timestamp_column}' not found"]
            )

        # Convert timestamp
        df_copy = df.copy()
        df_copy['_ts'] = pd.to_datetime(
            df_copy[timestamp_column], errors='coerce', format='mixed'
        )

        # Check for duplicate timestamps per entity
        dup_result = self._check_duplicate_timestamps(df_copy, entity_column)
        if dup_result['total'] > 0:
            issues.append(
                f"{dup_result['total']} duplicate timestamps across "
                f"{dup_result['entities']} entities"
            )

        # Check temporal ordering
        order_result = self._check_ordering(df_copy, entity_column)
        if order_result['entities'] > 0:
            issues.append(
                f"{order_result['entities']} entities have ordering issues"
            )

        # Analyze gaps
        gap_result = self._analyze_gaps(
            df_copy, entity_column, expected_frequency, max_allowed_gap_periods
        )
        if gap_result['entities_with_gaps'] > 0:
            issues.append(
                f"{gap_result['entities_with_gaps']} entities have significant gaps"
            )

        # Calculate temporal quality score
        total_entities = df[entity_column].nunique()

        penalties = 0

        # Duplicate timestamp penalty
        dup_rate = dup_result['entities'] / total_entities if total_entities > 0 else 0
        if dup_rate > 0.1:
            penalties += 20
        elif dup_rate > 0.01:
            penalties += 10

        # Ordering issues penalty
        order_rate = order_result['entities'] / total_entities if total_entities > 0 else 0
        if order_rate > 0.1:
            penalties += 20
        elif order_rate > 0.01:
            penalties += 10

        # Gap penalty
        gap_rate = gap_result['entities_with_gaps'] / total_entities if total_entities > 0 else 0
        if gap_rate > 0.2:
            penalties += 20
        elif gap_rate > 0.1:
            penalties += 10
        elif gap_rate > 0.05:
            penalties += 5

        temporal_quality_score = max(0, 100 - penalties)

        return TimeSeriesValidationResult(
            total_expected_periods=gap_result.get('expected_periods', 0),
            total_actual_periods=gap_result.get('actual_periods', 0),
            coverage_percentage=gap_result.get('coverage', 100.0),
            entities_with_gaps=gap_result['entities_with_gaps'],
            total_gaps=gap_result['total_gaps'],
            max_gap_periods=gap_result['max_gap'],
            gap_examples=gap_result['examples'],
            entities_with_duplicate_timestamps=dup_result['entities'],
            total_duplicate_timestamps=dup_result['total'],
            duplicate_examples=dup_result['examples'],
            entities_with_ordering_issues=order_result['entities'],
            ordering_issue_examples=order_result['examples'],
            frequency_consistent=gap_result.get('frequency_consistent', True),
            frequency_deviation_percentage=gap_result.get('frequency_deviation', 0.0),
            temporal_quality_score=temporal_quality_score,
            issues=issues
        )

    def _check_duplicate_timestamps(
        self,
        df: DataFrame,
        entity_column: str
    ) -> Dict[str, Any]:
        """Check for duplicate timestamps within each entity."""
        dup_counts = df.groupby([entity_column, '_ts']).size()
        duplicates = dup_counts[dup_counts > 1]

        examples = []
        if len(duplicates) > 0:
            for (entity, ts), count in duplicates.head(3).items():
                examples.append({
                    'entity': entity,
                    'timestamp': str(ts),
                    'count': int(count)
                })

        return {
            'total': len(duplicates),
            'entities': duplicates.index.get_level_values(0).nunique() if len(duplicates) > 0 else 0,
            'examples': examples
        }

    def _check_ordering(
        self,
        df: DataFrame,
        entity_column: str
    ) -> Dict[str, Any]:
        """Check if timestamps are properly ordered within each entity."""
        entities_with_issues = []
        examples = []

        # Sample for efficiency
        sample_entities = df[entity_column].unique()[:1000]

        for entity in sample_entities:
            entity_data = df[df[entity_column] == entity]['_ts'].dropna()
            if len(entity_data) < 2:
                continue

            # Check if sorted
            if not entity_data.is_monotonic_increasing:
                entities_with_issues.append(entity)
                if len(examples) < 3:
                    examples.append({
                        'entity': entity,
                        'issue': 'timestamps not in ascending order'
                    })

        return {
            'entities': len(entities_with_issues),
            'examples': examples
        }

    def _analyze_gaps(
        self,
        df: DataFrame,
        entity_column: str,
        expected_frequency: Optional[str],
        max_allowed_gap_periods: int
    ) -> Dict[str, Any]:
        """Analyze gaps in time series."""
        # Determine expected interval
        if expected_frequency:
            expected_interval = self._frequency_to_timedelta(expected_frequency)
        else:
            # Estimate from data
            expected_interval = self._estimate_interval(df, entity_column)

        if expected_interval is None:
            return {
                'entities_with_gaps': 0,
                'total_gaps': 0,
                'max_gap': 0,
                'examples': [],
                'coverage': 100.0,
                'frequency_consistent': True,
                'frequency_deviation': 0.0
            }

        entities_with_gaps = []
        total_gaps = 0
        max_gap = 0
        gap_examples = []

        # Sample for efficiency
        sample_entities = df[entity_column].unique()[:500]

        for entity in sample_entities:
            entity_data = df[df[entity_column] == entity]['_ts'].dropna().sort_values()
            if len(entity_data) < 2:
                continue

            diffs = entity_data.diff().dropna()

            # Find gaps larger than allowed
            threshold = expected_interval * max_allowed_gap_periods
            large_gaps = diffs[diffs > threshold]

            if len(large_gaps) > 0:
                entities_with_gaps.append(entity)
                total_gaps += len(large_gaps)

                gap_periods = int((large_gaps.max() / expected_interval))
                max_gap = max(max_gap, gap_periods)

                if len(gap_examples) < 3:
                    gap_examples.append({
                        'entity': entity,
                        'gap_size': str(large_gaps.max()),
                        'gap_periods': gap_periods
                    })

        # Calculate coverage
        coverage = 100.0
        if len(sample_entities) > 0:
            coverage = 100.0 * (1 - len(entities_with_gaps) / len(sample_entities))

        return {
            'entities_with_gaps': len(entities_with_gaps),
            'total_gaps': total_gaps,
            'max_gap': max_gap,
            'examples': gap_examples,
            'coverage': coverage,
            'frequency_consistent': len(entities_with_gaps) < len(sample_entities) * 0.1,
            'frequency_deviation': 0.0,
            'expected_periods': 0,
            'actual_periods': 0
        }

    def _frequency_to_timedelta(self, frequency: str) -> Optional[timedelta]:
        """Convert frequency string to timedelta."""
        freq_map = {
            'hourly': timedelta(hours=1),
            'daily': timedelta(days=1),
            'weekly': timedelta(weeks=1),
            'monthly': timedelta(days=30),
            'quarterly': timedelta(days=91),
            'yearly': timedelta(days=365),
        }
        return freq_map.get(frequency.lower())

    def _estimate_interval(
        self,
        df: DataFrame,
        entity_column: str
    ) -> Optional[timedelta]:
        """Estimate the typical interval from the data."""
        intervals = []

        sample_entities = df[entity_column].unique()[:100]

        for entity in sample_entities:
            entity_data = df[df[entity_column] == entity]['_ts'].dropna().sort_values()
            if len(entity_data) < 2:
                continue

            diffs = entity_data.diff().dropna()
            intervals.extend(diffs.tolist())

        if not intervals:
            return None

        return pd.Series(intervals).median()