churnkit 0.75.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +647 -0
  2. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +1165 -0
  3. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +961 -0
  4. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +1690 -0
  5. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +679 -0
  6. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +3305 -0
  7. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +1463 -0
  8. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +1430 -0
  9. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +854 -0
  10. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +1639 -0
  11. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +1890 -0
  12. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +1457 -0
  13. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +1624 -0
  14. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +780 -0
  15. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +979 -0
  16. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +572 -0
  17. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +1179 -0
  18. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +1418 -0
  19. churnkit-0.75.0a1.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +151 -0
  20. churnkit-0.75.0a1.dist-info/METADATA +229 -0
  21. churnkit-0.75.0a1.dist-info/RECORD +302 -0
  22. churnkit-0.75.0a1.dist-info/WHEEL +4 -0
  23. churnkit-0.75.0a1.dist-info/entry_points.txt +2 -0
  24. churnkit-0.75.0a1.dist-info/licenses/LICENSE +202 -0
  25. customer_retention/__init__.py +37 -0
  26. customer_retention/analysis/__init__.py +0 -0
  27. customer_retention/analysis/auto_explorer/__init__.py +62 -0
  28. customer_retention/analysis/auto_explorer/exploration_manager.py +470 -0
  29. customer_retention/analysis/auto_explorer/explorer.py +258 -0
  30. customer_retention/analysis/auto_explorer/findings.py +291 -0
  31. customer_retention/analysis/auto_explorer/layered_recommendations.py +485 -0
  32. customer_retention/analysis/auto_explorer/recommendation_builder.py +148 -0
  33. customer_retention/analysis/auto_explorer/recommendations.py +418 -0
  34. customer_retention/analysis/business/__init__.py +26 -0
  35. customer_retention/analysis/business/ab_test_designer.py +144 -0
  36. customer_retention/analysis/business/fairness_analyzer.py +166 -0
  37. customer_retention/analysis/business/intervention_matcher.py +121 -0
  38. customer_retention/analysis/business/report_generator.py +222 -0
  39. customer_retention/analysis/business/risk_profile.py +199 -0
  40. customer_retention/analysis/business/roi_analyzer.py +139 -0
  41. customer_retention/analysis/diagnostics/__init__.py +20 -0
  42. customer_retention/analysis/diagnostics/calibration_analyzer.py +133 -0
  43. customer_retention/analysis/diagnostics/cv_analyzer.py +144 -0
  44. customer_retention/analysis/diagnostics/error_analyzer.py +107 -0
  45. customer_retention/analysis/diagnostics/leakage_detector.py +394 -0
  46. customer_retention/analysis/diagnostics/noise_tester.py +140 -0
  47. customer_retention/analysis/diagnostics/overfitting_analyzer.py +190 -0
  48. customer_retention/analysis/diagnostics/segment_analyzer.py +122 -0
  49. customer_retention/analysis/discovery/__init__.py +8 -0
  50. customer_retention/analysis/discovery/config_generator.py +49 -0
  51. customer_retention/analysis/discovery/discovery_flow.py +19 -0
  52. customer_retention/analysis/discovery/type_inferencer.py +147 -0
  53. customer_retention/analysis/interpretability/__init__.py +13 -0
  54. customer_retention/analysis/interpretability/cohort_analyzer.py +185 -0
  55. customer_retention/analysis/interpretability/counterfactual.py +175 -0
  56. customer_retention/analysis/interpretability/individual_explainer.py +141 -0
  57. customer_retention/analysis/interpretability/pdp_generator.py +103 -0
  58. customer_retention/analysis/interpretability/shap_explainer.py +106 -0
  59. customer_retention/analysis/jupyter_save_hook.py +28 -0
  60. customer_retention/analysis/notebook_html_exporter.py +136 -0
  61. customer_retention/analysis/notebook_progress.py +60 -0
  62. customer_retention/analysis/plotly_preprocessor.py +154 -0
  63. customer_retention/analysis/recommendations/__init__.py +54 -0
  64. customer_retention/analysis/recommendations/base.py +158 -0
  65. customer_retention/analysis/recommendations/cleaning/__init__.py +11 -0
  66. customer_retention/analysis/recommendations/cleaning/consistency.py +107 -0
  67. customer_retention/analysis/recommendations/cleaning/deduplicate.py +94 -0
  68. customer_retention/analysis/recommendations/cleaning/impute.py +67 -0
  69. customer_retention/analysis/recommendations/cleaning/outlier.py +71 -0
  70. customer_retention/analysis/recommendations/datetime/__init__.py +3 -0
  71. customer_retention/analysis/recommendations/datetime/extract.py +149 -0
  72. customer_retention/analysis/recommendations/encoding/__init__.py +3 -0
  73. customer_retention/analysis/recommendations/encoding/categorical.py +114 -0
  74. customer_retention/analysis/recommendations/pipeline.py +74 -0
  75. customer_retention/analysis/recommendations/registry.py +76 -0
  76. customer_retention/analysis/recommendations/selection/__init__.py +3 -0
  77. customer_retention/analysis/recommendations/selection/drop_column.py +56 -0
  78. customer_retention/analysis/recommendations/transform/__init__.py +4 -0
  79. customer_retention/analysis/recommendations/transform/power.py +94 -0
  80. customer_retention/analysis/recommendations/transform/scale.py +112 -0
  81. customer_retention/analysis/visualization/__init__.py +15 -0
  82. customer_retention/analysis/visualization/chart_builder.py +2619 -0
  83. customer_retention/analysis/visualization/console.py +122 -0
  84. customer_retention/analysis/visualization/display.py +171 -0
  85. customer_retention/analysis/visualization/number_formatter.py +36 -0
  86. customer_retention/artifacts/__init__.py +3 -0
  87. customer_retention/artifacts/fit_artifact_registry.py +146 -0
  88. customer_retention/cli.py +93 -0
  89. customer_retention/core/__init__.py +0 -0
  90. customer_retention/core/compat/__init__.py +193 -0
  91. customer_retention/core/compat/detection.py +99 -0
  92. customer_retention/core/compat/ops.py +48 -0
  93. customer_retention/core/compat/pandas_backend.py +57 -0
  94. customer_retention/core/compat/spark_backend.py +75 -0
  95. customer_retention/core/components/__init__.py +11 -0
  96. customer_retention/core/components/base.py +79 -0
  97. customer_retention/core/components/components/__init__.py +13 -0
  98. customer_retention/core/components/components/deployer.py +26 -0
  99. customer_retention/core/components/components/explainer.py +26 -0
  100. customer_retention/core/components/components/feature_eng.py +33 -0
  101. customer_retention/core/components/components/ingester.py +34 -0
  102. customer_retention/core/components/components/profiler.py +34 -0
  103. customer_retention/core/components/components/trainer.py +38 -0
  104. customer_retention/core/components/components/transformer.py +36 -0
  105. customer_retention/core/components/components/validator.py +37 -0
  106. customer_retention/core/components/enums.py +33 -0
  107. customer_retention/core/components/orchestrator.py +94 -0
  108. customer_retention/core/components/registry.py +59 -0
  109. customer_retention/core/config/__init__.py +39 -0
  110. customer_retention/core/config/column_config.py +95 -0
  111. customer_retention/core/config/experiments.py +71 -0
  112. customer_retention/core/config/pipeline_config.py +117 -0
  113. customer_retention/core/config/source_config.py +83 -0
  114. customer_retention/core/utils/__init__.py +28 -0
  115. customer_retention/core/utils/leakage.py +85 -0
  116. customer_retention/core/utils/severity.py +53 -0
  117. customer_retention/core/utils/statistics.py +90 -0
  118. customer_retention/generators/__init__.py +0 -0
  119. customer_retention/generators/notebook_generator/__init__.py +167 -0
  120. customer_retention/generators/notebook_generator/base.py +55 -0
  121. customer_retention/generators/notebook_generator/cell_builder.py +49 -0
  122. customer_retention/generators/notebook_generator/config.py +47 -0
  123. customer_retention/generators/notebook_generator/databricks_generator.py +48 -0
  124. customer_retention/generators/notebook_generator/local_generator.py +48 -0
  125. customer_retention/generators/notebook_generator/project_init.py +174 -0
  126. customer_retention/generators/notebook_generator/runner.py +150 -0
  127. customer_retention/generators/notebook_generator/script_generator.py +110 -0
  128. customer_retention/generators/notebook_generator/stages/__init__.py +19 -0
  129. customer_retention/generators/notebook_generator/stages/base_stage.py +86 -0
  130. customer_retention/generators/notebook_generator/stages/s01_ingestion.py +100 -0
  131. customer_retention/generators/notebook_generator/stages/s02_profiling.py +95 -0
  132. customer_retention/generators/notebook_generator/stages/s03_cleaning.py +180 -0
  133. customer_retention/generators/notebook_generator/stages/s04_transformation.py +165 -0
  134. customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +115 -0
  135. customer_retention/generators/notebook_generator/stages/s06_feature_selection.py +97 -0
  136. customer_retention/generators/notebook_generator/stages/s07_model_training.py +176 -0
  137. customer_retention/generators/notebook_generator/stages/s08_deployment.py +81 -0
  138. customer_retention/generators/notebook_generator/stages/s09_monitoring.py +112 -0
  139. customer_retention/generators/notebook_generator/stages/s10_batch_inference.py +642 -0
  140. customer_retention/generators/notebook_generator/stages/s11_feature_store.py +348 -0
  141. customer_retention/generators/orchestration/__init__.py +23 -0
  142. customer_retention/generators/orchestration/code_generator.py +196 -0
  143. customer_retention/generators/orchestration/context.py +147 -0
  144. customer_retention/generators/orchestration/data_materializer.py +188 -0
  145. customer_retention/generators/orchestration/databricks_exporter.py +411 -0
  146. customer_retention/generators/orchestration/doc_generator.py +311 -0
  147. customer_retention/generators/pipeline_generator/__init__.py +26 -0
  148. customer_retention/generators/pipeline_generator/findings_parser.py +727 -0
  149. customer_retention/generators/pipeline_generator/generator.py +142 -0
  150. customer_retention/generators/pipeline_generator/models.py +166 -0
  151. customer_retention/generators/pipeline_generator/renderer.py +2125 -0
  152. customer_retention/generators/spec_generator/__init__.py +37 -0
  153. customer_retention/generators/spec_generator/databricks_generator.py +433 -0
  154. customer_retention/generators/spec_generator/generic_generator.py +373 -0
  155. customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +685 -0
  156. customer_retention/generators/spec_generator/pipeline_spec.py +298 -0
  157. customer_retention/integrations/__init__.py +0 -0
  158. customer_retention/integrations/adapters/__init__.py +13 -0
  159. customer_retention/integrations/adapters/base.py +10 -0
  160. customer_retention/integrations/adapters/factory.py +25 -0
  161. customer_retention/integrations/adapters/feature_store/__init__.py +6 -0
  162. customer_retention/integrations/adapters/feature_store/base.py +57 -0
  163. customer_retention/integrations/adapters/feature_store/databricks.py +94 -0
  164. customer_retention/integrations/adapters/feature_store/feast_adapter.py +97 -0
  165. customer_retention/integrations/adapters/feature_store/local.py +75 -0
  166. customer_retention/integrations/adapters/mlflow/__init__.py +6 -0
  167. customer_retention/integrations/adapters/mlflow/base.py +32 -0
  168. customer_retention/integrations/adapters/mlflow/databricks.py +54 -0
  169. customer_retention/integrations/adapters/mlflow/experiment_tracker.py +161 -0
  170. customer_retention/integrations/adapters/mlflow/local.py +50 -0
  171. customer_retention/integrations/adapters/storage/__init__.py +5 -0
  172. customer_retention/integrations/adapters/storage/base.py +33 -0
  173. customer_retention/integrations/adapters/storage/databricks.py +76 -0
  174. customer_retention/integrations/adapters/storage/local.py +59 -0
  175. customer_retention/integrations/feature_store/__init__.py +47 -0
  176. customer_retention/integrations/feature_store/definitions.py +215 -0
  177. customer_retention/integrations/feature_store/manager.py +744 -0
  178. customer_retention/integrations/feature_store/registry.py +412 -0
  179. customer_retention/integrations/iteration/__init__.py +28 -0
  180. customer_retention/integrations/iteration/context.py +212 -0
  181. customer_retention/integrations/iteration/feedback_collector.py +184 -0
  182. customer_retention/integrations/iteration/orchestrator.py +168 -0
  183. customer_retention/integrations/iteration/recommendation_tracker.py +341 -0
  184. customer_retention/integrations/iteration/signals.py +212 -0
  185. customer_retention/integrations/llm_context/__init__.py +4 -0
  186. customer_retention/integrations/llm_context/context_builder.py +201 -0
  187. customer_retention/integrations/llm_context/prompts.py +100 -0
  188. customer_retention/integrations/streaming/__init__.py +103 -0
  189. customer_retention/integrations/streaming/batch_integration.py +149 -0
  190. customer_retention/integrations/streaming/early_warning_model.py +227 -0
  191. customer_retention/integrations/streaming/event_schema.py +214 -0
  192. customer_retention/integrations/streaming/online_store_writer.py +249 -0
  193. customer_retention/integrations/streaming/realtime_scorer.py +261 -0
  194. customer_retention/integrations/streaming/trigger_engine.py +293 -0
  195. customer_retention/integrations/streaming/window_aggregator.py +393 -0
  196. customer_retention/stages/__init__.py +0 -0
  197. customer_retention/stages/cleaning/__init__.py +9 -0
  198. customer_retention/stages/cleaning/base.py +28 -0
  199. customer_retention/stages/cleaning/missing_handler.py +160 -0
  200. customer_retention/stages/cleaning/outlier_handler.py +204 -0
  201. customer_retention/stages/deployment/__init__.py +28 -0
  202. customer_retention/stages/deployment/batch_scorer.py +106 -0
  203. customer_retention/stages/deployment/champion_challenger.py +299 -0
  204. customer_retention/stages/deployment/model_registry.py +182 -0
  205. customer_retention/stages/deployment/retraining_trigger.py +245 -0
  206. customer_retention/stages/features/__init__.py +73 -0
  207. customer_retention/stages/features/behavioral_features.py +266 -0
  208. customer_retention/stages/features/customer_segmentation.py +505 -0
  209. customer_retention/stages/features/feature_definitions.py +265 -0
  210. customer_retention/stages/features/feature_engineer.py +551 -0
  211. customer_retention/stages/features/feature_manifest.py +340 -0
  212. customer_retention/stages/features/feature_selector.py +239 -0
  213. customer_retention/stages/features/interaction_features.py +160 -0
  214. customer_retention/stages/features/temporal_features.py +243 -0
  215. customer_retention/stages/ingestion/__init__.py +9 -0
  216. customer_retention/stages/ingestion/load_result.py +32 -0
  217. customer_retention/stages/ingestion/loaders.py +195 -0
  218. customer_retention/stages/ingestion/source_registry.py +130 -0
  219. customer_retention/stages/modeling/__init__.py +31 -0
  220. customer_retention/stages/modeling/baseline_trainer.py +139 -0
  221. customer_retention/stages/modeling/cross_validator.py +125 -0
  222. customer_retention/stages/modeling/data_splitter.py +205 -0
  223. customer_retention/stages/modeling/feature_scaler.py +99 -0
  224. customer_retention/stages/modeling/hyperparameter_tuner.py +107 -0
  225. customer_retention/stages/modeling/imbalance_handler.py +282 -0
  226. customer_retention/stages/modeling/mlflow_logger.py +95 -0
  227. customer_retention/stages/modeling/model_comparator.py +149 -0
  228. customer_retention/stages/modeling/model_evaluator.py +138 -0
  229. customer_retention/stages/modeling/threshold_optimizer.py +131 -0
  230. customer_retention/stages/monitoring/__init__.py +37 -0
  231. customer_retention/stages/monitoring/alert_manager.py +328 -0
  232. customer_retention/stages/monitoring/drift_detector.py +201 -0
  233. customer_retention/stages/monitoring/performance_monitor.py +242 -0
  234. customer_retention/stages/preprocessing/__init__.py +5 -0
  235. customer_retention/stages/preprocessing/transformer_manager.py +284 -0
  236. customer_retention/stages/profiling/__init__.py +256 -0
  237. customer_retention/stages/profiling/categorical_distribution.py +269 -0
  238. customer_retention/stages/profiling/categorical_target_analyzer.py +274 -0
  239. customer_retention/stages/profiling/column_profiler.py +527 -0
  240. customer_retention/stages/profiling/distribution_analysis.py +483 -0
  241. customer_retention/stages/profiling/drift_detector.py +310 -0
  242. customer_retention/stages/profiling/feature_capacity.py +507 -0
  243. customer_retention/stages/profiling/pattern_analysis_config.py +513 -0
  244. customer_retention/stages/profiling/profile_result.py +212 -0
  245. customer_retention/stages/profiling/quality_checks.py +1632 -0
  246. customer_retention/stages/profiling/relationship_detector.py +256 -0
  247. customer_retention/stages/profiling/relationship_recommender.py +454 -0
  248. customer_retention/stages/profiling/report_generator.py +520 -0
  249. customer_retention/stages/profiling/scd_analyzer.py +151 -0
  250. customer_retention/stages/profiling/segment_analyzer.py +632 -0
  251. customer_retention/stages/profiling/segment_aware_outlier.py +265 -0
  252. customer_retention/stages/profiling/target_level_analyzer.py +217 -0
  253. customer_retention/stages/profiling/temporal_analyzer.py +388 -0
  254. customer_retention/stages/profiling/temporal_coverage.py +488 -0
  255. customer_retention/stages/profiling/temporal_feature_analyzer.py +692 -0
  256. customer_retention/stages/profiling/temporal_feature_engineer.py +703 -0
  257. customer_retention/stages/profiling/temporal_pattern_analyzer.py +636 -0
  258. customer_retention/stages/profiling/temporal_quality_checks.py +278 -0
  259. customer_retention/stages/profiling/temporal_target_analyzer.py +241 -0
  260. customer_retention/stages/profiling/text_embedder.py +87 -0
  261. customer_retention/stages/profiling/text_processor.py +115 -0
  262. customer_retention/stages/profiling/text_reducer.py +60 -0
  263. customer_retention/stages/profiling/time_series_profiler.py +303 -0
  264. customer_retention/stages/profiling/time_window_aggregator.py +376 -0
  265. customer_retention/stages/profiling/type_detector.py +382 -0
  266. customer_retention/stages/profiling/window_recommendation.py +288 -0
  267. customer_retention/stages/temporal/__init__.py +166 -0
  268. customer_retention/stages/temporal/access_guard.py +180 -0
  269. customer_retention/stages/temporal/cutoff_analyzer.py +235 -0
  270. customer_retention/stages/temporal/data_preparer.py +178 -0
  271. customer_retention/stages/temporal/point_in_time_join.py +134 -0
  272. customer_retention/stages/temporal/point_in_time_registry.py +148 -0
  273. customer_retention/stages/temporal/scenario_detector.py +163 -0
  274. customer_retention/stages/temporal/snapshot_manager.py +259 -0
  275. customer_retention/stages/temporal/synthetic_coordinator.py +66 -0
  276. customer_retention/stages/temporal/timestamp_discovery.py +531 -0
  277. customer_retention/stages/temporal/timestamp_manager.py +255 -0
  278. customer_retention/stages/transformation/__init__.py +13 -0
  279. customer_retention/stages/transformation/binary_handler.py +85 -0
  280. customer_retention/stages/transformation/categorical_encoder.py +245 -0
  281. customer_retention/stages/transformation/datetime_transformer.py +97 -0
  282. customer_retention/stages/transformation/numeric_transformer.py +181 -0
  283. customer_retention/stages/transformation/pipeline.py +257 -0
  284. customer_retention/stages/validation/__init__.py +60 -0
  285. customer_retention/stages/validation/adversarial_scoring_validator.py +205 -0
  286. customer_retention/stages/validation/business_sense_gate.py +173 -0
  287. customer_retention/stages/validation/data_quality_gate.py +235 -0
  288. customer_retention/stages/validation/data_validators.py +511 -0
  289. customer_retention/stages/validation/feature_quality_gate.py +183 -0
  290. customer_retention/stages/validation/gates.py +117 -0
  291. customer_retention/stages/validation/leakage_gate.py +352 -0
  292. customer_retention/stages/validation/model_validity_gate.py +213 -0
  293. customer_retention/stages/validation/pipeline_validation_runner.py +264 -0
  294. customer_retention/stages/validation/quality_scorer.py +544 -0
  295. customer_retention/stages/validation/rule_generator.py +57 -0
  296. customer_retention/stages/validation/scoring_pipeline_validator.py +446 -0
  297. customer_retention/stages/validation/timeseries_detector.py +769 -0
  298. customer_retention/transforms/__init__.py +47 -0
  299. customer_retention/transforms/artifact_store.py +50 -0
  300. customer_retention/transforms/executor.py +157 -0
  301. customer_retention/transforms/fitted.py +92 -0
  302. customer_retention/transforms/ops.py +148 -0
@@ -0,0 +1,769 @@
1
+ """
2
+ Time series detection and validation for exploratory data analysis.
3
+
4
+ This module provides detection of time series data patterns and
5
+ quality validation specific to temporal datasets.
6
+ """
7
+
8
+ import warnings
9
+ from dataclasses import dataclass, field
10
+ from datetime import timedelta
11
+ from enum import Enum
12
+ from typing import Any, Dict, List, Optional, Tuple
13
+
14
+ from customer_retention.core.compat import DataFrame, pd
15
+
16
+
17
class DatasetType(Enum):
    """Classification of a dataset's structural shape with respect to time."""
    SNAPSHOT = "snapshot"          # Single row per entity (point-in-time view)
    TIME_SERIES = "time_series"    # Multiple rows per entity over time
    EVENT_LOG = "event_log"        # Irregular events per entity
    UNKNOWN = "unknown"            # Structure could not be determined
23
+
24
+
25
class TimeSeriesFrequency(Enum):
    """Detected sampling frequency of a time series.

    IRREGULAR means observations exist but do not follow a consistent
    interval; UNKNOWN means the frequency could not be determined.
    """
    DAILY = "daily"
    WEEKLY = "weekly"
    MONTHLY = "monthly"
    QUARTERLY = "quarterly"
    YEARLY = "yearly"
    HOURLY = "hourly"
    IRREGULAR = "irregular"
    UNKNOWN = "unknown"
35
+
36
+
37
@dataclass
class TimeSeriesCharacteristics:
    """Summary of the temporal structure detected for a dataset.

    Produced by TimeSeriesDetector.detect. Captures the classification
    (snapshot / time series / event log), per-entity observation
    statistics, temporal span and cadence, basic quality counters, and
    the textual evidence trail behind the decision.
    """
    is_time_series: bool
    dataset_type: DatasetType
    entity_column: Optional[str] = None
    timestamp_column: Optional[str] = None

    # Per-entity observation counts
    total_entities: int = 0
    min_observations_per_entity: int = 0
    max_observations_per_entity: int = 0
    avg_observations_per_entity: float = 0.0
    median_observations_per_entity: float = 0.0

    # Temporal span and cadence
    time_span_days: float = 0.0
    detected_frequency: TimeSeriesFrequency = TimeSeriesFrequency.UNKNOWN
    median_interval_hours: float = 0.0

    # Quality counters
    entities_with_single_observation: int = 0
    entities_with_gaps: int = 0
    duplicate_timestamps_count: int = 0

    confidence: float = 0.0  # detection confidence in [0, 1]
    evidence: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the headline fields to plain built-in types."""
        # All keys are valid identifiers, so the dict() constructor keeps
        # this flat; insertion order matches the original literal form.
        return dict(
            is_time_series=self.is_time_series,
            dataset_type=self.dataset_type.value,
            entity_column=self.entity_column,
            timestamp_column=self.timestamp_column,
            total_entities=self.total_entities,
            avg_observations_per_entity=round(self.avg_observations_per_entity, 2),
            time_span_days=round(self.time_span_days, 1),
            detected_frequency=self.detected_frequency.value,
            confidence=round(self.confidence, 2),
            evidence=self.evidence,
        )
79
+
80
+
81
@dataclass
class TimeSeriesValidationResult:
    """Outcome of quality validation applied to time series data.

    Aggregates coverage, gap, duplicate-timestamp, ordering, and
    frequency-consistency findings plus an overall temporal quality
    score and a human-readable issue list.
    """
    # Temporal coverage
    total_expected_periods: int = 0
    total_actual_periods: int = 0
    coverage_percentage: float = 100.0

    # Gap analysis
    entities_with_gaps: int = 0
    total_gaps: int = 0
    max_gap_periods: int = 0
    gap_examples: List[Dict[str, Any]] = field(default_factory=list)

    # Duplicate timestamps
    entities_with_duplicate_timestamps: int = 0
    total_duplicate_timestamps: int = 0
    duplicate_examples: List[Dict[str, Any]] = field(default_factory=list)

    # Temporal ordering
    entities_with_ordering_issues: int = 0
    ordering_issue_examples: List[Dict[str, Any]] = field(default_factory=list)

    # Frequency consistency
    frequency_consistent: bool = True
    frequency_deviation_percentage: float = 0.0

    # Overall quality score for time series aspects (0-100)
    temporal_quality_score: float = 100.0
    issues: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the headline fields to plain built-in types."""
        summary: Dict[str, Any] = {}
        summary["coverage_percentage"] = round(self.coverage_percentage, 2)
        summary["entities_with_gaps"] = self.entities_with_gaps
        summary["total_gaps"] = self.total_gaps
        summary["entities_with_duplicate_timestamps"] = self.entities_with_duplicate_timestamps
        summary["total_duplicate_timestamps"] = self.total_duplicate_timestamps
        summary["frequency_consistent"] = self.frequency_consistent
        summary["temporal_quality_score"] = round(self.temporal_quality_score, 1)
        summary["issues"] = self.issues
        return summary
124
+
125
+
126
+ class TimeSeriesDetector:
127
+ """
128
+ Detect time series patterns in datasets.
129
+
130
+ Analyzes a dataset to determine if it represents:
131
+ - Snapshot data (single observation per entity)
132
+ - Time series data (multiple observations per entity over time)
133
+ - Event log data (irregular events per entity)
134
+
135
+ Example
136
+ -------
137
+ >>> detector = TimeSeriesDetector()
138
+ >>> result = detector.detect(df, entity_column='customer_id')
139
+ >>> if result.is_time_series:
140
+ ... print(f"Time series detected with {result.avg_observations_per_entity:.1f} obs/entity")
141
+ """
142
+
143
    # Common timestamp column name patterns.
    # Presumably consulted by _detect_timestamp_column (defined later in this
    # file) when no timestamp column is passed to detect() — confirm matching
    # semantics (substring vs. exact) against that helper.
    TIMESTAMP_PATTERNS = [
        'date', 'time', 'timestamp', 'datetime', 'created', 'updated',
        'event_date', 'transaction_date', 'order_date', 'period',
        'month', 'year', 'week', 'day', 'ts', 'dt'
    ]

    # Common entity/ID column name patterns.
    # Presumably consulted by _detect_entity_column when no entity column is
    # passed to detect() — confirm matching semantics against that helper.
    ENTITY_PATTERNS = [
        'id', 'customer_id', 'user_id', 'account_id', 'entity_id',
        'custid', 'userid', 'client_id', 'member_id', 'subscriber_id'
    ]
155
+
156
    def detect(
        self,
        df: DataFrame,
        entity_column: Optional[str] = None,
        timestamp_column: Optional[str] = None,
        min_observations_threshold: int = 2
    ) -> TimeSeriesCharacteristics:
        """
        Detect if dataset contains time series data.

        Parameters
        ----------
        df : DataFrame
            Data to analyze.
        entity_column : str, optional
            Column identifying entities (e.g., customer_id).
            If not provided, will attempt to auto-detect.
        timestamp_column : str, optional
            Column containing timestamps.
            If not provided, will attempt to auto-detect.
        min_observations_threshold : int
            Minimum average observations per entity to classify as time
            series; below this the dataset is reported as SNAPSHOT.

        Returns
        -------
        TimeSeriesCharacteristics
            Detected characteristics of the dataset, including dataset
            type, per-entity statistics, detected frequency, a 0-1
            confidence value, and the textual evidence behind the call.
        """
        evidence = []

        # Auto-detect entity column if not provided
        if entity_column is None:
            entity_column = self._detect_entity_column(df)
            if entity_column:
                evidence.append(f"Auto-detected entity column: {entity_column}")

        # Auto-detect timestamp column if not provided
        if timestamp_column is None:
            timestamp_column = self._detect_timestamp_column(df)
            if timestamp_column:
                evidence.append(f"Auto-detected timestamp column: {timestamp_column}")

        # Without a usable entity column, no per-entity analysis is possible.
        if entity_column is None or entity_column not in df.columns:
            return TimeSeriesCharacteristics(
                is_time_series=False,
                dataset_type=DatasetType.UNKNOWN,
                confidence=0.0,
                evidence=["Could not detect entity column"]
            )

        # Observations-per-entity distribution drives the snapshot vs.
        # time-series classification below.
        entity_counts = df[entity_column].value_counts()
        total_entities = len(entity_counts)

        # Handle empty dataframe
        if total_entities == 0:
            return TimeSeriesCharacteristics(
                is_time_series=False,
                dataset_type=DatasetType.SNAPSHOT,
                entity_column=entity_column,
                timestamp_column=timestamp_column,
                total_entities=0,
                confidence=0.0,
                evidence=["Empty dataset - no entities found"]
            )

        min_obs = int(entity_counts.min())
        max_obs = int(entity_counts.max())
        avg_obs = float(entity_counts.mean())
        median_obs = float(entity_counts.median())
        single_obs_entities = int((entity_counts == 1).sum())

        evidence.append(f"Found {total_entities:,} unique entities")
        evidence.append(f"Observations per entity: min={min_obs}, max={max_obs}, avg={avg_obs:.1f}")

        # Determine dataset type based on observations per entity
        if avg_obs < min_observations_threshold:
            # Mostly single observations - likely snapshot data
            return TimeSeriesCharacteristics(
                is_time_series=False,
                dataset_type=DatasetType.SNAPSHOT,
                entity_column=entity_column,
                timestamp_column=timestamp_column,
                total_entities=total_entities,
                min_observations_per_entity=min_obs,
                max_observations_per_entity=max_obs,
                avg_observations_per_entity=avg_obs,
                median_observations_per_entity=median_obs,
                entities_with_single_observation=single_obs_entities,
                # Closer to one observation per entity -> stronger snapshot signal.
                confidence=0.8 if avg_obs < 1.5 else 0.6,
                evidence=evidence + ["Dataset appears to be snapshot (single observation per entity)"]
            )

        # Multiple observations per entity - analyze temporal aspects
        time_span_days = 0.0
        detected_frequency = TimeSeriesFrequency.UNKNOWN
        median_interval_hours = 0.0
        duplicate_timestamps = 0

        if timestamp_column and timestamp_column in df.columns:
            # Convert to datetime if needed; unparseable values become NaT.
            # NOTE(review): format='mixed' requires pandas >= 2.0 — confirm
            # the project's pandas floor supports it.
            ts_series = pd.to_datetime(
                df[timestamp_column], errors='coerce', format='mixed'
            )
            valid_ts = ts_series.notna()

            if valid_ts.sum() > 0:
                time_span = ts_series.max() - ts_series.min()
                time_span_days = time_span.total_seconds() / 86400  # seconds per day
                evidence.append(f"Time span: {time_span_days:.1f} days")

                # Detect frequency (delegated to a sibling helper; returns the
                # frequency enum plus the median inter-observation interval).
                detected_frequency, median_interval_hours = self._detect_frequency(
                    df, entity_column, timestamp_column
                )
                evidence.append(f"Detected frequency: {detected_frequency.value}")

                # Check for duplicate timestamps per entity. Groups on the raw
                # column values, not the parsed datetimes, so differently
                # formatted strings of the same instant are not merged.
                dup_check = df.groupby([entity_column, timestamp_column]).size()
                duplicate_timestamps = int((dup_check > 1).sum())
                if duplicate_timestamps > 0:
                    evidence.append(f"Found {duplicate_timestamps} duplicate timestamps")

        # Determine if this is time series or event log. UNKNOWN frequency
        # (e.g. no usable timestamp column) also falls into the TIME_SERIES
        # branch and is reported as "regular intervals".
        if detected_frequency == TimeSeriesFrequency.IRREGULAR:
            dataset_type = DatasetType.EVENT_LOG
            evidence.append("Irregular intervals suggest event log data")
        else:
            dataset_type = DatasetType.TIME_SERIES
            evidence.append("Regular intervals suggest time series data")

        # Calculate confidence. Note the second argument is true whenever a
        # timestamp column name is set, even if that name is absent from df.
        confidence = self._calculate_confidence(
            avg_obs, timestamp_column is not None,
            detected_frequency != TimeSeriesFrequency.UNKNOWN
        )

        return TimeSeriesCharacteristics(
            is_time_series=True,
            dataset_type=dataset_type,
            entity_column=entity_column,
            timestamp_column=timestamp_column,
            total_entities=total_entities,
            min_observations_per_entity=min_obs,
            max_observations_per_entity=max_obs,
            avg_observations_per_entity=avg_obs,
            median_observations_per_entity=median_obs,
            time_span_days=time_span_days,
            detected_frequency=detected_frequency,
            median_interval_hours=median_interval_hours,
            entities_with_single_observation=single_obs_entities,
            duplicate_timestamps_count=duplicate_timestamps,
            confidence=confidence,
            evidence=evidence
        )
312
+
313
+ def _detect_entity_column(self, df: DataFrame) -> Optional[str]:
314
+ """Auto-detect the entity/ID column."""
315
+ # First, look for columns matching common patterns
316
+ for col in df.columns:
317
+ col_lower = col.lower()
318
+ for pattern in self.ENTITY_PATTERNS:
319
+ if pattern in col_lower:
320
+ return col
321
+
322
+ # Look for columns that might be identifiers based on characteristics
323
+ for col in df.columns:
324
+ if df[col].dtype == 'object' or df[col].dtype.name.startswith('int'):
325
+ # High cardinality but not unique (multiple rows per entity)
326
+ distinct_ratio = df[col].nunique() / len(df)
327
+ if 0.01 < distinct_ratio < 0.9: # Not constant, not unique
328
+ # Check if values repeat
329
+ if df[col].value_counts().max() > 1:
330
+ return col
331
+
332
+ return None
333
+
334
+ def _detect_timestamp_column(self, df: DataFrame) -> Optional[str]:
335
+ """Auto-detect the timestamp column."""
336
+ candidates = []
337
+
338
+ for col in df.columns:
339
+ col_lower = col.lower()
340
+
341
+ # Check if column name matches timestamp patterns
342
+ name_match = any(pattern in col_lower for pattern in self.TIMESTAMP_PATTERNS)
343
+
344
+ # Check if column is datetime type
345
+ is_datetime = pd.api.types.is_datetime64_any_dtype(df[col])
346
+
347
+ # Try to parse as datetime
348
+ can_parse = False
349
+ if not is_datetime and df[col].dtype == 'object':
350
+ try:
351
+ with warnings.catch_warnings():
352
+ warnings.filterwarnings('ignore', category=FutureWarning)
353
+ parsed = pd.to_datetime(
354
+ df[col].head(100), errors='coerce', format='mixed'
355
+ )
356
+ can_parse = parsed.notna().mean() > 0.8
357
+ except Exception:
358
+ pass
359
+
360
+ if is_datetime:
361
+ candidates.append((col, 3)) # Highest priority
362
+ elif name_match and can_parse:
363
+ candidates.append((col, 2))
364
+ elif name_match:
365
+ candidates.append((col, 1))
366
+ elif can_parse:
367
+ candidates.append((col, 1))
368
+
369
+ if candidates:
370
+ # Return highest priority candidate
371
+ candidates.sort(key=lambda x: x[1], reverse=True)
372
+ return candidates[0][0]
373
+
374
+ return None
375
+
376
def _detect_frequency(
    self,
    df: DataFrame,
    entity_column: str,
    timestamp_column: str
) -> Tuple[TimeSeriesFrequency, float]:
    """Detect the frequency of the time series.

    Samples up to 100 entities, computes consecutive-timestamp gaps in
    hours, and classifies the median gap into a frequency bucket.

    Parameters
    ----------
    df : DataFrame
        Data containing entity and timestamp columns
    entity_column : str
        Column identifying entities
    timestamp_column : str
        Column containing timestamps

    Returns
    -------
    Tuple[TimeSeriesFrequency, float]
        Detected frequency and the median interval in hours
        (UNKNOWN / 0.0 when no intervals could be computed)
    """
    # Sample entities for efficiency on large frames.
    sample_entities = df[entity_column].unique()[:100]

    intervals = []
    for entity in sample_entities:
        # No .copy() needed: the slice is only read, never mutated.
        entity_data = df[df[entity_column] == entity]
        if len(entity_data) < 2:
            continue

        ts = pd.to_datetime(
            entity_data[timestamp_column], errors='coerce', format='mixed'
        )
        ts = ts.dropna().sort_values()
        if len(ts) < 2:
            continue

        diffs = ts.diff().dropna()
        intervals.extend(d.total_seconds() / 3600 for d in diffs)  # hours

    if not intervals:
        return TimeSeriesFrequency.UNKNOWN, 0.0

    # Hoisted: a single Series serves the median (and any future stats).
    interval_series = pd.Series(intervals)
    median_hours = float(interval_series.median())

    # Classify by the median gap. Tolerance bands absorb DST shifts and
    # month-length variation.
    if median_hours < 2:
        freq = TimeSeriesFrequency.HOURLY
    elif 20 <= median_hours <= 28:
        freq = TimeSeriesFrequency.DAILY
    elif 144 <= median_hours <= 192:  # 6-8 days
        freq = TimeSeriesFrequency.WEEKLY
    elif 672 <= median_hours <= 768:  # 28-32 days
        freq = TimeSeriesFrequency.MONTHLY
    elif 2016 <= median_hours <= 2208:  # ~84-92 days
        freq = TimeSeriesFrequency.QUARTERLY
    elif 8400 <= median_hours <= 8880:  # ~350-370 days
        freq = TimeSeriesFrequency.YEARLY
    else:
        # FIX: the original computed a coefficient of variation here and
        # branched on it, but BOTH branches assigned IRREGULAR — a dead
        # conditional (and a wasted std() computation). Anything outside
        # the known bands is simply irregular.
        freq = TimeSeriesFrequency.IRREGULAR

    return freq, median_hours
431
+
432
+ def _calculate_confidence(
433
+ self,
434
+ avg_observations: float,
435
+ has_timestamp: bool,
436
+ has_frequency: bool
437
+ ) -> float:
438
+ """Calculate confidence score for time series detection."""
439
+ confidence = 0.5 # Base confidence
440
+
441
+ # More observations per entity = higher confidence
442
+ if avg_observations >= 10:
443
+ confidence += 0.3
444
+ elif avg_observations >= 5:
445
+ confidence += 0.2
446
+ elif avg_observations >= 2:
447
+ confidence += 0.1
448
+
449
+ # Having a timestamp column increases confidence
450
+ if has_timestamp:
451
+ confidence += 0.1
452
+
453
+ # Having detected frequency increases confidence
454
+ if has_frequency:
455
+ confidence += 0.1
456
+
457
+ return min(1.0, confidence)
458
+
459
+
460
+ class TimeSeriesValidator:
461
+ """
462
+ Validate time series data quality.
463
+
464
+ Performs quality checks specific to time series data:
465
+ - Temporal coverage and gaps
466
+ - Duplicate timestamps
467
+ - Temporal ordering
468
+ - Frequency consistency
469
+
470
+ Example
471
+ -------
472
+ >>> validator = TimeSeriesValidator()
473
+ >>> result = validator.validate(
474
+ ... df,
475
+ ... entity_column='customer_id',
476
+ ... timestamp_column='date',
477
+ ... expected_frequency='daily'
478
+ ... )
479
+ >>> print(f"Temporal quality: {result.temporal_quality_score:.1f}/100")
480
+ """
481
+
482
def validate(
    self,
    df: DataFrame,
    entity_column: str,
    timestamp_column: str,
    expected_frequency: Optional[str] = None,
    max_allowed_gap_periods: int = 3
) -> TimeSeriesValidationResult:
    """
    Validate time series data quality.

    Runs duplicate-timestamp, ordering, and gap checks, then converts
    the per-check failure rates into a 0-100 temporal quality score.

    Parameters
    ----------
    df : DataFrame
        Time series data to validate
    entity_column : str
        Column identifying entities
    timestamp_column : str
        Column containing timestamps
    expected_frequency : str, optional
        Expected frequency ('daily', 'weekly', 'monthly', etc.)
    max_allowed_gap_periods : int
        Maximum gap periods before flagging as issue

    Returns
    -------
    TimeSeriesValidationResult
        Validation results with quality metrics
    """
    # Fail fast when either named column is missing.
    if entity_column not in df.columns:
        return TimeSeriesValidationResult(
            temporal_quality_score=0,
            issues=[f"Entity column '{entity_column}' not found"]
        )
    if timestamp_column not in df.columns:
        return TimeSeriesValidationResult(
            temporal_quality_score=0,
            issues=[f"Timestamp column '{timestamp_column}' not found"]
        )

    # Work on a copy carrying a normalized datetime column.
    working = df.copy()
    working['_ts'] = pd.to_datetime(
        working[timestamp_column], errors='coerce', format='mixed'
    )

    issues = []

    dup_result = self._check_duplicate_timestamps(working, entity_column)
    if dup_result['total'] > 0:
        issues.append(
            f"{dup_result['total']} duplicate timestamps across "
            f"{dup_result['entities']} entities"
        )

    order_result = self._check_ordering(working, entity_column)
    if order_result['entities'] > 0:
        issues.append(
            f"{order_result['entities']} entities have ordering issues"
        )

    gap_result = self._analyze_gaps(
        working, entity_column, expected_frequency, max_allowed_gap_periods
    )
    if gap_result['entities_with_gaps'] > 0:
        issues.append(
            f"{gap_result['entities_with_gaps']} entities have significant gaps"
        )

    total_entities = df[entity_column].nunique()

    def affected_rate(count):
        # Share of entities affected; 0 when the frame has no entities.
        return count / total_entities if total_entities > 0 else 0

    # Tiered penalties: each failure mode can cost up to 20 points.
    penalties = 0

    dup_rate = affected_rate(dup_result['entities'])
    if dup_rate > 0.1:
        penalties += 20
    elif dup_rate > 0.01:
        penalties += 10

    order_rate = affected_rate(order_result['entities'])
    if order_rate > 0.1:
        penalties += 20
    elif order_rate > 0.01:
        penalties += 10

    gap_rate = affected_rate(gap_result['entities_with_gaps'])
    if gap_rate > 0.2:
        penalties += 20
    elif gap_rate > 0.1:
        penalties += 10
    elif gap_rate > 0.05:
        penalties += 5

    return TimeSeriesValidationResult(
        total_expected_periods=gap_result.get('expected_periods', 0),
        total_actual_periods=gap_result.get('actual_periods', 0),
        coverage_percentage=gap_result.get('coverage', 100.0),
        entities_with_gaps=gap_result['entities_with_gaps'],
        total_gaps=gap_result['total_gaps'],
        max_gap_periods=gap_result['max_gap'],
        gap_examples=gap_result['examples'],
        entities_with_duplicate_timestamps=dup_result['entities'],
        total_duplicate_timestamps=dup_result['total'],
        duplicate_examples=dup_result['examples'],
        entities_with_ordering_issues=order_result['entities'],
        ordering_issue_examples=order_result['examples'],
        frequency_consistent=gap_result.get('frequency_consistent', True),
        frequency_deviation_percentage=gap_result.get('frequency_deviation', 0.0),
        temporal_quality_score=max(0, 100 - penalties),
        issues=issues
    )
604
+
605
+ def _check_duplicate_timestamps(
606
+ self,
607
+ df: DataFrame,
608
+ entity_column: str
609
+ ) -> Dict[str, Any]:
610
+ """Check for duplicate timestamps within each entity."""
611
+ dup_counts = df.groupby([entity_column, '_ts']).size()
612
+ duplicates = dup_counts[dup_counts > 1]
613
+
614
+ examples = []
615
+ if len(duplicates) > 0:
616
+ for (entity, ts), count in duplicates.head(3).items():
617
+ examples.append({
618
+ 'entity': entity,
619
+ 'timestamp': str(ts),
620
+ 'count': int(count)
621
+ })
622
+
623
+ return {
624
+ 'total': len(duplicates),
625
+ 'entities': duplicates.index.get_level_values(0).nunique() if len(duplicates) > 0 else 0,
626
+ 'examples': examples
627
+ }
628
+
629
+ def _check_ordering(
630
+ self,
631
+ df: DataFrame,
632
+ entity_column: str
633
+ ) -> Dict[str, Any]:
634
+ """Check if timestamps are properly ordered within each entity."""
635
+ entities_with_issues = []
636
+ examples = []
637
+
638
+ # Sample for efficiency
639
+ sample_entities = df[entity_column].unique()[:1000]
640
+
641
+ for entity in sample_entities:
642
+ entity_data = df[df[entity_column] == entity]['_ts'].dropna()
643
+ if len(entity_data) < 2:
644
+ continue
645
+
646
+ # Check if sorted
647
+ if not entity_data.is_monotonic_increasing:
648
+ entities_with_issues.append(entity)
649
+ if len(examples) < 3:
650
+ examples.append({
651
+ 'entity': entity,
652
+ 'issue': 'timestamps not in ascending order'
653
+ })
654
+
655
+ return {
656
+ 'entities': len(entities_with_issues),
657
+ 'examples': examples
658
+ }
659
+
660
+ def _analyze_gaps(
661
+ self,
662
+ df: DataFrame,
663
+ entity_column: str,
664
+ expected_frequency: Optional[str],
665
+ max_allowed_gap_periods: int
666
+ ) -> Dict[str, Any]:
667
+ """Analyze gaps in time series."""
668
+ # Determine expected interval
669
+ if expected_frequency:
670
+ expected_interval = self._frequency_to_timedelta(expected_frequency)
671
+ else:
672
+ # Estimate from data
673
+ expected_interval = self._estimate_interval(df, entity_column)
674
+
675
+ if expected_interval is None:
676
+ return {
677
+ 'entities_with_gaps': 0,
678
+ 'total_gaps': 0,
679
+ 'max_gap': 0,
680
+ 'examples': [],
681
+ 'coverage': 100.0,
682
+ 'frequency_consistent': True,
683
+ 'frequency_deviation': 0.0
684
+ }
685
+
686
+ entities_with_gaps = []
687
+ total_gaps = 0
688
+ max_gap = 0
689
+ gap_examples = []
690
+
691
+ # Sample for efficiency
692
+ sample_entities = df[entity_column].unique()[:500]
693
+
694
+ for entity in sample_entities:
695
+ entity_data = df[df[entity_column] == entity]['_ts'].dropna().sort_values()
696
+ if len(entity_data) < 2:
697
+ continue
698
+
699
+ diffs = entity_data.diff().dropna()
700
+
701
+ # Find gaps larger than allowed
702
+ threshold = expected_interval * max_allowed_gap_periods
703
+ large_gaps = diffs[diffs > threshold]
704
+
705
+ if len(large_gaps) > 0:
706
+ entities_with_gaps.append(entity)
707
+ total_gaps += len(large_gaps)
708
+
709
+ gap_periods = int((large_gaps.max() / expected_interval))
710
+ max_gap = max(max_gap, gap_periods)
711
+
712
+ if len(gap_examples) < 3:
713
+ gap_examples.append({
714
+ 'entity': entity,
715
+ 'gap_size': str(large_gaps.max()),
716
+ 'gap_periods': gap_periods
717
+ })
718
+
719
+ # Calculate coverage
720
+ coverage = 100.0
721
+ if len(sample_entities) > 0:
722
+ coverage = 100.0 * (1 - len(entities_with_gaps) / len(sample_entities))
723
+
724
+ return {
725
+ 'entities_with_gaps': len(entities_with_gaps),
726
+ 'total_gaps': total_gaps,
727
+ 'max_gap': max_gap,
728
+ 'examples': gap_examples,
729
+ 'coverage': coverage,
730
+ 'frequency_consistent': len(entities_with_gaps) < len(sample_entities) * 0.1,
731
+ 'frequency_deviation': 0.0,
732
+ 'expected_periods': 0,
733
+ 'actual_periods': 0
734
+ }
735
+
736
+ def _frequency_to_timedelta(self, frequency: str) -> Optional[timedelta]:
737
+ """Convert frequency string to timedelta."""
738
+ freq_map = {
739
+ 'hourly': timedelta(hours=1),
740
+ 'daily': timedelta(days=1),
741
+ 'weekly': timedelta(weeks=1),
742
+ 'monthly': timedelta(days=30),
743
+ 'quarterly': timedelta(days=91),
744
+ 'yearly': timedelta(days=365),
745
+ }
746
+ return freq_map.get(frequency.lower())
747
+
748
+ def _estimate_interval(
749
+ self,
750
+ df: DataFrame,
751
+ entity_column: str
752
+ ) -> Optional[timedelta]:
753
+ """Estimate the typical interval from the data."""
754
+ intervals = []
755
+
756
+ sample_entities = df[entity_column].unique()[:100]
757
+
758
+ for entity in sample_entities:
759
+ entity_data = df[df[entity_column] == entity]['_ts'].dropna().sort_values()
760
+ if len(entity_data) < 2:
761
+ continue
762
+
763
+ diffs = entity_data.diff().dropna()
764
+ intervals.extend(diffs.tolist())
765
+
766
+ if not intervals:
767
+ return None
768
+
769
+ return pd.Series(intervals).median()