tsagentkit 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. tsagentkit/__init__.py +126 -0
  2. tsagentkit/anomaly/__init__.py +130 -0
  3. tsagentkit/backtest/__init__.py +48 -0
  4. tsagentkit/backtest/engine.py +788 -0
  5. tsagentkit/backtest/metrics.py +244 -0
  6. tsagentkit/backtest/report.py +342 -0
  7. tsagentkit/calibration/__init__.py +136 -0
  8. tsagentkit/contracts/__init__.py +133 -0
  9. tsagentkit/contracts/errors.py +275 -0
  10. tsagentkit/contracts/results.py +418 -0
  11. tsagentkit/contracts/schema.py +44 -0
  12. tsagentkit/contracts/task_spec.py +300 -0
  13. tsagentkit/covariates/__init__.py +340 -0
  14. tsagentkit/eval/__init__.py +285 -0
  15. tsagentkit/features/__init__.py +20 -0
  16. tsagentkit/features/covariates.py +328 -0
  17. tsagentkit/features/extra/__init__.py +5 -0
  18. tsagentkit/features/extra/native.py +179 -0
  19. tsagentkit/features/factory.py +187 -0
  20. tsagentkit/features/matrix.py +159 -0
  21. tsagentkit/features/tsfeatures_adapter.py +115 -0
  22. tsagentkit/features/versioning.py +203 -0
  23. tsagentkit/hierarchy/__init__.py +39 -0
  24. tsagentkit/hierarchy/aggregation.py +62 -0
  25. tsagentkit/hierarchy/evaluator.py +400 -0
  26. tsagentkit/hierarchy/reconciliation.py +232 -0
  27. tsagentkit/hierarchy/structure.py +453 -0
  28. tsagentkit/models/__init__.py +182 -0
  29. tsagentkit/models/adapters/__init__.py +83 -0
  30. tsagentkit/models/adapters/base.py +321 -0
  31. tsagentkit/models/adapters/chronos.py +387 -0
  32. tsagentkit/models/adapters/moirai.py +256 -0
  33. tsagentkit/models/adapters/registry.py +171 -0
  34. tsagentkit/models/adapters/timesfm.py +440 -0
  35. tsagentkit/models/baselines.py +207 -0
  36. tsagentkit/models/sktime.py +307 -0
  37. tsagentkit/monitoring/__init__.py +51 -0
  38. tsagentkit/monitoring/alerts.py +302 -0
  39. tsagentkit/monitoring/coverage.py +203 -0
  40. tsagentkit/monitoring/drift.py +330 -0
  41. tsagentkit/monitoring/report.py +214 -0
  42. tsagentkit/monitoring/stability.py +275 -0
  43. tsagentkit/monitoring/triggers.py +423 -0
  44. tsagentkit/qa/__init__.py +347 -0
  45. tsagentkit/router/__init__.py +37 -0
  46. tsagentkit/router/bucketing.py +489 -0
  47. tsagentkit/router/fallback.py +132 -0
  48. tsagentkit/router/plan.py +23 -0
  49. tsagentkit/router/router.py +271 -0
  50. tsagentkit/series/__init__.py +26 -0
  51. tsagentkit/series/alignment.py +206 -0
  52. tsagentkit/series/dataset.py +449 -0
  53. tsagentkit/series/sparsity.py +261 -0
  54. tsagentkit/series/validation.py +393 -0
  55. tsagentkit/serving/__init__.py +39 -0
  56. tsagentkit/serving/orchestration.py +943 -0
  57. tsagentkit/serving/packaging.py +73 -0
  58. tsagentkit/serving/provenance.py +317 -0
  59. tsagentkit/serving/tsfm_cache.py +214 -0
  60. tsagentkit/skill/README.md +135 -0
  61. tsagentkit/skill/__init__.py +8 -0
  62. tsagentkit/skill/recipes.md +429 -0
  63. tsagentkit/skill/tool_map.md +21 -0
  64. tsagentkit/time/__init__.py +134 -0
  65. tsagentkit/utils/__init__.py +20 -0
  66. tsagentkit/utils/quantiles.py +83 -0
  67. tsagentkit/utils/signature.py +47 -0
  68. tsagentkit/utils/temporal.py +41 -0
  69. tsagentkit-1.0.2.dist-info/METADATA +371 -0
  70. tsagentkit-1.0.2.dist-info/RECORD +72 -0
  71. tsagentkit-1.0.2.dist-info/WHEEL +4 -0
  72. tsagentkit-1.0.2.dist-info/licenses/LICENSE +201 -0
tsagentkit/series/dataset.py
@@ -0,0 +1,449 @@
+ """TSDataset implementation.
+
+ Immutable wrapper around DataFrame for time series data with
+ guaranteed schema and temporal integrity.
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field, replace
+ from typing import TYPE_CHECKING, Any
+
+ import pandas as pd
+
+ from tsagentkit.contracts import PanelContract, TaskSpec, validate_contract
+ from tsagentkit.series.validation import normalize_panel_columns
+
+ if TYPE_CHECKING:
+     from tsagentkit.covariates import AlignedDataset, CovariateBundle
+     from tsagentkit.hierarchy import HierarchyStructure
+
+ from .sparsity import SparsityProfile, compute_sparsity_profile
+
+
+ @dataclass(frozen=True)
+ class TSDataset:
+     """Immutable time series dataset container.
+
+     Wraps a DataFrame with guaranteed schema and provides methods
+     for time series operations. All operations return new instances.
+
+     Attributes:
+         df: The underlying DataFrame (guaranteed sorted by unique_id, ds)
+         task_spec: Task specification for this dataset
+         sparsity_profile: Computed sparsity profile
+         metadata: Additional dataset metadata
+
+     Examples:
+         >>> from tsagentkit import TaskSpec
+         >>> import pandas as pd
+         >>> df = pd.DataFrame({
+         ...     "unique_id": ["A", "A", "B", "B"],
+         ...     "ds": pd.date_range("2024-01-01", periods=4, freq="D"),
+         ...     "y": [1.0, 2.0, 3.0, 4.0],
+         ... })
+         >>> spec = TaskSpec(h=7, freq="D")
+         >>> dataset = TSDataset.from_dataframe(df, spec)
+     """
+
+     df: pd.DataFrame
+     task_spec: TaskSpec
+     sparsity_profile: SparsityProfile | None = field(default=None)
+     metadata: dict[str, Any] = field(default_factory=dict)
+     hierarchy: HierarchyStructure | None = field(default=None)
+     static_x: pd.DataFrame | None = field(default=None)
+     past_x: pd.DataFrame | None = field(default=None)
+     future_x: pd.DataFrame | None = field(default=None)
+     future_index: pd.DataFrame | None = field(default=None)
+     covariate_spec: Any | None = field(default=None)
+     covariate_bundle: CovariateBundle | None = field(default=None)
+     panel_with_covariates: pd.DataFrame | None = field(default=None)
+
+     def __post_init__(self) -> None:
+         """Validate the dataset after creation."""
+         # Since dataclass is frozen, we can't modify, but we can validate
+         required_cols = {"unique_id", "ds", "y"}
+         missing = required_cols - set(self.df.columns)
+         if missing:
+             raise ValueError(f"TSDataset missing required columns: {missing}")
+
+         # Ensure datetime type
+         if not pd.api.types.is_datetime64_any_dtype(self.df["ds"]):
+             raise ValueError("Column 'ds' must be datetime type")
+
+     @staticmethod
+     def _normalize_panel_columns(
+         df: pd.DataFrame,
+         contract: PanelContract,
+     ) -> tuple[pd.DataFrame, dict[str, str] | None]:
+         return normalize_panel_columns(df, contract)
+
+     @classmethod
+     def from_dataframe(
+         cls,
+         data: pd.DataFrame,
+         task_spec: TaskSpec,
+         validate: bool = True,
+         compute_sparsity: bool = True,
+     ) -> TSDataset:
+         """Create TSDataset from DataFrame.
+
+         Args:
+             data: Input DataFrame
+             task_spec: Task specification
+             validate: Whether to validate input (default: True)
+             compute_sparsity: Whether to compute sparsity profile (default: True)
+
+         Returns:
+             New TSDataset instance
+
+         Raises:
+             ValueError: If validation fails
+         """
+         df = data.copy()
+         contract = task_spec.panel_contract
+
+         # Validate if requested
+         if validate:
+             report, df = validate_contract(
+                 df,
+                 panel_contract=contract,
+                 apply_aggregation=True,
+                 return_data=True,
+             )
+             if not report.valid:
+                 report.raise_if_errors()
+
+         # Normalize to canonical column names
+         df, column_map = cls._normalize_panel_columns(df, contract)
+         if column_map:
+             task_spec = task_spec.model_copy(update={"panel_contract": PanelContract()})
+
+         # Ensure datetime
+         if not pd.api.types.is_datetime64_any_dtype(df["ds"]):
+             df["ds"] = pd.to_datetime(df["ds"])
+
+         # Sort by unique_id, ds
+         df = df.sort_values(["unique_id", "ds"]).reset_index(drop=True)
+
+         # Compute sparsity profile
+         sparsity = None
+         if compute_sparsity:
+             sparsity = compute_sparsity_profile(df)
+
+         return cls(
+             df=df,
+             task_spec=task_spec,
+             sparsity_profile=sparsity,
+             metadata={
+                 "panel_contract": contract.model_dump() if hasattr(contract, "model_dump") else {},
+                 "column_map": column_map or {},
+             },
+         )
+
+     @property
+     def n_series(self) -> int:
+         """Number of unique series."""
+         return self.df["unique_id"].nunique()
+
+     @property
+     def n_observations(self) -> int:
+         """Total number of observations."""
+         return len(self.df)
+
+     @property
+     def date_range(self) -> tuple[pd.Timestamp, pd.Timestamp]:
+         """Date range of the dataset."""
+         return (self.df["ds"].min(), self.df["ds"].max())
+
+     @property
+     def series_ids(self) -> list[str]:
+         """List of unique series IDs."""
+         return sorted(self.df["unique_id"].unique().tolist())
+
+     @property
+     def freq(self) -> str:
+         """Frequency from task spec."""
+         return self.task_spec.freq
+
+     def get_series(self, unique_id: str) -> pd.DataFrame:
+         """Get data for a single series.
+
+         Args:
+             unique_id: Series identifier
+
+         Returns:
+             DataFrame with series data
+         """
+         return self.df[self.df["unique_id"] == unique_id].copy()
+
+     def filter_series(self, series_ids: list[str]) -> TSDataset:
+         """Create new dataset with only specified series.
+
+         Args:
+             series_ids: List of series IDs to keep
+
+         Returns:
+             New TSDataset instance
+         """
+         mask = self.df["unique_id"].isin(series_ids)
+         new_df = self.df[mask].copy()
+
+         return TSDataset(
+             df=new_df,
+             task_spec=self.task_spec,
+             sparsity_profile=self.sparsity_profile,  # Keep original profile
+             metadata=self.metadata.copy(),
+         )
+
+     def filter_dates(
+         self,
+         start: str | pd.Timestamp | None = None,
+         end: str | pd.Timestamp | None = None,
+     ) -> TSDataset:
+         """Create new dataset filtered by date range.
+
+         Args:
+             start: Start date (inclusive)
+             end: End date (inclusive)
+
+         Returns:
+             New TSDataset instance
+         """
+         mask = pd.Series(True, index=self.df.index)
+
+         if start is not None:
+             start_ts = pd.to_datetime(start)
+             mask &= self.df["ds"] >= start_ts
+
+         if end is not None:
+             end_ts = pd.to_datetime(end)
+             mask &= self.df["ds"] <= end_ts
+
+         new_df = self.df[mask].copy()
+
+         return TSDataset(
+             df=new_df,
+             task_spec=self.task_spec,
+             sparsity_profile=None,  # Need to recompute
+             metadata=self.metadata.copy(),
+         )
+
+     def split_train_test(
+         self,
+         test_size: int | None = None,
+         test_start: str | pd.Timestamp | None = None,
+     ) -> tuple[TSDataset, TSDataset]:
+         """Split dataset into train and test sets.
+
+         Temporal split - uses cutoff date or last N observations per series.
+
+         Args:
+             test_size: Number of observations for test (per series)
+             test_start: Start date for test set
+
+         Returns:
+             Tuple of (train_dataset, test_dataset)
+
+         Raises:
+             ValueError: If neither test_size nor test_start provided
+         """
+         if test_start is not None:
+             # Use date-based split
+             cutoff = pd.to_datetime(test_start)
+             train_mask = self.df["ds"] < cutoff
+         elif test_size is not None:
+             # Use last N observations per series
+             train_mask = pd.Series(False, index=self.df.index)
+
+             for uid in self.df["unique_id"].unique():
+                 series_idx = self.df[self.df["unique_id"] == uid].index
+                 if len(series_idx) > test_size:
+                     train_idx = series_idx[:-test_size]
+                     train_mask.loc[train_idx] = True
+         else:
+             raise ValueError("Must provide either test_size or test_start")
+
+         train_df = self.df[train_mask].copy()
+         test_df = self.df[~train_mask].copy()
+
+         train_ds = TSDataset(
+             df=train_df,
+             task_spec=self.task_spec,
+             sparsity_profile=None,  # Need to recompute
+             metadata=self.metadata.copy(),
+         )
+
+         test_ds = TSDataset(
+             df=test_df,
+             task_spec=self.task_spec,
+             sparsity_profile=None,
+             metadata=self.metadata.copy(),
+         )
+
+         return train_ds, test_ds
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert to dictionary for serialization.
+
+         Note: DataFrame is converted to records format.
+         """
+         return {
+             "df": self.df.to_dict("records"),
+             "task_spec": self.task_spec.model_dump(),
+             "sparsity_profile": self.sparsity_profile.series_profiles if self.sparsity_profile else None,
+             "metadata": self.metadata,
+             "hierarchy": self.hierarchy is not None,
+             "covariates": {
+                 "static_x_rows": int(len(self.static_x)) if self.static_x is not None else 0,
+                 "past_x_rows": int(len(self.past_x)) if self.past_x is not None else 0,
+                 "future_x_rows": int(len(self.future_x)) if self.future_x is not None else 0,
+                 "future_index_rows": int(len(self.future_index)) if self.future_index is not None else 0,
+                 "covariate_spec": (
+                     self.covariate_spec.model_dump()
+                     if hasattr(self.covariate_spec, "model_dump")
+                     else self.covariate_spec
+                 ),
+             },
+         }
+
+     def with_hierarchy(self, hierarchy: HierarchyStructure) -> TSDataset:
+         """Return new TSDataset with hierarchy attached.
+
+         Args:
+             hierarchy: Hierarchy structure to attach
+
+         Returns:
+             New TSDataset instance with hierarchy
+         """
+         return replace(self, hierarchy=hierarchy)
+
+     def with_covariates(
+         self,
+         aligned: AlignedDataset | None,
+         panel_with_covariates: pd.DataFrame | None = None,
+         covariate_bundle: CovariateBundle | None = None,
+     ) -> TSDataset:
+         """Return new TSDataset with covariates attached."""
+         if aligned is None:
+             return replace(
+                 self,
+                 static_x=None,
+                 past_x=None,
+                 future_x=None,
+                 future_index=None,
+                 covariate_spec=None,
+                 covariate_bundle=covariate_bundle,
+                 panel_with_covariates=panel_with_covariates,
+             )
+
+         return replace(
+             self,
+             static_x=aligned.static_x,
+             past_x=aligned.past_x,
+             future_x=aligned.future_x,
+             future_index=aligned.future_index,
+             covariate_spec=aligned.covariate_spec,
+             covariate_bundle=covariate_bundle,
+             panel_with_covariates=panel_with_covariates,
+         )
+
+     def is_hierarchical(self) -> bool:
+         """Check if dataset has hierarchy.
+
+         Returns:
+             True if hierarchy is attached
+         """
+         return self.hierarchy is not None
+
+     def get_level_series(self, level: int) -> list[str]:
+         """Get all series IDs at a specific hierarchy level.
+
+         Args:
+             level: Hierarchy level (0 = root)
+
+         Returns:
+             List of series IDs at that level
+         """
+         if not self.hierarchy:
+             return self.series_ids
+
+         return [
+             node for node in self.hierarchy.all_nodes
+             if self.hierarchy.get_level(node) == level
+         ]
+
+     def aggregate_to_level(self, target_level: int) -> TSDataset:
+         """Aggregate bottom-level data to target hierarchy level.
+
+         Args:
+             target_level: Target hierarchy level
+
+         Returns:
+             New TSDataset with aggregated data
+         """
+         if not self.hierarchy:
+             raise ValueError("Dataset does not have hierarchy")
+
+         target_nodes = self.hierarchy.get_nodes_at_level(target_level)
+
+         # Aggregate data for target nodes
+         aggregated_rows = []
+         for node in target_nodes:
+             # Find all bottom nodes that contribute to this node
+             bottom_contributors = []
+             for bottom_node in self.hierarchy.bottom_nodes:
+                 bottom_idx = self.hierarchy.all_nodes.index(bottom_node)
+                 node_idx = self.hierarchy.all_nodes.index(node)
+                 if self.hierarchy.s_matrix[node_idx, bottom_idx] == 1:
+                     bottom_contributors.append(bottom_node)
+
+             # Get data for contributors
+             node_data = self.df[self.df["unique_id"].isin(bottom_contributors)]
+
+             # Aggregate by date
+             if not node_data.empty:
+                 aggregated = node_data.groupby("ds")["y"].sum().reset_index()
+                 aggregated["unique_id"] = node
+                 aggregated_rows.append(aggregated)
+
+         if not aggregated_rows:
+             raise ValueError(f"No data found for level {target_level}")
+
+         new_df = pd.concat(aggregated_rows, ignore_index=True)
+
+         # Reorder columns to match expected format
+         new_df = new_df[["unique_id", "ds", "y"]]
+
+         return replace(
+             self,
+             df=new_df,
+             sparsity_profile=None,  # Need to recompute
+         )
+
+
+ def build_dataset(
+     data: pd.DataFrame,
+     task_spec: TaskSpec,
+     validate: bool = True,
+     compute_sparsity: bool = True,
+ ) -> TSDataset:
+     """Build a TSDataset from raw data.
+
+     Convenience function for creating TSDataset.
+
+     Args:
+         data: Input DataFrame
+         task_spec: Task specification
+         validate: Whether to validate input (default: True)
+         compute_sparsity: Whether to compute sparsity profile (default: True)
+
+     Returns:
+         New TSDataset instance
+     """
+     return TSDataset.from_dataframe(
+         data=data,
+         task_spec=task_spec,
+         validate=validate,
+         compute_sparsity=compute_sparsity,
+     )
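For orientation, a minimal usage sketch of the TSDataset API above. It is illustrative, not part of the packaged file, and assumes only what this module shows: the canonical unique_id/ds/y columns, the TaskSpec(h=7, freq="D") call from the class docstring, and the module path implied by tsagentkit/series/dataset.py; the commented values are what the code above implies, not verified output.

import pandas as pd

from tsagentkit import TaskSpec
from tsagentkit.series.dataset import TSDataset

# Toy panel in the canonical [unique_id, ds, y] layout.
df = pd.DataFrame({
    "unique_id": ["A", "A", "A", "B", "B", "B"],
    "ds": list(pd.date_range("2024-01-01", periods=3, freq="D")) * 2,
    "y": [1.0, 2.0, 0.0, 5.0, 4.0, 3.0],
})

spec = TaskSpec(h=7, freq="D")
dataset = TSDataset.from_dataframe(df, spec)

print(dataset.n_series, dataset.n_observations)  # 2 6
print(dataset.series_ids)                        # ['A', 'B']

# Temporal split: the last observation of each series becomes the test set.
train, test = dataset.split_train_test(test_size=1)
print(train.n_observations, test.n_observations)  # 4 2
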
tsagentkit/series/sparsity.py
@@ -0,0 +1,261 @@
+ """Sparsity profiling for time series.
+
+ Identifies intermittent, cold-start, and sparse series for the router.
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from enum import StrEnum
+
+ import numpy as np
+ import pandas as pd
+
+
+ class SparsityClass(StrEnum):
+     """Classification of series sparsity patterns."""
+
+     REGULAR = "regular"
+     """Regular series with consistent observations."""
+
+     INTERMITTENT = "intermittent"
+     """Series with many zero values (intermittent demand)."""
+
+     SPARSE = "sparse"
+     """Series with irregular gaps in observations."""
+
+     COLD_START = "cold_start"
+     """Series with very few observations (new series)."""
+
+
+ @dataclass(frozen=True)
+ class SparsityProfile:
+     """Sparsity profile for a time series dataset.
+
+     Contains classification and metrics for each series in the dataset,
+     used by the router for model selection.
+
+     Attributes:
+         series_profiles: Dict mapping unique_id to profile metrics
+         dataset_metrics: Aggregate metrics across all series
+     """
+
+     series_profiles: dict[str, dict]
+     dataset_metrics: dict
+
+     def get_classification(self, unique_id: str) -> SparsityClass:
+         """Get the sparsity classification for a series.
+
+         Args:
+             unique_id: The series identifier
+
+         Returns:
+             SparsityClass enum value
+         """
+         profile = self.series_profiles.get(unique_id, {})
+         return SparsityClass(profile.get("classification", "regular"))
+
+     def get_series_by_class(self, cls: SparsityClass) -> list[str]:
+         """Get all series IDs with a given classification.
+
+         Args:
+             cls: Sparsity class to filter by
+
+         Returns:
+             List of unique_id values
+         """
+         return [
+             uid
+             for uid, profile in self.series_profiles.items()
+             if profile.get("classification") == cls.value
+         ]
+
+     def has_intermittent(self) -> bool:
+         """Check if dataset has any intermittent series."""
+         return any(
+             p.get("classification") == SparsityClass.INTERMITTENT.value
+             for p in self.series_profiles.values()
+         )
+
+     def has_cold_start(self) -> bool:
+         """Check if dataset has any cold-start series."""
+         return any(
+             p.get("classification") == SparsityClass.COLD_START.value
+             for p in self.series_profiles.values()
+         )
+
+
+ def compute_sparsity_profile(
+     df: pd.DataFrame,
+     min_observations: int = 10,
+     zero_threshold: float = 0.3,
+     gap_threshold: float = 0.2,
+ ) -> SparsityProfile:
+     """Compute sparsity profile for a dataset.
+
+     Analyzes each series to classify as regular, intermittent, sparse,
+     or cold-start based on observation patterns.
+
+     Args:
+         df: DataFrame with columns [unique_id, ds, y]
+         min_observations: Minimum observations for non-cold-start (default: 10)
+         zero_threshold: Threshold for zero ratio to be intermittent (default: 0.3)
+         gap_threshold: Threshold for gap ratio to be sparse (default: 0.2)
+
+     Returns:
+         SparsityProfile with classifications and metrics
+     """
+     series_profiles: dict[str, dict] = {}
+
+     for uid in df["unique_id"].unique():
+         series = df[df["unique_id"] == uid].sort_values("ds")
+         profile = _analyze_series(
+             series,
+             min_observations=min_observations,
+             zero_threshold=zero_threshold,
+             gap_threshold=gap_threshold,
+         )
+         series_profiles[uid] = profile
+
+     # Compute dataset-level metrics
+     dataset_metrics = _compute_dataset_metrics(series_profiles)
+
+     return SparsityProfile(
+         series_profiles=series_profiles,
+         dataset_metrics=dataset_metrics,
+     )
+ )
127
+
128
+
129
+ def _analyze_series(
130
+ series: pd.DataFrame,
131
+ min_observations: int,
132
+ zero_threshold: float,
133
+ gap_threshold: float,
134
+ ) -> dict:
135
+ """Analyze a single series for sparsity patterns.
136
+
137
+ Args:
138
+ series: DataFrame for a single series
139
+ min_observations: Minimum observations threshold
140
+ zero_threshold: Zero ratio threshold for intermittent
141
+ gap_threshold: Gap ratio threshold for sparse
142
+
143
+ Returns:
144
+ Dictionary with classification and metrics
145
+ """
146
+ y = series["y"].values
147
+ n = len(y)
148
+
149
+ # Basic metrics
150
+ metrics = {
151
+ "n_observations": n,
152
+ "zero_ratio": float(np.mean(y == 0)) if len(y) > 0 else 0.0,
153
+ "missing_ratio": float(series["y"].isna().mean()),
154
+ }
155
+
156
+ # Compute gap metrics if we have datetime
157
+ if n > 1 and pd.api.types.is_datetime64_any_dtype(series["ds"]):
158
+ ds = pd.to_datetime(series["ds"])
159
+ time_diffs = ds.diff().dropna()
160
+
161
+ if len(time_diffs) > 0:
162
+ # Most common interval
163
+ mode_diff = time_diffs.mode()
164
+ if len(mode_diff) > 0:
165
+ expected_interval = mode_diff.iloc[0]
166
+ # Count gaps (intervals significantly larger than expected)
167
+ gap_count = int((time_diffs > expected_interval * 1.5).sum())
168
+ metrics["gap_count"] = gap_count
169
+ metrics["gap_ratio"] = gap_count / len(time_diffs) if len(time_diffs) > 0 else 0.0
170
+ else:
171
+ metrics["gap_count"] = 0
172
+ metrics["gap_ratio"] = 0.0
173
+ else:
174
+ metrics["gap_count"] = 0
175
+ metrics["gap_ratio"] = 0.0
176
+ else:
177
+ metrics["gap_count"] = 0
178
+ metrics["gap_ratio"] = 0.0
179
+
180
+ # Classification logic
181
+ classification = _classify_series(
182
+ metrics, n, min_observations, zero_threshold, gap_threshold
183
+ )
184
+ metrics["classification"] = classification
185
+
186
+ return metrics
187
+
188
+
189
+ def _classify_series(
190
+ metrics: dict,
191
+ n: int,
192
+ min_observations: int,
193
+ zero_threshold: float,
194
+ gap_threshold: float,
195
+ ) -> str:
196
+ """Classify a series based on its metrics.
197
+
198
+ Classification order matters:
199
+ 1. Cold start (too few observations)
200
+ 2. Intermittent (many zeros)
201
+ 3. Sparse (many gaps)
202
+ 4. Regular (default)
203
+
204
+ Args:
205
+ metrics: Series metrics dictionary
206
+ n: Number of observations
207
+ min_observations: Threshold for cold-start
208
+ zero_threshold: Threshold for intermittent
209
+ gap_threshold: Threshold for sparse
210
+
211
+ Returns:
212
+ Classification string
213
+ """
214
+ # Cold start: too few observations
215
+ if n < min_observations:
216
+ return SparsityClass.COLD_START.value
217
+
218
+ # Intermittent: many zero values
219
+ if metrics.get("zero_ratio", 0) > zero_threshold:
220
+ return SparsityClass.INTERMITTENT.value
221
+
222
+ # Sparse: significant gaps
223
+ if metrics.get("gap_ratio", 0) > gap_threshold:
224
+ return SparsityClass.SPARSE.value
225
+
226
+ # Default: regular
227
+ return SparsityClass.REGULAR.value
228
+
229
+
230
+ def _compute_dataset_metrics(series_profiles: dict[str, dict]) -> dict:
231
+ """Compute aggregate metrics across all series.
232
+
233
+ Args:
234
+ series_profiles: Dict of series profiles
235
+
236
+ Returns:
237
+ Dictionary of dataset-level metrics
238
+ """
239
+ if not series_profiles:
240
+ return {
241
+ "total_series": 0,
242
+ "classification_counts": {},
243
+ "avg_observations": 0.0,
244
+ }
245
+
246
+ # Count classifications
247
+ class_counts: dict[str, int] = {}
248
+ for profile in series_profiles.values():
249
+ cls = profile.get("classification", "unknown")
250
+ class_counts[cls] = class_counts.get(cls, 0) + 1
251
+
252
+ # Average observations
253
+ avg_obs = sum(p.get("n_observations", 0) for p in series_profiles.values()) / len(
254
+ series_profiles
255
+ )
256
+
257
+ return {
258
+ "total_series": len(series_profiles),
259
+ "classification_counts": class_counts,
260
+ "avg_observations": avg_obs,
261
+ }
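A short sketch of how the thresholds above drive classification (cold start when a series has fewer than min_observations points, intermittent when zero_ratio exceeds 0.3, sparse when gap_ratio exceeds 0.2). Illustrative only: the import path follows tsagentkit/series/sparsity.py, the toy panel is invented, and the commented results are inferred from the code above, not from running the package.

import pandas as pd

from tsagentkit.series.sparsity import SparsityClass, compute_sparsity_profile

# Series "A": 12 daily points, 9 of them zero -> intermittent.
# Series "B": only 3 points -> cold start (below the default min_observations=10).
df = pd.DataFrame({
    "unique_id": ["A"] * 12 + ["B"] * 3,
    "ds": list(pd.date_range("2024-01-01", periods=12, freq="D"))
    + list(pd.date_range("2024-01-01", periods=3, freq="D")),
    "y": [0.0, 0.0, 1.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 2.0, 3.0],
})

profile = compute_sparsity_profile(df)

print(profile.get_classification("A") == SparsityClass.INTERMITTENT)  # True
print(profile.get_classification("B") == SparsityClass.COLD_START)    # True
print(profile.get_series_by_class(SparsityClass.INTERMITTENT))        # ['A']
print(profile.dataset_metrics["classification_counts"])  # {'intermittent': 1, 'cold_start': 1}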