openstef-core 4.0.0.dev1__tar.gz → 4.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/.gitignore +26 -6
  2. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/PKG-INFO +6 -4
  3. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/README.md +2 -2
  4. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/pyproject.toml +9 -9
  5. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/__init__.py +1 -1
  6. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/base_model.py +29 -4
  7. openstef_core-4.1.0/src/openstef_core/constants.py +10 -0
  8. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/datasets/__init__.py +4 -1
  9. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/datasets/mixins.py +4 -1
  10. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/datasets/timeseries_dataset.py +67 -12
  11. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/datasets/validated_datasets.py +204 -2
  12. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/datasets/validation.py +2 -4
  13. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/datasets/versioned_timeseries_dataset.py +6 -9
  14. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/exceptions.py +31 -8
  15. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/mixins/__init__.py +1 -1
  16. openstef_core-4.1.0/src/openstef_core/mixins/param_ranges.py +131 -0
  17. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/mixins/predictor.py +111 -12
  18. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/mixins/stateful.py +5 -4
  19. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/mixins/transform.py +4 -3
  20. openstef_core-4.1.0/src/openstef_core/testing.py +306 -0
  21. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/transforms/__init__.py +1 -1
  22. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/transforms/dataset_transforms.py +2 -2
  23. openstef_core-4.1.0/src/openstef_core/types.py +455 -0
  24. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/utils/__init__.py +7 -1
  25. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/utils/datetime.py +1 -1
  26. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/utils/invariants.py +1 -1
  27. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/utils/itertools.py +1 -1
  28. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/utils/multiprocessing.py +3 -5
  29. openstef_core-4.1.0/src/openstef_core/utils/numpy.py +105 -0
  30. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/utils/pandas.py +37 -2
  31. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/utils/pydantic.py +3 -3
  32. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/__init__.py +1 -1
  33. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/datasets/test_mixins.py +15 -11
  34. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/datasets/test_timeseries_dataset.py +67 -29
  35. openstef_core-4.1.0/tests/unit/datasets/test_validated_datasets.py +382 -0
  36. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/datasets/test_validation.py +1 -1
  37. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/datasets/test_versioned_timeseries_dataset.py +138 -68
  38. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/datasets/utils.py +1 -1
  39. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/mixins/test_stateful.py +2 -2
  40. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/mixins/test_transform.py +2 -2
  41. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/test_base_model.py +1 -1
  42. openstef_core-4.1.0/tests/unit/test_hyperparams_tuning.py +115 -0
  43. openstef_core-4.1.0/tests/unit/test_param_ranges.py +112 -0
  44. openstef_core-4.1.0/tests/unit/test_types.py +466 -0
  45. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/utils/test_datetime.py +2 -2
  46. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/utils/test_itertools.py +1 -1
  47. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/utils/test_multiprocessing.py +1 -1
  48. openstef_core-4.1.0/tests/unit/utils/test_numpy.py +179 -0
  49. openstef_core-4.1.0/tests/unit/utils/test_pandas.py +56 -0
  50. openstef_core-4.0.0.dev1/src/openstef_core/testing.py +0 -131
  51. openstef_core-4.0.0.dev1/src/openstef_core/types.py +0 -302
  52. openstef_core-4.0.0.dev1/tests/unit/test_types.py +0 -82
  53. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/__init__.py +0 -0
  54. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/datasets/__init__.py +0 -0
  55. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/mixins/__init__.py +0 -0
  56. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/transforms/__init__.py +0 -0
  57. {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/utils/__init__.py +0 -0
@@ -1,4 +1,4 @@
1
- # SPDX-FileCopyrightText: 2017-2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com> # noqa E501>
1
+ # SPDX-FileCopyrightText: 2017-2025 Contributors to the OpenSTEF project <openstef@lfenergy.org> # noqa E501>
2
2
  # SPDX-License-Identifier: MPL-2.0
3
3
 
4
4
  # Core
@@ -35,10 +35,6 @@ MANIFEST
35
35
  # Ruff
36
36
  .ruff_cache/
37
37
 
38
- # Pyright
39
- .pyright/
40
- # pyright-report/
41
-
42
38
  # Test, coverage, tox
43
39
  .pytest_cache/
44
40
  .coverage
@@ -67,6 +63,14 @@ dmypy.json
67
63
  # Sphinx
68
64
  docs/_build/
69
65
  docs/source/api/generated/
66
+ docs/source/tutorials/
67
+ docs/source/benchmarks/
68
+ # Community health files materialized from OpenSTEF/.github at build time
69
+ docs/source/contribute/_community/
70
+ docs/source/user_guide/**/quick_start_tutorial.py
71
+ docs/source/user_guide/**/feature_engineering_tutorial.py
72
+ docs/source/user_guide/**/datasets_tutorial.py
73
+ docs/source/user_guide/**/backtesting_tutorial.py
70
74
 
71
75
  # docs/_doctrees/
72
76
  # docs/_static_gen/
@@ -124,4 +128,20 @@ certificates/
124
128
  *.pkl
125
129
 
126
130
  # Benchmark outputs
127
- benchmark_results*/
131
+ benchmark_results*/
132
+
133
+ # Local dataset files
134
+ liander_dataset/
135
+
136
+ # Deployment example run artifacts (MLflow store, forecasts, dataset, Celery/Airflow state)
137
+ openstef_deployment_runs/
138
+
139
+ # Mlflow
140
+ /mlflow
141
+ /mlflow_artifacts_local
142
+
143
+ .github/instructions
144
+
145
+ # Jupyter notebook cache (myst-nb execution outputs)
146
+ .jupyter_cache/
147
+ docs/build.zip
@@ -1,12 +1,12 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: openstef-core
3
- Version: 4.0.0.dev1
3
+ Version: 4.1.0
4
4
  Summary: Core functionality for OpenSTEF, a framework for short-term energy forecasting.
5
5
  Project-URL: Documentation, https://openstef.github.io/openstef/index.html
6
6
  Project-URL: Homepage, https://lfenergy.org/projects/openstef/
7
7
  Project-URL: Issues, https://github.com/OpenSTEF/openstef/issues
8
8
  Project-URL: Repository, https://github.com/OpenSTEF/openstef
9
- Author-email: "Alliander N.V" <short.term.energy.forecasts@alliander.com>
9
+ Author-email: "Alliander N.V" <openstef@lfenergy.org>
10
10
  License-Expression: MPL-2.0
11
11
  Keywords: energy,forecasting,machinelearning
12
12
  Classifier: Development Status :: 5 - Production/Stable
@@ -22,12 +22,14 @@ Requires-Dist: pandas<3,>=2.3.1
22
22
  Requires-Dist: pyarrow>=21
23
23
  Requires-Dist: pydantic-extra-types<3,>=2.10.5
24
24
  Requires-Dist: pydantic<3,>=2.12.4
25
+ Provides-Extra: benchmark
26
+ Requires-Dist: huggingface-hub>=1.2.2; extra == 'benchmark'
25
27
  Description-Content-Type: text/markdown
26
28
 
27
29
  <!--
28
- SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
30
+ SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
29
31
 
30
32
  SPDX-License-Identifier: MPL-2.0
31
33
  -->
32
34
 
33
- # openstef-core
35
+ # openstef-core
@@ -1,7 +1,7 @@
1
1
  <!--
2
- SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
2
+ SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
3
3
 
4
4
  SPDX-License-Identifier: MPL-2.0
5
5
  -->
6
6
 
7
- # openstef-core
7
+ # openstef-core
@@ -1,21 +1,19 @@
1
- # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
1
+ # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
2
2
  #
3
3
  # SPDX-License-Identifier: MPL-2.0
4
-
5
4
  [build-system]
6
5
  build-backend = "hatchling.build"
7
-
8
6
  requires = [ "hatchling" ]
9
7
 
10
8
  [project]
11
9
  name = "openstef-core"
12
- version = "4.0.0.dev1"
10
+ version = "4.1.0"
13
11
  description = "Core functionality for OpenSTEF, a framework for short-term energy forecasting."
14
12
  readme = "README.md"
15
13
  keywords = [ "energy", "forecasting", "machinelearning" ]
16
14
  license = "MPL-2.0"
17
15
  authors = [
18
- { name = "Alliander N.V", email = "short.term.energy.forecasts@alliander.com" },
16
+ { name = "Alliander N.V", email = "openstef@lfenergy.org" },
19
17
  ]
20
18
  requires-python = ">=3.12,<4.0"
21
19
  classifiers = [
@@ -26,20 +24,22 @@ classifiers = [
26
24
  "Programming Language :: Python :: 3.13",
27
25
  "Programming Language :: Python :: 3.14",
28
26
  ]
29
-
30
27
  dependencies = [
31
28
  "joblib>=1,<2",
32
29
  "numpy>=2.3.2,<3",
30
+ # Held at <3 pending the pandas 3.0 Copy-on-Write migration (see migration issue).
33
31
  "pandas>=2.3.1,<3",
34
32
  "pyarrow>=21",
35
33
  "pydantic>=2.12.4,<3",
36
34
  "pydantic-extra-types>=2.10.5,<3",
37
35
  ]
38
-
36
+ optional-dependencies.benchmark = [
37
+ "huggingface-hub>=1.2.2",
38
+ ]
39
39
  urls.Documentation = "https://openstef.github.io/openstef/index.html"
40
40
  urls.Homepage = "https://lfenergy.org/projects/openstef/"
41
41
  urls.Issues = "https://github.com/OpenSTEF/openstef/issues"
42
42
  urls.Repository = "https://github.com/OpenSTEF/openstef"
43
43
 
44
- [tool.hatch.build.targets.wheel]
45
- packages = [ "src/openstef_core" ]
44
+ [tool.hatch]
45
+ build.targets.wheel.packages = [ "src/openstef_core" ]
@@ -1,4 +1,4 @@
1
- # SPDX-FileCopyrightText: 2017-2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
1
+ # SPDX-FileCopyrightText: 2017-2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
2
2
  #
3
3
  # SPDX-License-Identifier: MPL-2.0
4
4
  """Core functionality for OpenSTEF, a framework for short-term energy forecasting."""
@@ -1,4 +1,4 @@
1
- # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
1
+ # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
2
2
  #
3
3
  # SPDX-License-Identifier: MPL-2.0
4
4
 
@@ -11,11 +11,18 @@ operate on arbitrary config instances or Pydantic models / adapters.
11
11
  """
12
12
 
13
13
  from pathlib import Path
14
- from typing import Annotated, Any, Self
14
+ from typing import Annotated, Any, Self, override
15
15
 
16
16
  import yaml
17
17
  from pydantic import BaseModel as PydanticBaseModel
18
- from pydantic import BeforeValidator, ConfigDict, GetCoreSchemaHandler, TypeAdapter
18
+ from pydantic import (
19
+ BeforeValidator,
20
+ ConfigDict,
21
+ GetCoreSchemaHandler,
22
+ GetJsonSchemaHandler,
23
+ TypeAdapter,
24
+ ValidationInfo,
25
+ )
19
26
  from pydantic_core import core_schema
20
27
 
21
28
 
@@ -105,6 +112,7 @@ def read_yaml_config[T: BaseConfig, U](path: Path, class_type: type[T] | TypeAda
105
112
  class PydanticStringPrimitive:
106
113
  """Base class for Pydantic-compatible types with string serialization."""
107
114
 
115
+ @override
108
116
  def __str__(self) -> str:
109
117
  """Convert to string representation."""
110
118
  raise NotImplementedError("Subclasses must implement __str__")
@@ -115,7 +123,7 @@ class PydanticStringPrimitive:
115
123
  raise NotImplementedError("Subclasses must implement from_string")
116
124
 
117
125
  @classmethod
118
- def validate(cls, v: Any, _info: Any = None) -> Self: # noqa: ANN401
126
+ def validate(cls, v: Any, _info: ValidationInfo | None = None) -> Self: # noqa: ANN401
119
127
  """Validate and convert input to this type.
120
128
 
121
129
  Args:
@@ -150,6 +158,22 @@ class PydanticStringPrimitive:
150
158
  function=cls.validate, serialization=core_schema.plain_serializer_function_ser_schema(cls.__str__)
151
159
  )
152
160
 
161
+ @classmethod
162
+ def __get_pydantic_json_schema__(
163
+ cls,
164
+ _schema: core_schema.CoreSchema,
165
+ handler: GetJsonSchemaHandler,
166
+ ) -> dict[str, Any]:
167
+ """Generate JSON schema for OpenAPI / FastAPI compatibility.
168
+
169
+ All string-primitive types serialise as plain strings.
170
+
171
+ Returns:
172
+ JSON schema describing the type as a string.
173
+ """
174
+ return {"type": "string"}
175
+
176
+ @override
153
177
  def __eq__(self, other: object) -> bool:
154
178
  """Check equality based on string representation.
155
179
 
@@ -160,6 +184,7 @@ class PydanticStringPrimitive:
160
184
  return NotImplemented
161
185
  return str(self) == str(other)
162
186
 
187
+ @override
163
188
  def __hash__(self) -> int:
164
189
  """Return hash based on string representation."""
165
190
  return hash(str(self))
@@ -0,0 +1,10 @@
1
+ # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
2
+ #
3
+ # SPDX-License-Identifier: MPL-2.0
4
+
5
+ """Shared constants for the openstef_core package."""
6
+
7
+ LIANDER_DATASET_REPO_ID = "OpenSTEF/liander2024-energy-forecasting-benchmark"
8
+
9
+
10
+ __all__ = ["LIANDER_DATASET_REPO_ID"]
@@ -1,4 +1,4 @@
1
- # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
1
+ # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
2
2
  #
3
3
  # SPDX-License-Identifier: MPL-2.0
4
4
 
@@ -9,6 +9,7 @@ It includes both simple time series datasets and versioned datasets that track d
9
9
  over time, enabling realistic backtesting and training and forecasting.
10
10
 
11
11
  The module supports:
12
+
12
13
  - Regular time series with consistent sampling intervals
13
14
  - Versioned time series that track when data became available
14
15
  - Validated datasets with domain-specific constraints
@@ -19,6 +20,7 @@ The module supports:
19
20
  from openstef_core.datasets.timeseries_dataset import TimeSeriesDataset, validate_horizons_present
20
21
  from openstef_core.datasets.validated_datasets import (
21
22
  EnergyComponentDataset,
23
+ EnsembleForecastDataset,
22
24
  ForecastDataset,
23
25
  ForecastInputDataset,
24
26
  )
@@ -26,6 +28,7 @@ from openstef_core.datasets.versioned_timeseries_dataset import VersionedTimeSer
26
28
 
27
29
  __all__ = [
28
30
  "EnergyComponentDataset",
31
+ "EnsembleForecastDataset",
29
32
  "ForecastDataset",
30
33
  "ForecastInputDataset",
31
34
  "TimeSeriesDataset",
@@ -1,4 +1,4 @@
1
- # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
1
+ # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
2
2
  #
3
3
  # SPDX-License-Identifier: MPL-2.0
4
4
 
@@ -9,6 +9,7 @@ datasets in OpenSTEF. Protocols enable type checking and documentation of expect
9
9
  behavior without requiring inheritance.
10
10
 
11
11
  Key protocols:
12
+
12
13
  - TimeSeries: Core interface for all time series datasets with filtering and versioning
13
14
  - DatasetMixin: Interface for dataset persistence operations
14
15
  """
@@ -36,6 +37,7 @@ class TimeSeriesMixin(Protocol):
36
37
  the dataset's temporal index, and filtering/versioning capabilities.
37
38
 
38
39
  Classes implementing this interface must provide:
40
+
39
41
  - Access to the datetime index
40
42
  - Sample interval information
41
43
  - Feature names list
@@ -158,6 +160,7 @@ class DatasetMixin(Protocol):
158
160
  and reconstructed exactly as they were saved.
159
161
 
160
162
  Classes implementing this mixin must:
163
+
161
164
  - Save all data and metadata necessary for complete reconstruction
162
165
  - Store metadata in parquet file attributes using attrs
163
166
  - Handle missing metadata gracefully with sensible defaults when loading
@@ -1,4 +1,4 @@
1
- # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
1
+ # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
2
2
  #
3
3
  # SPDX-License-Identifier: MPL-2.0
4
4
 
@@ -33,7 +33,7 @@ from openstef_core.utils.pandas import unsafe_sorted_range_slice_idxs
33
33
  _logger = logging.getLogger(__name__)
34
34
 
35
35
 
36
- class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - important utility class, allow too many public methods
36
+ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin):
37
37
  """A time series dataset with regular sampling intervals and optional versioning.
38
38
 
39
39
  This class represents time series data with a consistent sampling interval
@@ -42,11 +42,18 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
42
42
  over time through either a horizon column or an available_at column.
43
43
 
44
44
  The dataset automatically detects versioning:
45
+
45
46
  - If a horizon column exists, data is versioned by forecast horizon
46
47
  - If an available_at column exists, data is versioned by availability time
47
48
  - Otherwise, data is treated as a regular time series
48
49
 
50
+ Columns whose names start with a double underscore (``__``) are treated as
51
+ internal/system columns: they are kept in ``data`` so transforms can pass
52
+ them along, but are excluded from ``feature_names`` so feature-aware
53
+ transforms ignore them.
54
+
49
55
  The dataset guarantees:
56
+
50
57
  - Data is sorted by timestamp in ascending order
51
58
  - Consistent sampling interval across all data points
52
59
  - DateTime index for temporal operations
@@ -57,7 +64,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
57
64
  available_at_column: Name of the column storing availability times (if versioned).
58
65
 
59
66
  Example:
60
- Create a simple time series dataset:
67
+ Create a simple time series dataset
61
68
 
62
69
  >>> import pandas as pd
63
70
  >>> from datetime import timedelta
@@ -71,7 +78,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
71
78
  >>> dataset.is_versioned
72
79
  False
73
80
 
74
- Create a versioned dataset with horizons:
81
+ Create a versioned dataset with horizons
75
82
 
76
83
  >>> data_with_horizon = pd.DataFrame({
77
84
  ... 'load': [100, 120],
@@ -104,10 +111,12 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
104
111
  horizon_column: str = "horizon",
105
112
  available_at_column: str = "available_at",
106
113
  is_sorted: bool = False,
114
+ check_frequency: bool = False,
107
115
  ) -> None:
108
116
  """Initialize a time series dataset.
109
117
 
110
118
  The dataset automatically detects whether it's versioned based on column presence:
119
+
111
120
  - If horizon_column exists: versioned by forecast horizon
112
121
  - If available_at_column exists: versioned by availability time
113
122
  - Otherwise: regular time series
@@ -118,10 +127,12 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
118
127
  horizon_column: Name of the column storing forecast horizons.
119
128
  available_at_column: Name of the column storing availability times.
120
129
  is_sorted: Whether the data is sorted by timestamp.
130
+ check_frequency: Whether to check that the data frequency matches sample_interval.
121
131
 
122
132
  Raises:
123
133
  TypeError: If data index is not a pandas DatetimeIndex or if versioning
124
134
  columns have incorrect types.
135
+ ValueError: If data frequency does not match sample_interval.
125
136
  """
126
137
  if not isinstance(data.index, pd.DatetimeIndex):
127
138
  raise TypeError("Data index must be a pandas DatetimeIndex.")
@@ -130,9 +141,15 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
130
141
  self.horizon_column = horizon_column
131
142
  self.available_at_column = available_at_column
132
143
  self._sample_interval = sample_interval
133
- self._internal_columns = set()
144
+ self._internal_columns = {col for col in data.columns if col.startswith("__")}
134
145
  data.index.name = self.index_name
135
146
 
147
+ # Check input data frequency matches sample_interval, only if there are enough data points to infer frequency
148
+ minimum_required_length = 2
149
+ if check_frequency and len(data) >= minimum_required_length and not self.frequency_matches(data.index):
150
+ msg = f"Data frequency does not match the sample_interval ({sample_interval})."
151
+ raise ValueError(msg)
152
+
136
153
  if self.horizon_column in data.columns:
137
154
  validate_timedelta_column(data[self.horizon_column])
138
155
  self._version_column = self.horizon_column
@@ -147,7 +164,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
147
164
  self._horizons = None
148
165
  else:
149
166
  self._version_column = None
150
- self._feature_names = data.columns.to_list()
167
+ self._feature_names = [col for col in data.columns if col not in self._internal_columns]
151
168
  self._horizons = None
152
169
 
153
170
  # Ensure invariants: data is sorted by timestamp
@@ -257,7 +274,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
257
274
  if available_at_series is None:
258
275
  return self
259
276
 
260
- cutoff = self.index.floor("D") - pd.Timedelta(available_at.lag_from_day)
277
+ cutoff = available_at.apply_index(self.index)
261
278
  data_filtered = self.data[available_at_series <= cutoff]
262
279
  return self._copy_with_data(data=data_filtered)
263
280
 
@@ -287,7 +304,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
287
304
  Returns:
288
305
  New dataset containing only rows with timestamps in the mask.
289
306
  """
290
- data_filtered = self.data.loc[self.index.isin(mask)] # pyright: ignore[reportUnknownMemberType]
307
+ data_filtered = self.data.loc[self.index.isin(mask)]
291
308
 
292
309
  return self._copy_with_data(data=data_filtered)
293
310
 
@@ -303,7 +320,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
303
320
  if self.horizons is None:
304
321
  return self
305
322
 
306
- data_selected = self.data[self.lead_time_series == horizon.value]
323
+ data_selected = cast(pd.DataFrame, self.data[self.lead_time_series == horizon.value])
307
324
  return self._copy_with_data(data=data_selected)
308
325
 
309
326
  def to_pandas(self) -> pd.DataFrame:
@@ -378,7 +395,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
378
395
  available_at_column: str = "available_at",
379
396
  horizon_column: str = "horizon",
380
397
  ) -> Self:
381
- df = pd.read_parquet(path=path) # pyright: ignore[reportUnknownMemberType]
398
+ df = pd.read_parquet(path=path)
382
399
  if not isinstance(df.index, pd.DatetimeIndex):
383
400
  if timestamp_column not in df.columns:
384
401
  raise TimeSeriesValidationError(
@@ -443,6 +460,44 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
443
460
  is_sorted=is_sorted,
444
461
  )
445
462
 
463
+ @staticmethod
464
+ def _infer_frequency(index: pd.DatetimeIndex) -> pd.Timedelta:
465
+ """Infer the frequency of a pandas DatetimeIndex if the freq attribute is not set.
466
+
467
+ This method calculates the most common time difference between consecutive timestamps,
468
+ which is more permissive of missing chunks of data than the pandas infer_freq method.
469
+
470
+ Args:
471
+ index (pd.DatetimeIndex): The datetime index to infer the frequency from.
472
+
473
+ Returns:
474
+ pd.Timedelta: The inferred frequency as a pandas Timedelta.
475
+
476
+ Raises:
477
+ ValueError: If the index has fewer than 2 timestamps.
478
+ """
479
+ minimum_required_length = 2
480
+ if len(index) < minimum_required_length:
481
+ raise ValueError("Cannot infer frequency from an index with fewer than 2 timestamps.")
482
+
483
+ # Calculate the differences between consecutive timestamps
484
+ deltas = index.to_series().drop_duplicates().sort_values().diff().dropna()
485
+
486
+ # Find the most common difference
487
+ return deltas.mode().iloc[0]
488
+
489
+ def frequency_matches(self, index: pd.DatetimeIndex) -> bool:
490
+ """Check if the frequency of the data matches the model frequency.
491
+
492
+ Args:
493
+ index (pd.DatetimeIndex): The data to check.
494
+
495
+ Returns:
496
+ bool: True if the frequencies match, False otherwise.
497
+ """
498
+ input_sample_interval = self._infer_frequency(index) if index.freq is None else index.freq
499
+ return input_sample_interval == self.sample_interval
500
+
446
501
 
447
502
  def validate_horizons_present(dataset: TimeSeriesDataset, horizons: list[LeadTime]) -> None:
448
503
  """Validate that the specified forecast horizons are present in the dataset.
@@ -457,8 +512,8 @@ def validate_horizons_present(dataset: TimeSeriesDataset, horizons: list[LeadTim
457
512
  if dataset.horizons is None and len(horizons) == 1:
458
513
  return # Non-versioned dataset can satisfy single-horizon requests
459
514
 
460
- required_horizons = set(horizons or [])
461
- missing_horizons = [h for h in horizons if h not in required_horizons]
515
+ available_horizons = set(dataset.horizons or [])
516
+ missing_horizons = [h for h in horizons if h not in available_horizons]
462
517
  if missing_horizons:
463
518
  raise TimeSeriesValidationError("Missing forecast horizons: " + ", ".join(map(str, missing_horizons)))
464
519