openstef-core 4.0.0.dev1__tar.gz → 4.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/.gitignore +21 -2
  2. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/PKG-INFO +6 -4
  3. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/README.md +2 -2
  4. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/pyproject.toml +7 -3
  5. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/__init__.py +1 -1
  6. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/base_model.py +25 -3
  7. openstef_core-4.0.1/src/openstef_core/constants.py +10 -0
  8. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/datasets/__init__.py +4 -1
  9. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/datasets/mixins.py +4 -1
  10. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/datasets/timeseries_dataset.py +61 -6
  11. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/datasets/validated_datasets.py +202 -2
  12. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/datasets/validation.py +1 -1
  13. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/datasets/versioned_timeseries_dataset.py +3 -3
  14. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/exceptions.py +19 -1
  15. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/mixins/__init__.py +1 -1
  16. openstef_core-4.0.1/src/openstef_core/mixins/param_ranges.py +131 -0
  17. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/mixins/predictor.py +111 -12
  18. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/mixins/stateful.py +1 -1
  19. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/mixins/transform.py +3 -3
  20. openstef_core-4.0.1/src/openstef_core/testing.py +306 -0
  21. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/transforms/__init__.py +1 -1
  22. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/transforms/dataset_transforms.py +2 -2
  23. openstef_core-4.0.1/src/openstef_core/types.py +453 -0
  24. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/utils/__init__.py +1 -1
  25. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/utils/datetime.py +1 -1
  26. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/utils/invariants.py +1 -1
  27. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/utils/itertools.py +1 -1
  28. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/utils/multiprocessing.py +1 -1
  29. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/utils/pandas.py +36 -1
  30. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/utils/pydantic.py +3 -3
  31. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/__init__.py +1 -1
  32. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/datasets/test_mixins.py +1 -1
  33. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/datasets/test_timeseries_dataset.py +32 -4
  34. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/datasets/test_validation.py +1 -1
  35. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/datasets/test_versioned_timeseries_dataset.py +3 -3
  36. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/datasets/utils.py +1 -1
  37. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/mixins/test_stateful.py +1 -1
  38. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/mixins/test_transform.py +1 -1
  39. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/test_base_model.py +1 -1
  40. openstef_core-4.0.1/tests/unit/test_hyperparams_tuning.py +115 -0
  41. openstef_core-4.0.1/tests/unit/test_param_ranges.py +112 -0
  42. openstef_core-4.0.1/tests/unit/test_types.py +456 -0
  43. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/utils/test_datetime.py +1 -1
  44. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/utils/test_itertools.py +1 -1
  45. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/utils/test_multiprocessing.py +1 -1
  46. openstef_core-4.0.1/tests/unit/utils/test_pandas.py +56 -0
  47. openstef_core-4.0.0.dev1/src/openstef_core/testing.py +0 -131
  48. openstef_core-4.0.0.dev1/src/openstef_core/types.py +0 -302
  49. openstef_core-4.0.0.dev1/tests/unit/test_types.py +0 -82
  50. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/__init__.py +0 -0
  51. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/datasets/__init__.py +0 -0
  52. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/mixins/__init__.py +0 -0
  53. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/transforms/__init__.py +0 -0
  54. {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/utils/__init__.py +0 -0
@@ -1,4 +1,4 @@
1
- # SPDX-FileCopyrightText: 2017-2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com> # noqa E501>
1
+ # SPDX-FileCopyrightText: 2017-2025 Contributors to the OpenSTEF project <openstef@lfenergy.org> # noqa E501>
2
2
  # SPDX-License-Identifier: MPL-2.0
3
3
 
4
4
  # Core
@@ -67,6 +67,12 @@ dmypy.json
67
67
  # Sphinx
68
68
  docs/_build/
69
69
  docs/source/api/generated/
70
+ docs/source/tutorials/
71
+ docs/source/benchmarks/
72
+ docs/source/user_guide/**/quick_start_tutorial.py
73
+ docs/source/user_guide/**/feature_engineering_tutorial.py
74
+ docs/source/user_guide/**/datasets_tutorial.py
75
+ docs/source/user_guide/**/backtesting_tutorial.py
70
76
 
71
77
  # docs/_doctrees/
72
78
  # docs/_static_gen/
@@ -124,4 +130,17 @@ certificates/
124
130
  *.pkl
125
131
 
126
132
  # Benchmark outputs
127
- benchmark_results*/
133
+ benchmark_results*/
134
+
135
+ # Local dataset files
136
+ liander_dataset/
137
+
138
+ # Mlflow
139
+ /mlflow
140
+ /mlflow_artifacts_local
141
+
142
+ .github/instructions
143
+
144
+ # Jupyter notebook cache (myst-nb execution outputs)
145
+ .jupyter_cache/
146
+ docs/build.zip
@@ -1,12 +1,12 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: openstef-core
3
- Version: 4.0.0.dev1
3
+ Version: 4.0.1
4
4
  Summary: Core functionality for OpenSTEF, a framework for short-term energy forecasting.
5
5
  Project-URL: Documentation, https://openstef.github.io/openstef/index.html
6
6
  Project-URL: Homepage, https://lfenergy.org/projects/openstef/
7
7
  Project-URL: Issues, https://github.com/OpenSTEF/openstef/issues
8
8
  Project-URL: Repository, https://github.com/OpenSTEF/openstef
9
- Author-email: "Alliander N.V" <short.term.energy.forecasts@alliander.com>
9
+ Author-email: "Alliander N.V" <openstef@lfenergy.org>
10
10
  License-Expression: MPL-2.0
11
11
  Keywords: energy,forecasting,machinelearning
12
12
  Classifier: Development Status :: 5 - Production/Stable
@@ -22,12 +22,14 @@ Requires-Dist: pandas<3,>=2.3.1
22
22
  Requires-Dist: pyarrow>=21
23
23
  Requires-Dist: pydantic-extra-types<3,>=2.10.5
24
24
  Requires-Dist: pydantic<3,>=2.12.4
25
+ Provides-Extra: benchmark
26
+ Requires-Dist: huggingface-hub>=1.2.2; extra == 'benchmark'
25
27
  Description-Content-Type: text/markdown
26
28
 
27
29
  <!--
28
- SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
30
+ SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
29
31
 
30
32
  SPDX-License-Identifier: MPL-2.0
31
33
  -->
32
34
 
33
- # openstef-core
35
+ # openstef-core
@@ -1,7 +1,7 @@
1
1
  <!--
2
- SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
2
+ SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
3
3
 
4
4
  SPDX-License-Identifier: MPL-2.0
5
5
  -->
6
6
 
7
- # openstef-core
7
+ # openstef-core
@@ -1,4 +1,4 @@
1
- # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
1
+ # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
2
2
  #
3
3
  # SPDX-License-Identifier: MPL-2.0
4
4
 
@@ -9,13 +9,13 @@ requires = [ "hatchling" ]
9
9
 
10
10
  [project]
11
11
  name = "openstef-core"
12
- version = "4.0.0.dev1"
12
+ version = "4.0.1"
13
13
  description = "Core functionality for OpenSTEF, a framework for short-term energy forecasting."
14
14
  readme = "README.md"
15
15
  keywords = [ "energy", "forecasting", "machinelearning" ]
16
16
  license = "MPL-2.0"
17
17
  authors = [
18
- { name = "Alliander N.V", email = "short.term.energy.forecasts@alliander.com" },
18
+ { name = "Alliander N.V", email = "openstef@lfenergy.org" },
19
19
  ]
20
20
  requires-python = ">=3.12,<4.0"
21
21
  classifiers = [
@@ -36,6 +36,10 @@ dependencies = [
36
36
  "pydantic-extra-types>=2.10.5,<3",
37
37
  ]
38
38
 
39
+ optional-dependencies.benchmark = [
40
+ "huggingface-hub>=1.2.2",
41
+ ]
42
+
39
43
  urls.Documentation = "https://openstef.github.io/openstef/index.html"
40
44
  urls.Homepage = "https://lfenergy.org/projects/openstef/"
41
45
  urls.Issues = "https://github.com/OpenSTEF/openstef/issues"
@@ -1,4 +1,4 @@
1
- # SPDX-FileCopyrightText: 2017-2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
1
+ # SPDX-FileCopyrightText: 2017-2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
2
2
  #
3
3
  # SPDX-License-Identifier: MPL-2.0
4
4
  """Core functionality for OpenSTEF, a framework for short-term energy forecasting."""
@@ -1,4 +1,4 @@
1
- # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
1
+ # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
2
2
  #
3
3
  # SPDX-License-Identifier: MPL-2.0
4
4
 
@@ -15,7 +15,14 @@ from typing import Annotated, Any, Self
15
15
 
16
16
  import yaml
17
17
  from pydantic import BaseModel as PydanticBaseModel
18
- from pydantic import BeforeValidator, ConfigDict, GetCoreSchemaHandler, TypeAdapter
18
+ from pydantic import (
19
+ BeforeValidator,
20
+ ConfigDict,
21
+ GetCoreSchemaHandler,
22
+ GetJsonSchemaHandler,
23
+ TypeAdapter,
24
+ ValidationInfo,
25
+ )
19
26
  from pydantic_core import core_schema
20
27
 
21
28
 
@@ -115,7 +122,7 @@ class PydanticStringPrimitive:
115
122
  raise NotImplementedError("Subclasses must implement from_string")
116
123
 
117
124
  @classmethod
118
- def validate(cls, v: Any, _info: Any = None) -> Self: # noqa: ANN401
125
+ def validate(cls, v: Any, _info: ValidationInfo | None = None) -> Self: # noqa: ANN401
119
126
  """Validate and convert input to this type.
120
127
 
121
128
  Args:
@@ -150,6 +157,21 @@ class PydanticStringPrimitive:
150
157
  function=cls.validate, serialization=core_schema.plain_serializer_function_ser_schema(cls.__str__)
151
158
  )
152
159
 
160
+ @classmethod
161
+ def __get_pydantic_json_schema__( # noqa: PLW3201
162
+ cls,
163
+ _schema: core_schema.CoreSchema,
164
+ handler: GetJsonSchemaHandler,
165
+ ) -> dict[str, Any]:
166
+ """Generate JSON schema for OpenAPI / FastAPI compatibility.
167
+
168
+ All string-primitive types serialise as plain strings.
169
+
170
+ Returns:
171
+ JSON schema describing the type as a string.
172
+ """
173
+ return {"type": "string"}
174
+
153
175
  def __eq__(self, other: object) -> bool:
154
176
  """Check equality based on string representation.
155
177
 
@@ -0,0 +1,10 @@
1
+ # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
2
+ #
3
+ # SPDX-License-Identifier: MPL-2.0
4
+
5
+ """Shared constants for the openstef_core package."""
6
+
7
+ LIANDER_DATASET_REPO_ID = "OpenSTEF/liander2024-energy-forecasting-benchmark"
8
+
9
+
10
+ __all__ = ["LIANDER_DATASET_REPO_ID"]
@@ -1,4 +1,4 @@
1
- # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
1
+ # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
2
2
  #
3
3
  # SPDX-License-Identifier: MPL-2.0
4
4
 
@@ -9,6 +9,7 @@ It includes both simple time series datasets and versioned datasets that track d
9
9
  over time, enabling realistic backtesting and training and forecasting.
10
10
 
11
11
  The module supports:
12
+
12
13
  - Regular time series with consistent sampling intervals
13
14
  - Versioned time series that track when data became available
14
15
  - Validated datasets with domain-specific constraints
@@ -19,6 +20,7 @@ The module supports:
19
20
  from openstef_core.datasets.timeseries_dataset import TimeSeriesDataset, validate_horizons_present
20
21
  from openstef_core.datasets.validated_datasets import (
21
22
  EnergyComponentDataset,
23
+ EnsembleForecastDataset,
22
24
  ForecastDataset,
23
25
  ForecastInputDataset,
24
26
  )
@@ -26,6 +28,7 @@ from openstef_core.datasets.versioned_timeseries_dataset import VersionedTimeSer
26
28
 
27
29
  __all__ = [
28
30
  "EnergyComponentDataset",
31
+ "EnsembleForecastDataset",
29
32
  "ForecastDataset",
30
33
  "ForecastInputDataset",
31
34
  "TimeSeriesDataset",
@@ -1,4 +1,4 @@
1
- # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
1
+ # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
2
2
  #
3
3
  # SPDX-License-Identifier: MPL-2.0
4
4
 
@@ -9,6 +9,7 @@ datasets in OpenSTEF. Protocols enable type checking and documentation of expect
9
9
  behavior without requiring inheritance.
10
10
 
11
11
  Key protocols:
12
+
12
13
  - TimeSeries: Core interface for all time series datasets with filtering and versioning
13
14
  - DatasetMixin: Interface for dataset persistence operations
14
15
  """
@@ -36,6 +37,7 @@ class TimeSeriesMixin(Protocol):
36
37
  the dataset's temporal index, and filtering/versioning capabilities.
37
38
 
38
39
  Classes implementing this interface must provide:
40
+
39
41
  - Access to the datetime index
40
42
  - Sample interval information
41
43
  - Feature names list
@@ -158,6 +160,7 @@ class DatasetMixin(Protocol):
158
160
  and reconstructed exactly as they were saved.
159
161
 
160
162
  Classes implementing this mixin must:
163
+
161
164
  - Save all data and metadata necessary for complete reconstruction
162
165
  - Store metadata in parquet file attributes using attrs
163
166
  - Handle missing metadata gracefully with sensible defaults when loading
@@ -1,4 +1,4 @@
1
- # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
1
+ # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
2
2
  #
3
3
  # SPDX-License-Identifier: MPL-2.0
4
4
 
@@ -42,11 +42,18 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
42
42
  over time through either a horizon column or an available_at column.
43
43
 
44
44
  The dataset automatically detects versioning:
45
+
45
46
  - If a horizon column exists, data is versioned by forecast horizon
46
47
  - If an available_at column exists, data is versioned by availability time
47
48
  - Otherwise, data is treated as a regular time series
48
49
 
50
+ Columns whose names start with a double underscore (``__``) are treated as
51
+ internal/system columns: they are kept in ``data`` so transforms can pass
52
+ them along, but are excluded from ``feature_names`` so feature-aware
53
+ transforms ignore them.
54
+
49
55
  The dataset guarantees:
56
+
50
57
  - Data is sorted by timestamp in ascending order
51
58
  - Consistent sampling interval across all data points
52
59
  - DateTime index for temporal operations
@@ -57,7 +64,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
57
64
  available_at_column: Name of the column storing availability times (if versioned).
58
65
 
59
66
  Example:
60
- Create a simple time series dataset:
67
+ Create a simple time series dataset
61
68
 
62
69
  >>> import pandas as pd
63
70
  >>> from datetime import timedelta
@@ -71,7 +78,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
71
78
  >>> dataset.is_versioned
72
79
  False
73
80
 
74
- Create a versioned dataset with horizons:
81
+ Create a versioned dataset with horizons
75
82
 
76
83
  >>> data_with_horizon = pd.DataFrame({
77
84
  ... 'load': [100, 120],
@@ -104,10 +111,12 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
104
111
  horizon_column: str = "horizon",
105
112
  available_at_column: str = "available_at",
106
113
  is_sorted: bool = False,
114
+ check_frequency: bool = False,
107
115
  ) -> None:
108
116
  """Initialize a time series dataset.
109
117
 
110
118
  The dataset automatically detects whether it's versioned based on column presence:
119
+
111
120
  - If horizon_column exists: versioned by forecast horizon
112
121
  - If available_at_column exists: versioned by availability time
113
122
  - Otherwise: regular time series
@@ -118,10 +127,12 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
118
127
  horizon_column: Name of the column storing forecast horizons.
119
128
  available_at_column: Name of the column storing availability times.
120
129
  is_sorted: Whether the data is sorted by timestamp.
130
+ check_frequency: Whether to check that the data frequency matches sample_interval.
121
131
 
122
132
  Raises:
123
133
  TypeError: If data index is not a pandas DatetimeIndex or if versioning
124
134
  columns have incorrect types.
135
+ ValueError: If data frequency does not match sample_interval.
125
136
  """
126
137
  if not isinstance(data.index, pd.DatetimeIndex):
127
138
  raise TypeError("Data index must be a pandas DatetimeIndex.")
@@ -130,9 +141,15 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
130
141
  self.horizon_column = horizon_column
131
142
  self.available_at_column = available_at_column
132
143
  self._sample_interval = sample_interval
133
- self._internal_columns = set()
144
+ self._internal_columns = {col for col in data.columns if col.startswith("__")}
134
145
  data.index.name = self.index_name
135
146
 
147
+ # Check input data frequency matches sample_interval, only if there are enough data points to infer frequency
148
+ minimum_required_length = 2
149
+ if check_frequency and len(data) >= minimum_required_length and not self.frequency_matches(data.index):
150
+ msg = f"Data frequency does not match the sample_interval ({sample_interval})."
151
+ raise ValueError(msg)
152
+
136
153
  if self.horizon_column in data.columns:
137
154
  validate_timedelta_column(data[self.horizon_column])
138
155
  self._version_column = self.horizon_column
@@ -147,7 +164,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
147
164
  self._horizons = None
148
165
  else:
149
166
  self._version_column = None
150
- self._feature_names = data.columns.to_list()
167
+ self._feature_names = [col for col in data.columns if col not in self._internal_columns]
151
168
  self._horizons = None
152
169
 
153
170
  # Ensure invariants: data is sorted by timestamp
@@ -257,7 +274,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
257
274
  if available_at_series is None:
258
275
  return self
259
276
 
260
- cutoff = self.index.floor("D") - pd.Timedelta(available_at.lag_from_day)
277
+ cutoff = available_at.apply_index(self.index)
261
278
  data_filtered = self.data[available_at_series <= cutoff]
262
279
  return self._copy_with_data(data=data_filtered)
263
280
 
@@ -443,6 +460,44 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
443
460
  is_sorted=is_sorted,
444
461
  )
445
462
 
463
+ @staticmethod
464
+ def _infer_frequency(index: pd.DatetimeIndex) -> pd.Timedelta:
465
+ """Infer the frequency of a pandas DatetimeIndex if the freq attribute is not set.
466
+
467
+ This method calculates the most common time difference between consecutive timestamps,
468
+ which is more permissive of missing chunks of data than the pandas infer_freq method.
469
+
470
+ Args:
471
+ index (pd.DatetimeIndex): The datetime index to infer the frequency from.
472
+
473
+ Returns:
474
+ pd.Timedelta: The inferred frequency as a pandas Timedelta.
475
+
476
+ Raises:
477
+ ValueError: If the index has fewer than 2 timestamps.
478
+ """
479
+ minimum_required_length = 2
480
+ if len(index) < minimum_required_length:
481
+ raise ValueError("Cannot infer frequency from an index with fewer than 2 timestamps.")
482
+
483
+ # Calculate the differences between consecutive timestamps
484
+ deltas = index.to_series().drop_duplicates().sort_values().diff().dropna()
485
+
486
+ # Find the most common difference
487
+ return deltas.mode().iloc[0]
488
+
489
+ def frequency_matches(self, index: pd.DatetimeIndex) -> bool:
490
+ """Check if the frequency of the data matches the model frequency.
491
+
492
+ Args:
493
+ index (pd.DatetimeIndex): The data to check.
494
+
495
+ Returns:
496
+ bool: True if the frequencies match, False otherwise.
497
+ """
498
+ input_sample_interval = self._infer_frequency(index) if index.freq is None else index.freq
499
+ return input_sample_interval == self.sample_interval
500
+
446
501
 
447
502
  def validate_horizons_present(dataset: TimeSeriesDataset, horizons: list[LeadTime]) -> None:
448
503
  """Validate that the specified forecast horizons are present in the dataset.
@@ -1,4 +1,4 @@
1
- # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
1
+ # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
2
2
  #
3
3
  # SPDX-License-Identifier: MPL-2.0
4
4
 
@@ -12,6 +12,7 @@ validation to catch data quality issues early.
12
12
  from datetime import datetime, timedelta
13
13
  from typing import Self, override
14
14
 
15
+ import numpy as np
15
16
  import pandas as pd
16
17
 
17
18
  from openstef_core.datasets.timeseries_dataset import TimeSeriesDataset
@@ -19,6 +20,8 @@ from openstef_core.datasets.validation import validate_required_columns
19
20
  from openstef_core.exceptions import MissingColumnsError
20
21
  from openstef_core.types import EnergyComponentType, LeadTime, Quantile
21
22
 
23
+ ENSEMBLE_COLUMN_SEP: str = "__"
24
+
22
25
 
23
26
  class ForecastInputDataset(TimeSeriesDataset):
24
27
  """Time series dataset for forecasting with validated target column.
@@ -78,6 +81,7 @@ class ForecastInputDataset(TimeSeriesDataset):
78
81
  *,
79
82
  horizon_column: str = "horizon",
80
83
  available_at_column: str = "available_at",
84
+ check_frequency: bool = False,
81
85
  sample_weight_column: str = "sample_weight",
82
86
  target_column: str = "load",
83
87
  ) -> None:
@@ -95,6 +99,7 @@ class ForecastInputDataset(TimeSeriesDataset):
95
99
  sample_interval=sample_interval,
96
100
  horizon_column=horizon_column,
97
101
  available_at_column=available_at_column,
102
+ check_frequency=check_frequency,
98
103
  )
99
104
  self._internal_columns.add(self.sample_weight_column)
100
105
  self._feature_names = [col for col in self.data.columns if col not in self._internal_columns]
@@ -250,12 +255,14 @@ class ForecastDataset(TimeSeriesDataset):
250
255
  *,
251
256
  horizon_column: str = "horizon",
252
257
  available_at_column: str = "available_at",
258
+ standard_deviation_column: str = "stdev",
253
259
  ) -> None:
254
260
  if "forecast_start" in data.attrs:
255
261
  self.forecast_start = datetime.fromisoformat(data.attrs["forecast_start"])
256
262
  else:
257
263
  self.forecast_start = forecast_start if forecast_start is not None else data.index.min().to_pydatetime()
258
264
  self.target_column = data.attrs.get("target_column", target_column)
265
+ self.standard_deviation_column = data.attrs.get("standard_deviation_column", standard_deviation_column)
259
266
 
260
267
  super().__init__(
261
268
  data=data,
@@ -264,12 +271,42 @@ class ForecastDataset(TimeSeriesDataset):
264
271
  available_at_column=available_at_column,
265
272
  )
266
273
 
267
- quantile_feature_names = [col for col in self.feature_names if col != target_column]
274
+ exclude_columns = {target_column, standard_deviation_column}
275
+ quantile_feature_names = [col for col in self.feature_names if col not in exclude_columns]
268
276
  if not all(Quantile.is_valid_quantile_string(col) for col in quantile_feature_names):
269
277
  raise ValueError("All feature names must be valid quantile strings.")
270
278
 
271
279
  self.quantiles = [Quantile.parse(col) for col in quantile_feature_names]
272
280
 
281
+ @classmethod
282
+ def from_quantile_predictions(
283
+ cls,
284
+ predictions: np.ndarray,
285
+ index: pd.Index,
286
+ quantiles: list[Quantile],
287
+ sample_interval: timedelta,
288
+ *,
289
+ target_column: str = "load",
290
+ ) -> "ForecastDataset":
291
+ """Build a ``ForecastDataset`` from a raw predictions array.
292
+
293
+ Args:
294
+ predictions: Shape ``(n_samples, n_quantiles)``.
295
+ index: Time index for the predictions.
296
+ quantiles: Quantiles the model was trained on, in the same order as columns in *predictions*.
297
+ sample_interval: Temporal resolution of the dataset.
298
+ target_column: Name of the target column to attach to the dataset.
299
+
300
+ Returns:
301
+ ``ForecastDataset`` with quantile columns and the provided time index.
302
+ """
303
+ df = pd.DataFrame(
304
+ data=predictions,
305
+ index=index,
306
+ columns=[q.format() for q in quantiles],
307
+ )
308
+ return cls(data=df, sample_interval=sample_interval, target_column=target_column)
309
+
273
310
  @property
274
311
  def target_series(self) -> pd.Series | None:
275
312
  """Extract the target time series from the dataset.
@@ -296,6 +333,20 @@ class ForecastDataset(TimeSeriesDataset):
296
333
  raise MissingColumnsError(missing_columns=[median_col])
297
334
  return self.data[median_col]
298
335
 
336
+ @property
337
+ def standard_deviation_series(self) -> pd.Series:
338
+ """Extract the standard deviation series if it exists.
339
+
340
+ Returns:
341
+ Time series containing standard deviation values with original datetime index.
342
+
343
+ Raises:
344
+ MissingColumnsError: If the standard deviation column is not found.
345
+ """
346
+ if self.standard_deviation_column not in self.data.columns:
347
+ raise MissingColumnsError(missing_columns=[self.standard_deviation_column])
348
+ return self.data[self.standard_deviation_column] # pyright: ignore[reportUnknownVariableType]
349
+
299
350
  @property
300
351
  def quantiles_data(self) -> pd.DataFrame:
301
352
  """Extract DataFrame containing only the quantile forecast columns.
@@ -331,6 +382,7 @@ class ForecastDataset(TimeSeriesDataset):
331
382
  df = super().to_pandas()
332
383
  df.attrs["target_column"] = self.target_column
333
384
  df.attrs["forecast_start"] = self.forecast_start.isoformat()
385
+ df.attrs["standard_deviation_column"] = self.standard_deviation_column
334
386
  return df
335
387
 
336
388
  @classmethod
@@ -409,8 +461,156 @@ class EnergyComponentDataset(TimeSeriesDataset):
409
461
  )
410
462
 
411
463
 
464
+ class EnsembleForecastDataset(TimeSeriesDataset):
465
+ """First stage output format for ensemble forecasters."""
466
+
467
+ forecast_start: datetime
468
+ quantiles: list[Quantile]
469
+ forecaster_names: list[str]
470
+ target_column: str
471
+
472
+ @override
473
+ def __init__(
474
+ self,
475
+ data: pd.DataFrame,
476
+ sample_interval: timedelta = timedelta(minutes=15),
477
+ forecast_start: datetime | None = None,
478
+ target_column: str = "load",
479
+ *,
480
+ horizon_column: str = "horizon",
481
+ available_at_column: str = "available_at",
482
+ ) -> None:
483
+ if "forecast_start" in data.attrs:
484
+ self.forecast_start = datetime.fromisoformat(data.attrs["forecast_start"])
485
+ else:
486
+ self.forecast_start = forecast_start if forecast_start is not None else data.index.min().to_pydatetime()
487
+ self.target_column = data.attrs.get("target_column", target_column)
488
+
489
+ super().__init__(
490
+ data=data,
491
+ sample_interval=sample_interval,
492
+ horizon_column=horizon_column,
493
+ available_at_column=available_at_column,
494
+ )
495
+ quantile_feature_names = [col for col in self.feature_names if col != target_column]
496
+
497
+ self.forecaster_names, self.quantiles = self.get_learner_and_quantile(pd.Index(quantile_feature_names))
498
+ for name in self.forecaster_names:
499
+ if ENSEMBLE_COLUMN_SEP in name:
500
+ msg = f"Forecaster name '{name}' must not contain separator '{ENSEMBLE_COLUMN_SEP}'."
501
+ raise ValueError(msg)
502
+ n_cols = len(self.forecaster_names) * len(self.quantiles)
503
+ if len(data.columns) not in {n_cols + 1, n_cols}:
504
+ raise ValueError("Data columns do not match the expected number based on base forecasters and quantiles.")
505
+
506
+ @property
507
+ def target_series(self) -> pd.Series | None:
508
+ """Return the target series if available."""
509
+ if self.target_column in self.data.columns:
510
+ return self.data[self.target_column]
511
+ return None
512
+
513
+ @staticmethod
514
+ def get_learner_and_quantile(feature_names: pd.Index) -> tuple[list[str], list[Quantile]]:
515
+ """Extract base forecaster names and quantiles from feature names.
516
+
517
+ Column format is ``{learner}{ENSEMBLE_COLUMN_SEP}{quantile.format()}``,
518
+ e.g. ``lgbm__quantile_P50``.
519
+
520
+ Args:
521
+ feature_names: Index of feature names in the dataset.
522
+
523
+ Returns:
524
+ Tuple containing a list of base forecaster names and a list of quantiles.
525
+
526
+ Raises:
527
+ ValueError: If a column cannot be parsed or has an invalid quantile string.
528
+ """
529
+ forecasters: set[str] = set()
530
+ quantiles: set[Quantile] = set()
531
+
532
+ for feature_name in feature_names:
533
+ parts = feature_name.split(ENSEMBLE_COLUMN_SEP, maxsplit=1)
534
+ if len(parts) != 2: # noqa: PLR2004
535
+ msg = f"Column missing separator '{ENSEMBLE_COLUMN_SEP}': {feature_name}"
536
+ raise ValueError(msg)
537
+ learner_part, quantile_part = parts
538
+ if not Quantile.is_valid_quantile_string(quantile_part):
539
+ msg = f"Column has no valid quantile string: {feature_name}"
540
+ raise ValueError(msg)
541
+
542
+ forecasters.add(learner_part)
543
+ quantiles.add(Quantile.parse(quantile_part))
544
+
545
+ return list(forecasters), list(quantiles)
546
+
547
+ @classmethod
548
+ def from_forecast_datasets(
549
+ cls,
550
+ datasets: dict[str, ForecastDataset],
551
+ target_series: pd.Series | None = None,
552
+ sample_weights: pd.Series | None = None,
553
+ ) -> Self:
554
+ """Create an EnsembleForecastDataset from multiple ForecastDatasets.
555
+
556
+ Args:
557
+ datasets: Dict of ForecastDatasets to combine.
558
+ target_series: Optional target series to include in the dataset.
559
+ sample_weights: Optional sample weights series to include in the dataset.
560
+
561
+ Returns:
562
+ EnsembleForecastDataset combining all input datasets.
563
+ """
564
+ ds1 = next(iter(datasets.values()))
565
+ additional_columns: dict[str, pd.Series] = {}
566
+ if isinstance(ds1.target_series, pd.Series):
567
+ additional_columns[ds1.target_column] = ds1.target_series
568
+ elif target_series is not None:
569
+ additional_columns[ds1.target_column] = target_series
570
+
571
+ sample_weight_column = "sample_weight"
572
+ if sample_weights is not None:
573
+ additional_columns[sample_weight_column] = sample_weights
574
+
575
+ combined_data = pd.DataFrame({
576
+ f"{learner}{ENSEMBLE_COLUMN_SEP}{q.format()}": ds.data[q.format()]
577
+ for learner, ds in datasets.items()
578
+ for q in ds.quantiles
579
+ }).assign(**additional_columns)
580
+
581
+ return cls(
582
+ data=combined_data,
583
+ sample_interval=ds1.sample_interval,
584
+ forecast_start=ds1.forecast_start,
585
+ target_column=ds1.target_column,
586
+ )
587
+
588
+ def get_base_predictions_for_quantile(self, quantile: Quantile) -> ForecastInputDataset:
589
+ """Get base forecaster predictions for a specific quantile.
590
+
591
+ Args:
592
+ quantile: Quantile to select.
593
+
594
+ Returns:
595
+ ForecastInputDataset containing predictions from all base forecasters at the specified quantile.
596
+ """
597
+ selected_columns = [f"{learner}{ENSEMBLE_COLUMN_SEP}{quantile.format()}" for learner in self.forecaster_names]
598
+ selected_columns.append(self.target_column)
599
+ prediction_data = self.data[selected_columns].copy()
600
+ prediction_data.columns = [*self.forecaster_names, self.target_column]
601
+
602
+ return ForecastInputDataset(
603
+ data=prediction_data,
604
+ sample_interval=self.sample_interval,
605
+ target_column=self.target_column,
606
+ forecast_start=self.forecast_start,
607
+ )
608
+
609
+
412
610
  __all__ = [
611
+ "ENSEMBLE_COLUMN_SEP",
413
612
  "EnergyComponentDataset",
613
+ "EnsembleForecastDataset",
414
614
  "ForecastDataset",
415
615
  "ForecastInputDataset",
416
616
  ]
@@ -1,4 +1,4 @@
1
- # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
1
+ # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
2
2
  #
3
3
  # SPDX-License-Identifier: MPL-2.0
4
4