openstef-core 4.0.0.dev1__tar.gz → 4.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/.gitignore +21 -2
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/PKG-INFO +6 -4
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/README.md +2 -2
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/pyproject.toml +7 -3
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/__init__.py +1 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/base_model.py +25 -3
- openstef_core-4.0.1/src/openstef_core/constants.py +10 -0
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/datasets/__init__.py +4 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/datasets/mixins.py +4 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/datasets/timeseries_dataset.py +61 -6
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/datasets/validated_datasets.py +202 -2
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/datasets/validation.py +1 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/datasets/versioned_timeseries_dataset.py +3 -3
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/exceptions.py +19 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/mixins/__init__.py +1 -1
- openstef_core-4.0.1/src/openstef_core/mixins/param_ranges.py +131 -0
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/mixins/predictor.py +111 -12
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/mixins/stateful.py +1 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/mixins/transform.py +3 -3
- openstef_core-4.0.1/src/openstef_core/testing.py +306 -0
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/transforms/__init__.py +1 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/transforms/dataset_transforms.py +2 -2
- openstef_core-4.0.1/src/openstef_core/types.py +453 -0
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/utils/__init__.py +1 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/utils/datetime.py +1 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/utils/invariants.py +1 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/utils/itertools.py +1 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/utils/multiprocessing.py +1 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/utils/pandas.py +36 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/utils/pydantic.py +3 -3
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/__init__.py +1 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/datasets/test_mixins.py +1 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/datasets/test_timeseries_dataset.py +32 -4
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/datasets/test_validation.py +1 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/datasets/test_versioned_timeseries_dataset.py +3 -3
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/datasets/utils.py +1 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/mixins/test_stateful.py +1 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/mixins/test_transform.py +1 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/test_base_model.py +1 -1
- openstef_core-4.0.1/tests/unit/test_hyperparams_tuning.py +115 -0
- openstef_core-4.0.1/tests/unit/test_param_ranges.py +112 -0
- openstef_core-4.0.1/tests/unit/test_types.py +456 -0
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/utils/test_datetime.py +1 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/utils/test_itertools.py +1 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/utils/test_multiprocessing.py +1 -1
- openstef_core-4.0.1/tests/unit/utils/test_pandas.py +56 -0
- openstef_core-4.0.0.dev1/src/openstef_core/testing.py +0 -131
- openstef_core-4.0.0.dev1/src/openstef_core/types.py +0 -302
- openstef_core-4.0.0.dev1/tests/unit/test_types.py +0 -82
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/__init__.py +0 -0
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/datasets/__init__.py +0 -0
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/mixins/__init__.py +0 -0
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/transforms/__init__.py +0 -0
- {openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/tests/unit/utils/__init__.py +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: 2017-2025 Contributors to the OpenSTEF project <
|
|
1
|
+
# SPDX-FileCopyrightText: 2017-2025 Contributors to the OpenSTEF project <openstef@lfenergy.org> # noqa E501>
|
|
2
2
|
# SPDX-License-Identifier: MPL-2.0
|
|
3
3
|
|
|
4
4
|
# Core
|
|
@@ -67,6 +67,12 @@ dmypy.json
|
|
|
67
67
|
# Sphinx
|
|
68
68
|
docs/_build/
|
|
69
69
|
docs/source/api/generated/
|
|
70
|
+
docs/source/tutorials/
|
|
71
|
+
docs/source/benchmarks/
|
|
72
|
+
docs/source/user_guide/**/quick_start_tutorial.py
|
|
73
|
+
docs/source/user_guide/**/feature_engineering_tutorial.py
|
|
74
|
+
docs/source/user_guide/**/datasets_tutorial.py
|
|
75
|
+
docs/source/user_guide/**/backtesting_tutorial.py
|
|
70
76
|
|
|
71
77
|
# docs/_doctrees/
|
|
72
78
|
# docs/_static_gen/
|
|
@@ -124,4 +130,17 @@ certificates/
|
|
|
124
130
|
*.pkl
|
|
125
131
|
|
|
126
132
|
# Benchmark outputs
|
|
127
|
-
benchmark_results*/
|
|
133
|
+
benchmark_results*/
|
|
134
|
+
|
|
135
|
+
# Local dataset files
|
|
136
|
+
liander_dataset/
|
|
137
|
+
|
|
138
|
+
# Mlflow
|
|
139
|
+
/mlflow
|
|
140
|
+
/mlflow_artifacts_local
|
|
141
|
+
|
|
142
|
+
.github/instructions
|
|
143
|
+
|
|
144
|
+
# Jupyter notebook cache (myst-nb execution outputs)
|
|
145
|
+
.jupyter_cache/
|
|
146
|
+
docs/build.zip
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: openstef-core
|
|
3
|
-
Version: 4.0.
|
|
3
|
+
Version: 4.0.1
|
|
4
4
|
Summary: Core functionality for OpenSTEF, a framework for short-term energy forecasting.
|
|
5
5
|
Project-URL: Documentation, https://openstef.github.io/openstef/index.html
|
|
6
6
|
Project-URL: Homepage, https://lfenergy.org/projects/openstef/
|
|
7
7
|
Project-URL: Issues, https://github.com/OpenSTEF/openstef/issues
|
|
8
8
|
Project-URL: Repository, https://github.com/OpenSTEF/openstef
|
|
9
|
-
Author-email: "Alliander N.V" <
|
|
9
|
+
Author-email: "Alliander N.V" <openstef@lfenergy.org>
|
|
10
10
|
License-Expression: MPL-2.0
|
|
11
11
|
Keywords: energy,forecasting,machinelearning
|
|
12
12
|
Classifier: Development Status :: 5 - Production/Stable
|
|
@@ -22,12 +22,14 @@ Requires-Dist: pandas<3,>=2.3.1
|
|
|
22
22
|
Requires-Dist: pyarrow>=21
|
|
23
23
|
Requires-Dist: pydantic-extra-types<3,>=2.10.5
|
|
24
24
|
Requires-Dist: pydantic<3,>=2.12.4
|
|
25
|
+
Provides-Extra: benchmark
|
|
26
|
+
Requires-Dist: huggingface-hub>=1.2.2; extra == 'benchmark'
|
|
25
27
|
Description-Content-Type: text/markdown
|
|
26
28
|
|
|
27
29
|
<!--
|
|
28
|
-
SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <
|
|
30
|
+
SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
|
|
29
31
|
|
|
30
32
|
SPDX-License-Identifier: MPL-2.0
|
|
31
33
|
-->
|
|
32
34
|
|
|
33
|
-
# openstef-core
|
|
35
|
+
# openstef-core
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
<!--
|
|
2
|
-
SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <
|
|
2
|
+
SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
|
|
3
3
|
|
|
4
4
|
SPDX-License-Identifier: MPL-2.0
|
|
5
5
|
-->
|
|
6
6
|
|
|
7
|
-
# openstef-core
|
|
7
|
+
# openstef-core
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <
|
|
1
|
+
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: MPL-2.0
|
|
4
4
|
|
|
@@ -9,13 +9,13 @@ requires = [ "hatchling" ]
|
|
|
9
9
|
|
|
10
10
|
[project]
|
|
11
11
|
name = "openstef-core"
|
|
12
|
-
version = "4.0.
|
|
12
|
+
version = "4.0.1"
|
|
13
13
|
description = "Core functionality for OpenSTEF, a framework for short-term energy forecasting."
|
|
14
14
|
readme = "README.md"
|
|
15
15
|
keywords = [ "energy", "forecasting", "machinelearning" ]
|
|
16
16
|
license = "MPL-2.0"
|
|
17
17
|
authors = [
|
|
18
|
-
{ name = "Alliander N.V", email = "
|
|
18
|
+
{ name = "Alliander N.V", email = "openstef@lfenergy.org" },
|
|
19
19
|
]
|
|
20
20
|
requires-python = ">=3.12,<4.0"
|
|
21
21
|
classifiers = [
|
|
@@ -36,6 +36,10 @@ dependencies = [
|
|
|
36
36
|
"pydantic-extra-types>=2.10.5,<3",
|
|
37
37
|
]
|
|
38
38
|
|
|
39
|
+
optional-dependencies.benchmark = [
|
|
40
|
+
"huggingface-hub>=1.2.2",
|
|
41
|
+
]
|
|
42
|
+
|
|
39
43
|
urls.Documentation = "https://openstef.github.io/openstef/index.html"
|
|
40
44
|
urls.Homepage = "https://lfenergy.org/projects/openstef/"
|
|
41
45
|
urls.Issues = "https://github.com/OpenSTEF/openstef/issues"
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: 2017-2025 Contributors to the OpenSTEF project <
|
|
1
|
+
# SPDX-FileCopyrightText: 2017-2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: MPL-2.0
|
|
4
4
|
"""Core functionality for OpenSTEF, a framework for short-term energy forecasting."""
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <
|
|
1
|
+
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: MPL-2.0
|
|
4
4
|
|
|
@@ -15,7 +15,14 @@ from typing import Annotated, Any, Self
|
|
|
15
15
|
|
|
16
16
|
import yaml
|
|
17
17
|
from pydantic import BaseModel as PydanticBaseModel
|
|
18
|
-
from pydantic import
|
|
18
|
+
from pydantic import (
|
|
19
|
+
BeforeValidator,
|
|
20
|
+
ConfigDict,
|
|
21
|
+
GetCoreSchemaHandler,
|
|
22
|
+
GetJsonSchemaHandler,
|
|
23
|
+
TypeAdapter,
|
|
24
|
+
ValidationInfo,
|
|
25
|
+
)
|
|
19
26
|
from pydantic_core import core_schema
|
|
20
27
|
|
|
21
28
|
|
|
@@ -115,7 +122,7 @@ class PydanticStringPrimitive:
|
|
|
115
122
|
raise NotImplementedError("Subclasses must implement from_string")
|
|
116
123
|
|
|
117
124
|
@classmethod
|
|
118
|
-
def validate(cls, v: Any, _info:
|
|
125
|
+
def validate(cls, v: Any, _info: ValidationInfo | None = None) -> Self: # noqa: ANN401
|
|
119
126
|
"""Validate and convert input to this type.
|
|
120
127
|
|
|
121
128
|
Args:
|
|
@@ -150,6 +157,21 @@ class PydanticStringPrimitive:
|
|
|
150
157
|
function=cls.validate, serialization=core_schema.plain_serializer_function_ser_schema(cls.__str__)
|
|
151
158
|
)
|
|
152
159
|
|
|
160
|
+
@classmethod
|
|
161
|
+
def __get_pydantic_json_schema__( # noqa: PLW3201
|
|
162
|
+
cls,
|
|
163
|
+
_schema: core_schema.CoreSchema,
|
|
164
|
+
handler: GetJsonSchemaHandler,
|
|
165
|
+
) -> dict[str, Any]:
|
|
166
|
+
"""Generate JSON schema for OpenAPI / FastAPI compatibility.
|
|
167
|
+
|
|
168
|
+
All string-primitive types serialise as plain strings.
|
|
169
|
+
|
|
170
|
+
Returns:
|
|
171
|
+
JSON schema describing the type as a string.
|
|
172
|
+
"""
|
|
173
|
+
return {"type": "string"}
|
|
174
|
+
|
|
153
175
|
def __eq__(self, other: object) -> bool:
|
|
154
176
|
"""Check equality based on string representation.
|
|
155
177
|
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: MPL-2.0
|
|
4
|
+
|
|
5
|
+
"""Shared constants for the openstef_core package."""
|
|
6
|
+
|
|
7
|
+
LIANDER_DATASET_REPO_ID = "OpenSTEF/liander2024-energy-forecasting-benchmark"
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
__all__ = ["LIANDER_DATASET_REPO_ID"]
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <
|
|
1
|
+
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: MPL-2.0
|
|
4
4
|
|
|
@@ -9,6 +9,7 @@ It includes both simple time series datasets and versioned datasets that track d
|
|
|
9
9
|
over time, enabling realistic backtesting and training and forecasting.
|
|
10
10
|
|
|
11
11
|
The module supports:
|
|
12
|
+
|
|
12
13
|
- Regular time series with consistent sampling intervals
|
|
13
14
|
- Versioned time series that track when data became available
|
|
14
15
|
- Validated datasets with domain-specific constraints
|
|
@@ -19,6 +20,7 @@ The module supports:
|
|
|
19
20
|
from openstef_core.datasets.timeseries_dataset import TimeSeriesDataset, validate_horizons_present
|
|
20
21
|
from openstef_core.datasets.validated_datasets import (
|
|
21
22
|
EnergyComponentDataset,
|
|
23
|
+
EnsembleForecastDataset,
|
|
22
24
|
ForecastDataset,
|
|
23
25
|
ForecastInputDataset,
|
|
24
26
|
)
|
|
@@ -26,6 +28,7 @@ from openstef_core.datasets.versioned_timeseries_dataset import VersionedTimeSer
|
|
|
26
28
|
|
|
27
29
|
__all__ = [
|
|
28
30
|
"EnergyComponentDataset",
|
|
31
|
+
"EnsembleForecastDataset",
|
|
29
32
|
"ForecastDataset",
|
|
30
33
|
"ForecastInputDataset",
|
|
31
34
|
"TimeSeriesDataset",
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <
|
|
1
|
+
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: MPL-2.0
|
|
4
4
|
|
|
@@ -9,6 +9,7 @@ datasets in OpenSTEF. Protocols enable type checking and documentation of expect
|
|
|
9
9
|
behavior without requiring inheritance.
|
|
10
10
|
|
|
11
11
|
Key protocols:
|
|
12
|
+
|
|
12
13
|
- TimeSeries: Core interface for all time series datasets with filtering and versioning
|
|
13
14
|
- DatasetMixin: Interface for dataset persistence operations
|
|
14
15
|
"""
|
|
@@ -36,6 +37,7 @@ class TimeSeriesMixin(Protocol):
|
|
|
36
37
|
the dataset's temporal index, and filtering/versioning capabilities.
|
|
37
38
|
|
|
38
39
|
Classes implementing this interface must provide:
|
|
40
|
+
|
|
39
41
|
- Access to the datetime index
|
|
40
42
|
- Sample interval information
|
|
41
43
|
- Feature names list
|
|
@@ -158,6 +160,7 @@ class DatasetMixin(Protocol):
|
|
|
158
160
|
and reconstructed exactly as they were saved.
|
|
159
161
|
|
|
160
162
|
Classes implementing this mixin must:
|
|
163
|
+
|
|
161
164
|
- Save all data and metadata necessary for complete reconstruction
|
|
162
165
|
- Store metadata in parquet file attributes using attrs
|
|
163
166
|
- Handle missing metadata gracefully with sensible defaults when loading
|
{openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/datasets/timeseries_dataset.py
RENAMED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <
|
|
1
|
+
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: MPL-2.0
|
|
4
4
|
|
|
@@ -42,11 +42,18 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
|
|
|
42
42
|
over time through either a horizon column or an available_at column.
|
|
43
43
|
|
|
44
44
|
The dataset automatically detects versioning:
|
|
45
|
+
|
|
45
46
|
- If a horizon column exists, data is versioned by forecast horizon
|
|
46
47
|
- If an available_at column exists, data is versioned by availability time
|
|
47
48
|
- Otherwise, data is treated as a regular time series
|
|
48
49
|
|
|
50
|
+
Columns whose names start with a double underscore (``__``) are treated as
|
|
51
|
+
internal/system columns: they are kept in ``data`` so transforms can pass
|
|
52
|
+
them along, but are excluded from ``feature_names`` so feature-aware
|
|
53
|
+
transforms ignore them.
|
|
54
|
+
|
|
49
55
|
The dataset guarantees:
|
|
56
|
+
|
|
50
57
|
- Data is sorted by timestamp in ascending order
|
|
51
58
|
- Consistent sampling interval across all data points
|
|
52
59
|
- DateTime index for temporal operations
|
|
@@ -57,7 +64,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
|
|
|
57
64
|
available_at_column: Name of the column storing availability times (if versioned).
|
|
58
65
|
|
|
59
66
|
Example:
|
|
60
|
-
Create a simple time series dataset
|
|
67
|
+
Create a simple time series dataset
|
|
61
68
|
|
|
62
69
|
>>> import pandas as pd
|
|
63
70
|
>>> from datetime import timedelta
|
|
@@ -71,7 +78,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
|
|
|
71
78
|
>>> dataset.is_versioned
|
|
72
79
|
False
|
|
73
80
|
|
|
74
|
-
Create a versioned dataset with horizons
|
|
81
|
+
Create a versioned dataset with horizons
|
|
75
82
|
|
|
76
83
|
>>> data_with_horizon = pd.DataFrame({
|
|
77
84
|
... 'load': [100, 120],
|
|
@@ -104,10 +111,12 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
|
|
|
104
111
|
horizon_column: str = "horizon",
|
|
105
112
|
available_at_column: str = "available_at",
|
|
106
113
|
is_sorted: bool = False,
|
|
114
|
+
check_frequency: bool = False,
|
|
107
115
|
) -> None:
|
|
108
116
|
"""Initialize a time series dataset.
|
|
109
117
|
|
|
110
118
|
The dataset automatically detects whether it's versioned based on column presence:
|
|
119
|
+
|
|
111
120
|
- If horizon_column exists: versioned by forecast horizon
|
|
112
121
|
- If available_at_column exists: versioned by availability time
|
|
113
122
|
- Otherwise: regular time series
|
|
@@ -118,10 +127,12 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
|
|
|
118
127
|
horizon_column: Name of the column storing forecast horizons.
|
|
119
128
|
available_at_column: Name of the column storing availability times.
|
|
120
129
|
is_sorted: Whether the data is sorted by timestamp.
|
|
130
|
+
check_frequency: Whether to check that the data frequency matches sample_interval.
|
|
121
131
|
|
|
122
132
|
Raises:
|
|
123
133
|
TypeError: If data index is not a pandas DatetimeIndex or if versioning
|
|
124
134
|
columns have incorrect types.
|
|
135
|
+
ValueError: If data frequency does not match sample_interval.
|
|
125
136
|
"""
|
|
126
137
|
if not isinstance(data.index, pd.DatetimeIndex):
|
|
127
138
|
raise TypeError("Data index must be a pandas DatetimeIndex.")
|
|
@@ -130,9 +141,15 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
|
|
|
130
141
|
self.horizon_column = horizon_column
|
|
131
142
|
self.available_at_column = available_at_column
|
|
132
143
|
self._sample_interval = sample_interval
|
|
133
|
-
self._internal_columns =
|
|
144
|
+
self._internal_columns = {col for col in data.columns if col.startswith("__")}
|
|
134
145
|
data.index.name = self.index_name
|
|
135
146
|
|
|
147
|
+
# Check input data frequency matches sample_interval, only if there are enough data points to infer frequency
|
|
148
|
+
minimum_required_length = 2
|
|
149
|
+
if check_frequency and len(data) >= minimum_required_length and not self.frequency_matches(data.index):
|
|
150
|
+
msg = f"Data frequency does not match the sample_interval ({sample_interval})."
|
|
151
|
+
raise ValueError(msg)
|
|
152
|
+
|
|
136
153
|
if self.horizon_column in data.columns:
|
|
137
154
|
validate_timedelta_column(data[self.horizon_column])
|
|
138
155
|
self._version_column = self.horizon_column
|
|
@@ -147,7 +164,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
|
|
|
147
164
|
self._horizons = None
|
|
148
165
|
else:
|
|
149
166
|
self._version_column = None
|
|
150
|
-
self._feature_names = data.columns.
|
|
167
|
+
self._feature_names = [col for col in data.columns if col not in self._internal_columns]
|
|
151
168
|
self._horizons = None
|
|
152
169
|
|
|
153
170
|
# Ensure invariants: data is sorted by timestamp
|
|
@@ -257,7 +274,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
|
|
|
257
274
|
if available_at_series is None:
|
|
258
275
|
return self
|
|
259
276
|
|
|
260
|
-
cutoff = self.index
|
|
277
|
+
cutoff = available_at.apply_index(self.index)
|
|
261
278
|
data_filtered = self.data[available_at_series <= cutoff]
|
|
262
279
|
return self._copy_with_data(data=data_filtered)
|
|
263
280
|
|
|
@@ -443,6 +460,44 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
|
|
|
443
460
|
is_sorted=is_sorted,
|
|
444
461
|
)
|
|
445
462
|
|
|
463
|
+
@staticmethod
|
|
464
|
+
def _infer_frequency(index: pd.DatetimeIndex) -> pd.Timedelta:
|
|
465
|
+
"""Infer the frequency of a pandas DatetimeIndex if the freq attribute is not set.
|
|
466
|
+
|
|
467
|
+
This method calculates the most common time difference between consecutive timestamps,
|
|
468
|
+
which is more permissive of missing chunks of data than the pandas infer_freq method.
|
|
469
|
+
|
|
470
|
+
Args:
|
|
471
|
+
index (pd.DatetimeIndex): The datetime index to infer the frequency from.
|
|
472
|
+
|
|
473
|
+
Returns:
|
|
474
|
+
pd.Timedelta: The inferred frequency as a pandas Timedelta.
|
|
475
|
+
|
|
476
|
+
Raises:
|
|
477
|
+
ValueError: If the index has fewer than 2 timestamps.
|
|
478
|
+
"""
|
|
479
|
+
minimum_required_length = 2
|
|
480
|
+
if len(index) < minimum_required_length:
|
|
481
|
+
raise ValueError("Cannot infer frequency from an index with fewer than 2 timestamps.")
|
|
482
|
+
|
|
483
|
+
# Calculate the differences between consecutive timestamps
|
|
484
|
+
deltas = index.to_series().drop_duplicates().sort_values().diff().dropna()
|
|
485
|
+
|
|
486
|
+
# Find the most common difference
|
|
487
|
+
return deltas.mode().iloc[0]
|
|
488
|
+
|
|
489
|
+
def frequency_matches(self, index: pd.DatetimeIndex) -> bool:
|
|
490
|
+
"""Check if the frequency of the data matches the model frequency.
|
|
491
|
+
|
|
492
|
+
Args:
|
|
493
|
+
index (pd.DatetimeIndex): The data to check.
|
|
494
|
+
|
|
495
|
+
Returns:
|
|
496
|
+
bool: True if the frequencies match, False otherwise.
|
|
497
|
+
"""
|
|
498
|
+
input_sample_interval = self._infer_frequency(index) if index.freq is None else index.freq
|
|
499
|
+
return input_sample_interval == self.sample_interval
|
|
500
|
+
|
|
446
501
|
|
|
447
502
|
def validate_horizons_present(dataset: TimeSeriesDataset, horizons: list[LeadTime]) -> None:
|
|
448
503
|
"""Validate that the specified forecast horizons are present in the dataset.
|
{openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/datasets/validated_datasets.py
RENAMED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <
|
|
1
|
+
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: MPL-2.0
|
|
4
4
|
|
|
@@ -12,6 +12,7 @@ validation to catch data quality issues early.
|
|
|
12
12
|
from datetime import datetime, timedelta
|
|
13
13
|
from typing import Self, override
|
|
14
14
|
|
|
15
|
+
import numpy as np
|
|
15
16
|
import pandas as pd
|
|
16
17
|
|
|
17
18
|
from openstef_core.datasets.timeseries_dataset import TimeSeriesDataset
|
|
@@ -19,6 +20,8 @@ from openstef_core.datasets.validation import validate_required_columns
|
|
|
19
20
|
from openstef_core.exceptions import MissingColumnsError
|
|
20
21
|
from openstef_core.types import EnergyComponentType, LeadTime, Quantile
|
|
21
22
|
|
|
23
|
+
ENSEMBLE_COLUMN_SEP: str = "__"
|
|
24
|
+
|
|
22
25
|
|
|
23
26
|
class ForecastInputDataset(TimeSeriesDataset):
|
|
24
27
|
"""Time series dataset for forecasting with validated target column.
|
|
@@ -78,6 +81,7 @@ class ForecastInputDataset(TimeSeriesDataset):
|
|
|
78
81
|
*,
|
|
79
82
|
horizon_column: str = "horizon",
|
|
80
83
|
available_at_column: str = "available_at",
|
|
84
|
+
check_frequency: bool = False,
|
|
81
85
|
sample_weight_column: str = "sample_weight",
|
|
82
86
|
target_column: str = "load",
|
|
83
87
|
) -> None:
|
|
@@ -95,6 +99,7 @@ class ForecastInputDataset(TimeSeriesDataset):
|
|
|
95
99
|
sample_interval=sample_interval,
|
|
96
100
|
horizon_column=horizon_column,
|
|
97
101
|
available_at_column=available_at_column,
|
|
102
|
+
check_frequency=check_frequency,
|
|
98
103
|
)
|
|
99
104
|
self._internal_columns.add(self.sample_weight_column)
|
|
100
105
|
self._feature_names = [col for col in self.data.columns if col not in self._internal_columns]
|
|
@@ -250,12 +255,14 @@ class ForecastDataset(TimeSeriesDataset):
|
|
|
250
255
|
*,
|
|
251
256
|
horizon_column: str = "horizon",
|
|
252
257
|
available_at_column: str = "available_at",
|
|
258
|
+
standard_deviation_column: str = "stdev",
|
|
253
259
|
) -> None:
|
|
254
260
|
if "forecast_start" in data.attrs:
|
|
255
261
|
self.forecast_start = datetime.fromisoformat(data.attrs["forecast_start"])
|
|
256
262
|
else:
|
|
257
263
|
self.forecast_start = forecast_start if forecast_start is not None else data.index.min().to_pydatetime()
|
|
258
264
|
self.target_column = data.attrs.get("target_column", target_column)
|
|
265
|
+
self.standard_deviation_column = data.attrs.get("standard_deviation_column", standard_deviation_column)
|
|
259
266
|
|
|
260
267
|
super().__init__(
|
|
261
268
|
data=data,
|
|
@@ -264,12 +271,42 @@ class ForecastDataset(TimeSeriesDataset):
|
|
|
264
271
|
available_at_column=available_at_column,
|
|
265
272
|
)
|
|
266
273
|
|
|
267
|
-
|
|
274
|
+
exclude_columns = {target_column, standard_deviation_column}
|
|
275
|
+
quantile_feature_names = [col for col in self.feature_names if col not in exclude_columns]
|
|
268
276
|
if not all(Quantile.is_valid_quantile_string(col) for col in quantile_feature_names):
|
|
269
277
|
raise ValueError("All feature names must be valid quantile strings.")
|
|
270
278
|
|
|
271
279
|
self.quantiles = [Quantile.parse(col) for col in quantile_feature_names]
|
|
272
280
|
|
|
281
|
+
@classmethod
|
|
282
|
+
def from_quantile_predictions(
|
|
283
|
+
cls,
|
|
284
|
+
predictions: np.ndarray,
|
|
285
|
+
index: pd.Index,
|
|
286
|
+
quantiles: list[Quantile],
|
|
287
|
+
sample_interval: timedelta,
|
|
288
|
+
*,
|
|
289
|
+
target_column: str = "load",
|
|
290
|
+
) -> "ForecastDataset":
|
|
291
|
+
"""Build a ``ForecastDataset`` from a raw predictions array.
|
|
292
|
+
|
|
293
|
+
Args:
|
|
294
|
+
predictions: Shape ``(n_samples, n_quantiles)``.
|
|
295
|
+
index: Time index for the predictions.
|
|
296
|
+
quantiles: Quantiles the model was trained on, in the same order as columns in *predictions*.
|
|
297
|
+
sample_interval: Temporal resolution of the dataset.
|
|
298
|
+
target_column: Name of the target column to attach to the dataset.
|
|
299
|
+
|
|
300
|
+
Returns:
|
|
301
|
+
``ForecastDataset`` with quantile columns and the provided time index.
|
|
302
|
+
"""
|
|
303
|
+
df = pd.DataFrame(
|
|
304
|
+
data=predictions,
|
|
305
|
+
index=index,
|
|
306
|
+
columns=[q.format() for q in quantiles],
|
|
307
|
+
)
|
|
308
|
+
return cls(data=df, sample_interval=sample_interval, target_column=target_column)
|
|
309
|
+
|
|
273
310
|
@property
|
|
274
311
|
def target_series(self) -> pd.Series | None:
|
|
275
312
|
"""Extract the target time series from the dataset.
|
|
@@ -296,6 +333,20 @@ class ForecastDataset(TimeSeriesDataset):
|
|
|
296
333
|
raise MissingColumnsError(missing_columns=[median_col])
|
|
297
334
|
return self.data[median_col]
|
|
298
335
|
|
|
336
|
+
@property
|
|
337
|
+
def standard_deviation_series(self) -> pd.Series:
|
|
338
|
+
"""Extract the standard deviation series if it exists.
|
|
339
|
+
|
|
340
|
+
Returns:
|
|
341
|
+
Time series containing standard deviation values with original datetime index.
|
|
342
|
+
|
|
343
|
+
Raises:
|
|
344
|
+
MissingColumnsError: If the standard deviation column is not found.
|
|
345
|
+
"""
|
|
346
|
+
if self.standard_deviation_column not in self.data.columns:
|
|
347
|
+
raise MissingColumnsError(missing_columns=[self.standard_deviation_column])
|
|
348
|
+
return self.data[self.standard_deviation_column] # pyright: ignore[reportUnknownVariableType]
|
|
349
|
+
|
|
299
350
|
@property
|
|
300
351
|
def quantiles_data(self) -> pd.DataFrame:
|
|
301
352
|
"""Extract DataFrame containing only the quantile forecast columns.
|
|
@@ -331,6 +382,7 @@ class ForecastDataset(TimeSeriesDataset):
|
|
|
331
382
|
df = super().to_pandas()
|
|
332
383
|
df.attrs["target_column"] = self.target_column
|
|
333
384
|
df.attrs["forecast_start"] = self.forecast_start.isoformat()
|
|
385
|
+
df.attrs["standard_deviation_column"] = self.standard_deviation_column
|
|
334
386
|
return df
|
|
335
387
|
|
|
336
388
|
@classmethod
|
|
@@ -409,8 +461,156 @@ class EnergyComponentDataset(TimeSeriesDataset):
|
|
|
409
461
|
)
|
|
410
462
|
|
|
411
463
|
|
|
464
|
+
class EnsembleForecastDataset(TimeSeriesDataset):
|
|
465
|
+
"""First stage output format for ensemble forecasters."""
|
|
466
|
+
|
|
467
|
+
forecast_start: datetime
|
|
468
|
+
quantiles: list[Quantile]
|
|
469
|
+
forecaster_names: list[str]
|
|
470
|
+
target_column: str
|
|
471
|
+
|
|
472
|
+
@override
|
|
473
|
+
def __init__(
|
|
474
|
+
self,
|
|
475
|
+
data: pd.DataFrame,
|
|
476
|
+
sample_interval: timedelta = timedelta(minutes=15),
|
|
477
|
+
forecast_start: datetime | None = None,
|
|
478
|
+
target_column: str = "load",
|
|
479
|
+
*,
|
|
480
|
+
horizon_column: str = "horizon",
|
|
481
|
+
available_at_column: str = "available_at",
|
|
482
|
+
) -> None:
|
|
483
|
+
if "forecast_start" in data.attrs:
|
|
484
|
+
self.forecast_start = datetime.fromisoformat(data.attrs["forecast_start"])
|
|
485
|
+
else:
|
|
486
|
+
self.forecast_start = forecast_start if forecast_start is not None else data.index.min().to_pydatetime()
|
|
487
|
+
self.target_column = data.attrs.get("target_column", target_column)
|
|
488
|
+
|
|
489
|
+
super().__init__(
|
|
490
|
+
data=data,
|
|
491
|
+
sample_interval=sample_interval,
|
|
492
|
+
horizon_column=horizon_column,
|
|
493
|
+
available_at_column=available_at_column,
|
|
494
|
+
)
|
|
495
|
+
quantile_feature_names = [col for col in self.feature_names if col != target_column]
|
|
496
|
+
|
|
497
|
+
self.forecaster_names, self.quantiles = self.get_learner_and_quantile(pd.Index(quantile_feature_names))
|
|
498
|
+
for name in self.forecaster_names:
|
|
499
|
+
if ENSEMBLE_COLUMN_SEP in name:
|
|
500
|
+
msg = f"Forecaster name '{name}' must not contain separator '{ENSEMBLE_COLUMN_SEP}'."
|
|
501
|
+
raise ValueError(msg)
|
|
502
|
+
n_cols = len(self.forecaster_names) * len(self.quantiles)
|
|
503
|
+
if len(data.columns) not in {n_cols + 1, n_cols}:
|
|
504
|
+
raise ValueError("Data columns do not match the expected number based on base forecasters and quantiles.")
|
|
505
|
+
|
|
506
|
+
@property
|
|
507
|
+
def target_series(self) -> pd.Series | None:
|
|
508
|
+
"""Return the target series if available."""
|
|
509
|
+
if self.target_column in self.data.columns:
|
|
510
|
+
return self.data[self.target_column]
|
|
511
|
+
return None
|
|
512
|
+
|
|
513
|
+
@staticmethod
def get_learner_and_quantile(feature_names: pd.Index) -> tuple[list[str], list[Quantile]]:
    """Extract base forecaster names and quantiles from feature names.

    Column format is ``{learner}{ENSEMBLE_COLUMN_SEP}{quantile.format()}``,
    e.g. ``lgbm__quantile_P50``. The name is split at the first occurrence of the separator.

    Args:
        feature_names: Index of feature names in the dataset.

    Returns:
        Tuple containing a list of unique base forecaster names and a list of unique quantiles,
        each in order of first appearance in ``feature_names``.

    Raises:
        ValueError: If a column cannot be parsed or has an invalid quantile string.
    """
    # Dict keys de-duplicate while preserving insertion order. Iterating a set of strings
    # instead would yield an order that depends on hash randomization and can differ per process.
    forecasters: dict[str, None] = {}
    quantiles: dict[Quantile, None] = {}

    for feature_name in feature_names:
        learner_part, separator, quantile_part = feature_name.partition(ENSEMBLE_COLUMN_SEP)
        if not separator:
            msg = f"Column missing separator '{ENSEMBLE_COLUMN_SEP}': {feature_name}"
            raise ValueError(msg)
        if not Quantile.is_valid_quantile_string(quantile_part):
            msg = f"Column has no valid quantile string: {feature_name}"
            raise ValueError(msg)

        forecasters[learner_part] = None
        quantiles[Quantile.parse(quantile_part)] = None

    return list(forecasters), list(quantiles)
|
|
546
|
+
|
|
547
|
+
@classmethod
def from_forecast_datasets(
    cls,
    datasets: dict[str, ForecastDataset],
    target_series: pd.Series | None = None,
    sample_weights: pd.Series | None = None,
) -> Self:
    """Create an EnsembleForecastDataset from multiple ForecastDatasets.

    The first dataset in ``datasets`` supplies the sample interval, forecast start and target
    column name. Each quantile column of each dataset is copied into a column named
    ``{learner}{ENSEMBLE_COLUMN_SEP}{quantile.format()}``, where ``learner`` is the dict key.

    Args:
        datasets: Dict of ForecastDatasets to combine, keyed by forecaster name. Must not be empty.
        target_series: Optional target series to include in the dataset. Only used when the
            first dataset does not provide a target series itself.
        sample_weights: Optional sample weights series to include in the dataset.

    Returns:
        EnsembleForecastDataset combining all input datasets.

    Raises:
        ValueError: If ``datasets`` is empty.
    """
    if not datasets:
        # Without this guard ``next(iter(...))`` would leak a bare StopIteration to the caller.
        msg = "At least one ForecastDataset is required to create an EnsembleForecastDataset."
        raise ValueError(msg)

    reference = next(iter(datasets.values()))

    additional_columns: dict[str, pd.Series] = {}
    # A target carried by the first dataset takes precedence over the explicit argument.
    if isinstance(reference.target_series, pd.Series):
        additional_columns[reference.target_column] = reference.target_series
    elif target_series is not None:
        additional_columns[reference.target_column] = target_series

    # NOTE(review): the constructor's column-count check appears to tolerate only one column
    # besides the forecaster/quantile columns -- confirm that supplying both a target and
    # sample weights is supported.
    sample_weight_column = "sample_weight"
    if sample_weights is not None:
        additional_columns[sample_weight_column] = sample_weights

    combined_data = pd.DataFrame({
        f"{learner}{ENSEMBLE_COLUMN_SEP}{q.format()}": ds.data[q.format()]
        for learner, ds in datasets.items()
        for q in ds.quantiles
    }).assign(**additional_columns)

    return cls(
        data=combined_data,
        sample_interval=reference.sample_interval,
        forecast_start=reference.forecast_start,
        target_column=reference.target_column,
    )
|
|
587
|
+
|
|
588
|
+
def get_base_predictions_for_quantile(self, quantile: Quantile) -> ForecastInputDataset:
    """Get base forecaster predictions for a specific quantile.

    The selected columns are renamed from ``{learner}{ENSEMBLE_COLUMN_SEP}{quantile.format()}``
    to the plain forecaster name. The target column is carried over when the dataset has one.

    Args:
        quantile: Quantile to select.

    Returns:
        ForecastInputDataset containing predictions from all base forecasters at the specified quantile.
    """
    quantile_label = quantile.format()
    source_columns = [f"{learner}{ENSEMBLE_COLUMN_SEP}{quantile_label}" for learner in self.forecaster_names]
    renamed_columns = list(self.forecaster_names)

    # The target column is optional for this dataset (``target_series`` may be None), so only
    # select it when present instead of failing with a KeyError.
    if self.target_column in self.data.columns:
        source_columns.append(self.target_column)
        renamed_columns.append(self.target_column)

    prediction_data = self.data[source_columns].copy()
    prediction_data.columns = renamed_columns

    return ForecastInputDataset(
        data=prediction_data,
        sample_interval=self.sample_interval,
        target_column=self.target_column,
        forecast_start=self.forecast_start,
    )
|
|
608
|
+
|
|
609
|
+
|
|
412
610
|
# Names exported by ``from <this module> import *``.
__all__ = [
    "ENSEMBLE_COLUMN_SEP",
    "EnergyComponentDataset",
    "EnsembleForecastDataset",
    "ForecastDataset",
    "ForecastInputDataset",
]
|