openstef-core 4.0.0.dev1__tar.gz → 4.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/.gitignore +26 -6
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/PKG-INFO +6 -4
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/README.md +2 -2
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/pyproject.toml +9 -9
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/__init__.py +1 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/base_model.py +29 -4
- openstef_core-4.1.0/src/openstef_core/constants.py +10 -0
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/datasets/__init__.py +4 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/datasets/mixins.py +4 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/datasets/timeseries_dataset.py +67 -12
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/datasets/validated_datasets.py +204 -2
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/datasets/validation.py +2 -4
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/datasets/versioned_timeseries_dataset.py +6 -9
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/exceptions.py +31 -8
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/mixins/__init__.py +1 -1
- openstef_core-4.1.0/src/openstef_core/mixins/param_ranges.py +131 -0
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/mixins/predictor.py +111 -12
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/mixins/stateful.py +5 -4
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/mixins/transform.py +4 -3
- openstef_core-4.1.0/src/openstef_core/testing.py +306 -0
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/transforms/__init__.py +1 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/transforms/dataset_transforms.py +2 -2
- openstef_core-4.1.0/src/openstef_core/types.py +455 -0
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/utils/__init__.py +7 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/utils/datetime.py +1 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/utils/invariants.py +1 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/utils/itertools.py +1 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/utils/multiprocessing.py +3 -5
- openstef_core-4.1.0/src/openstef_core/utils/numpy.py +105 -0
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/utils/pandas.py +37 -2
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/utils/pydantic.py +3 -3
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/__init__.py +1 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/datasets/test_mixins.py +15 -11
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/datasets/test_timeseries_dataset.py +67 -29
- openstef_core-4.1.0/tests/unit/datasets/test_validated_datasets.py +382 -0
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/datasets/test_validation.py +1 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/datasets/test_versioned_timeseries_dataset.py +138 -68
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/datasets/utils.py +1 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/mixins/test_stateful.py +2 -2
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/mixins/test_transform.py +2 -2
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/test_base_model.py +1 -1
- openstef_core-4.1.0/tests/unit/test_hyperparams_tuning.py +115 -0
- openstef_core-4.1.0/tests/unit/test_param_ranges.py +112 -0
- openstef_core-4.1.0/tests/unit/test_types.py +466 -0
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/utils/test_datetime.py +2 -2
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/utils/test_itertools.py +1 -1
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/utils/test_multiprocessing.py +1 -1
- openstef_core-4.1.0/tests/unit/utils/test_numpy.py +179 -0
- openstef_core-4.1.0/tests/unit/utils/test_pandas.py +56 -0
- openstef_core-4.0.0.dev1/src/openstef_core/testing.py +0 -131
- openstef_core-4.0.0.dev1/src/openstef_core/types.py +0 -302
- openstef_core-4.0.0.dev1/tests/unit/test_types.py +0 -82
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/__init__.py +0 -0
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/datasets/__init__.py +0 -0
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/mixins/__init__.py +0 -0
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/transforms/__init__.py +0 -0
- {openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/tests/unit/utils/__init__.py +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: 2017-2025 Contributors to the OpenSTEF project <
|
|
1
|
+
# SPDX-FileCopyrightText: 2017-2025 Contributors to the OpenSTEF project <openstef@lfenergy.org> # noqa E501>
|
|
2
2
|
# SPDX-License-Identifier: MPL-2.0
|
|
3
3
|
|
|
4
4
|
# Core
|
|
@@ -35,10 +35,6 @@ MANIFEST
|
|
|
35
35
|
# Ruff
|
|
36
36
|
.ruff_cache/
|
|
37
37
|
|
|
38
|
-
# Pyright
|
|
39
|
-
.pyright/
|
|
40
|
-
# pyright-report/
|
|
41
|
-
|
|
42
38
|
# Test, coverage, tox
|
|
43
39
|
.pytest_cache/
|
|
44
40
|
.coverage
|
|
@@ -67,6 +63,14 @@ dmypy.json
|
|
|
67
63
|
# Sphinx
|
|
68
64
|
docs/_build/
|
|
69
65
|
docs/source/api/generated/
|
|
66
|
+
docs/source/tutorials/
|
|
67
|
+
docs/source/benchmarks/
|
|
68
|
+
# Community health files materialized from OpenSTEF/.github at build time
|
|
69
|
+
docs/source/contribute/_community/
|
|
70
|
+
docs/source/user_guide/**/quick_start_tutorial.py
|
|
71
|
+
docs/source/user_guide/**/feature_engineering_tutorial.py
|
|
72
|
+
docs/source/user_guide/**/datasets_tutorial.py
|
|
73
|
+
docs/source/user_guide/**/backtesting_tutorial.py
|
|
70
74
|
|
|
71
75
|
# docs/_doctrees/
|
|
72
76
|
# docs/_static_gen/
|
|
@@ -124,4 +128,20 @@ certificates/
|
|
|
124
128
|
*.pkl
|
|
125
129
|
|
|
126
130
|
# Benchmark outputs
|
|
127
|
-
benchmark_results*/
|
|
131
|
+
benchmark_results*/
|
|
132
|
+
|
|
133
|
+
# Local dataset files
|
|
134
|
+
liander_dataset/
|
|
135
|
+
|
|
136
|
+
# Deployment example run artifacts (MLflow store, forecasts, dataset, Celery/Airflow state)
|
|
137
|
+
openstef_deployment_runs/
|
|
138
|
+
|
|
139
|
+
# Mlflow
|
|
140
|
+
/mlflow
|
|
141
|
+
/mlflow_artifacts_local
|
|
142
|
+
|
|
143
|
+
.github/instructions
|
|
144
|
+
|
|
145
|
+
# Jupyter notebook cache (myst-nb execution outputs)
|
|
146
|
+
.jupyter_cache/
|
|
147
|
+
docs/build.zip
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: openstef-core
|
|
3
|
-
Version: 4.0.0.dev1
|
|
3
|
+
Version: 4.1.0
|
|
4
4
|
Summary: Core functionality for OpenSTEF, a framework for short-term energy forecasting.
|
|
5
5
|
Project-URL: Documentation, https://openstef.github.io/openstef/index.html
|
|
6
6
|
Project-URL: Homepage, https://lfenergy.org/projects/openstef/
|
|
7
7
|
Project-URL: Issues, https://github.com/OpenSTEF/openstef/issues
|
|
8
8
|
Project-URL: Repository, https://github.com/OpenSTEF/openstef
|
|
9
|
-
Author-email: "Alliander N.V" <
|
|
9
|
+
Author-email: "Alliander N.V" <openstef@lfenergy.org>
|
|
10
10
|
License-Expression: MPL-2.0
|
|
11
11
|
Keywords: energy,forecasting,machinelearning
|
|
12
12
|
Classifier: Development Status :: 5 - Production/Stable
|
|
@@ -22,12 +22,14 @@ Requires-Dist: pandas<3,>=2.3.1
|
|
|
22
22
|
Requires-Dist: pyarrow>=21
|
|
23
23
|
Requires-Dist: pydantic-extra-types<3,>=2.10.5
|
|
24
24
|
Requires-Dist: pydantic<3,>=2.12.4
|
|
25
|
+
Provides-Extra: benchmark
|
|
26
|
+
Requires-Dist: huggingface-hub>=1.2.2; extra == 'benchmark'
|
|
25
27
|
Description-Content-Type: text/markdown
|
|
26
28
|
|
|
27
29
|
<!--
|
|
28
|
-
SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <
|
|
30
|
+
SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
|
|
29
31
|
|
|
30
32
|
SPDX-License-Identifier: MPL-2.0
|
|
31
33
|
-->
|
|
32
34
|
|
|
33
|
-
# openstef-core
|
|
35
|
+
# openstef-core
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
<!--
|
|
2
|
-
SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <
|
|
2
|
+
SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
|
|
3
3
|
|
|
4
4
|
SPDX-License-Identifier: MPL-2.0
|
|
5
5
|
-->
|
|
6
6
|
|
|
7
|
-
# openstef-core
|
|
7
|
+
# openstef-core
|
|
@@ -1,21 +1,19 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <
|
|
1
|
+
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: MPL-2.0
|
|
4
|
-
|
|
5
4
|
[build-system]
|
|
6
5
|
build-backend = "hatchling.build"
|
|
7
|
-
|
|
8
6
|
requires = [ "hatchling" ]
|
|
9
7
|
|
|
10
8
|
[project]
|
|
11
9
|
name = "openstef-core"
|
|
12
|
-
version = "4.0.0.dev1"
|
|
10
|
+
version = "4.1.0"
|
|
13
11
|
description = "Core functionality for OpenSTEF, a framework for short-term energy forecasting."
|
|
14
12
|
readme = "README.md"
|
|
15
13
|
keywords = [ "energy", "forecasting", "machinelearning" ]
|
|
16
14
|
license = "MPL-2.0"
|
|
17
15
|
authors = [
|
|
18
|
-
{ name = "Alliander N.V", email = "
|
|
16
|
+
{ name = "Alliander N.V", email = "openstef@lfenergy.org" },
|
|
19
17
|
]
|
|
20
18
|
requires-python = ">=3.12,<4.0"
|
|
21
19
|
classifiers = [
|
|
@@ -26,20 +24,22 @@ classifiers = [
|
|
|
26
24
|
"Programming Language :: Python :: 3.13",
|
|
27
25
|
"Programming Language :: Python :: 3.14",
|
|
28
26
|
]
|
|
29
|
-
|
|
30
27
|
dependencies = [
|
|
31
28
|
"joblib>=1,<2",
|
|
32
29
|
"numpy>=2.3.2,<3",
|
|
30
|
+
# Held at <3 pending the pandas 3.0 Copy-on-Write migration (see migration issue).
|
|
33
31
|
"pandas>=2.3.1,<3",
|
|
34
32
|
"pyarrow>=21",
|
|
35
33
|
"pydantic>=2.12.4,<3",
|
|
36
34
|
"pydantic-extra-types>=2.10.5,<3",
|
|
37
35
|
]
|
|
38
|
-
|
|
36
|
+
optional-dependencies.benchmark = [
|
|
37
|
+
"huggingface-hub>=1.2.2",
|
|
38
|
+
]
|
|
39
39
|
urls.Documentation = "https://openstef.github.io/openstef/index.html"
|
|
40
40
|
urls.Homepage = "https://lfenergy.org/projects/openstef/"
|
|
41
41
|
urls.Issues = "https://github.com/OpenSTEF/openstef/issues"
|
|
42
42
|
urls.Repository = "https://github.com/OpenSTEF/openstef"
|
|
43
43
|
|
|
44
|
-
[tool.hatch.build.targets.wheel]
|
|
45
|
-
packages = [ "src/openstef_core" ]
|
|
44
|
+
[tool.hatch]
|
|
45
|
+
build.targets.wheel.packages = [ "src/openstef_core" ]
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: 2017-2025 Contributors to the OpenSTEF project <
|
|
1
|
+
# SPDX-FileCopyrightText: 2017-2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: MPL-2.0
|
|
4
4
|
"""Core functionality for OpenSTEF, a framework for short-term energy forecasting."""
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <
|
|
1
|
+
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: MPL-2.0
|
|
4
4
|
|
|
@@ -11,11 +11,18 @@ operate on arbitrary config instances or Pydantic models / adapters.
|
|
|
11
11
|
"""
|
|
12
12
|
|
|
13
13
|
from pathlib import Path
|
|
14
|
-
from typing import Annotated, Any, Self
|
|
14
|
+
from typing import Annotated, Any, Self, override
|
|
15
15
|
|
|
16
16
|
import yaml
|
|
17
17
|
from pydantic import BaseModel as PydanticBaseModel
|
|
18
|
-
from pydantic import
|
|
18
|
+
from pydantic import (
|
|
19
|
+
BeforeValidator,
|
|
20
|
+
ConfigDict,
|
|
21
|
+
GetCoreSchemaHandler,
|
|
22
|
+
GetJsonSchemaHandler,
|
|
23
|
+
TypeAdapter,
|
|
24
|
+
ValidationInfo,
|
|
25
|
+
)
|
|
19
26
|
from pydantic_core import core_schema
|
|
20
27
|
|
|
21
28
|
|
|
@@ -105,6 +112,7 @@ def read_yaml_config[T: BaseConfig, U](path: Path, class_type: type[T] | TypeAda
|
|
|
105
112
|
class PydanticStringPrimitive:
|
|
106
113
|
"""Base class for Pydantic-compatible types with string serialization."""
|
|
107
114
|
|
|
115
|
+
@override
|
|
108
116
|
def __str__(self) -> str:
|
|
109
117
|
"""Convert to string representation."""
|
|
110
118
|
raise NotImplementedError("Subclasses must implement __str__")
|
|
@@ -115,7 +123,7 @@ class PydanticStringPrimitive:
|
|
|
115
123
|
raise NotImplementedError("Subclasses must implement from_string")
|
|
116
124
|
|
|
117
125
|
@classmethod
|
|
118
|
-
def validate(cls, v: Any, _info:
|
|
126
|
+
def validate(cls, v: Any, _info: ValidationInfo | None = None) -> Self: # noqa: ANN401
|
|
119
127
|
"""Validate and convert input to this type.
|
|
120
128
|
|
|
121
129
|
Args:
|
|
@@ -150,6 +158,22 @@ class PydanticStringPrimitive:
|
|
|
150
158
|
function=cls.validate, serialization=core_schema.plain_serializer_function_ser_schema(cls.__str__)
|
|
151
159
|
)
|
|
152
160
|
|
|
161
|
+
@classmethod
|
|
162
|
+
def __get_pydantic_json_schema__(
|
|
163
|
+
cls,
|
|
164
|
+
_schema: core_schema.CoreSchema,
|
|
165
|
+
handler: GetJsonSchemaHandler,
|
|
166
|
+
) -> dict[str, Any]:
|
|
167
|
+
"""Generate JSON schema for OpenAPI / FastAPI compatibility.
|
|
168
|
+
|
|
169
|
+
All string-primitive types serialise as plain strings.
|
|
170
|
+
|
|
171
|
+
Returns:
|
|
172
|
+
JSON schema describing the type as a string.
|
|
173
|
+
"""
|
|
174
|
+
return {"type": "string"}
|
|
175
|
+
|
|
176
|
+
@override
|
|
153
177
|
def __eq__(self, other: object) -> bool:
|
|
154
178
|
"""Check equality based on string representation.
|
|
155
179
|
|
|
@@ -160,6 +184,7 @@ class PydanticStringPrimitive:
|
|
|
160
184
|
return NotImplemented
|
|
161
185
|
return str(self) == str(other)
|
|
162
186
|
|
|
187
|
+
@override
|
|
163
188
|
def __hash__(self) -> int:
|
|
164
189
|
"""Return hash based on string representation."""
|
|
165
190
|
return hash(str(self))
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: MPL-2.0
|
|
4
|
+
|
|
5
|
+
"""Shared constants for the openstef_core package."""
|
|
6
|
+
|
|
7
|
+
LIANDER_DATASET_REPO_ID = "OpenSTEF/liander2024-energy-forecasting-benchmark"
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
__all__ = ["LIANDER_DATASET_REPO_ID"]
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <
|
|
1
|
+
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: MPL-2.0
|
|
4
4
|
|
|
@@ -9,6 +9,7 @@ It includes both simple time series datasets and versioned datasets that track d
|
|
|
9
9
|
over time, enabling realistic backtesting and training and forecasting.
|
|
10
10
|
|
|
11
11
|
The module supports:
|
|
12
|
+
|
|
12
13
|
- Regular time series with consistent sampling intervals
|
|
13
14
|
- Versioned time series that track when data became available
|
|
14
15
|
- Validated datasets with domain-specific constraints
|
|
@@ -19,6 +20,7 @@ The module supports:
|
|
|
19
20
|
from openstef_core.datasets.timeseries_dataset import TimeSeriesDataset, validate_horizons_present
|
|
20
21
|
from openstef_core.datasets.validated_datasets import (
|
|
21
22
|
EnergyComponentDataset,
|
|
23
|
+
EnsembleForecastDataset,
|
|
22
24
|
ForecastDataset,
|
|
23
25
|
ForecastInputDataset,
|
|
24
26
|
)
|
|
@@ -26,6 +28,7 @@ from openstef_core.datasets.versioned_timeseries_dataset import VersionedTimeSer
|
|
|
26
28
|
|
|
27
29
|
__all__ = [
|
|
28
30
|
"EnergyComponentDataset",
|
|
31
|
+
"EnsembleForecastDataset",
|
|
29
32
|
"ForecastDataset",
|
|
30
33
|
"ForecastInputDataset",
|
|
31
34
|
"TimeSeriesDataset",
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <
|
|
1
|
+
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: MPL-2.0
|
|
4
4
|
|
|
@@ -9,6 +9,7 @@ datasets in OpenSTEF. Protocols enable type checking and documentation of expect
|
|
|
9
9
|
behavior without requiring inheritance.
|
|
10
10
|
|
|
11
11
|
Key protocols:
|
|
12
|
+
|
|
12
13
|
- TimeSeries: Core interface for all time series datasets with filtering and versioning
|
|
13
14
|
- DatasetMixin: Interface for dataset persistence operations
|
|
14
15
|
"""
|
|
@@ -36,6 +37,7 @@ class TimeSeriesMixin(Protocol):
|
|
|
36
37
|
the dataset's temporal index, and filtering/versioning capabilities.
|
|
37
38
|
|
|
38
39
|
Classes implementing this interface must provide:
|
|
40
|
+
|
|
39
41
|
- Access to the datetime index
|
|
40
42
|
- Sample interval information
|
|
41
43
|
- Feature names list
|
|
@@ -158,6 +160,7 @@ class DatasetMixin(Protocol):
|
|
|
158
160
|
and reconstructed exactly as they were saved.
|
|
159
161
|
|
|
160
162
|
Classes implementing this mixin must:
|
|
163
|
+
|
|
161
164
|
- Save all data and metadata necessary for complete reconstruction
|
|
162
165
|
- Store metadata in parquet file attributes using attrs
|
|
163
166
|
- Handle missing metadata gracefully with sensible defaults when loading
|
{openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/datasets/timeseries_dataset.py
RENAMED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <
|
|
1
|
+
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: MPL-2.0
|
|
4
4
|
|
|
@@ -33,7 +33,7 @@ from openstef_core.utils.pandas import unsafe_sorted_range_slice_idxs
|
|
|
33
33
|
_logger = logging.getLogger(__name__)
|
|
34
34
|
|
|
35
35
|
|
|
36
|
-
class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin):
|
|
36
|
+
class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin):
|
|
37
37
|
"""A time series dataset with regular sampling intervals and optional versioning.
|
|
38
38
|
|
|
39
39
|
This class represents time series data with a consistent sampling interval
|
|
@@ -42,11 +42,18 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
|
|
|
42
42
|
over time through either a horizon column or an available_at column.
|
|
43
43
|
|
|
44
44
|
The dataset automatically detects versioning:
|
|
45
|
+
|
|
45
46
|
- If a horizon column exists, data is versioned by forecast horizon
|
|
46
47
|
- If an available_at column exists, data is versioned by availability time
|
|
47
48
|
- Otherwise, data is treated as a regular time series
|
|
48
49
|
|
|
50
|
+
Columns whose names start with a double underscore (``__``) are treated as
|
|
51
|
+
internal/system columns: they are kept in ``data`` so transforms can pass
|
|
52
|
+
them along, but are excluded from ``feature_names`` so feature-aware
|
|
53
|
+
transforms ignore them.
|
|
54
|
+
|
|
49
55
|
The dataset guarantees:
|
|
56
|
+
|
|
50
57
|
- Data is sorted by timestamp in ascending order
|
|
51
58
|
- Consistent sampling interval across all data points
|
|
52
59
|
- DateTime index for temporal operations
|
|
@@ -57,7 +64,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
|
|
|
57
64
|
available_at_column: Name of the column storing availability times (if versioned).
|
|
58
65
|
|
|
59
66
|
Example:
|
|
60
|
-
Create a simple time series dataset
|
|
67
|
+
Create a simple time series dataset
|
|
61
68
|
|
|
62
69
|
>>> import pandas as pd
|
|
63
70
|
>>> from datetime import timedelta
|
|
@@ -71,7 +78,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
|
|
|
71
78
|
>>> dataset.is_versioned
|
|
72
79
|
False
|
|
73
80
|
|
|
74
|
-
Create a versioned dataset with horizons
|
|
81
|
+
Create a versioned dataset with horizons
|
|
75
82
|
|
|
76
83
|
>>> data_with_horizon = pd.DataFrame({
|
|
77
84
|
... 'load': [100, 120],
|
|
@@ -104,10 +111,12 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
|
|
|
104
111
|
horizon_column: str = "horizon",
|
|
105
112
|
available_at_column: str = "available_at",
|
|
106
113
|
is_sorted: bool = False,
|
|
114
|
+
check_frequency: bool = False,
|
|
107
115
|
) -> None:
|
|
108
116
|
"""Initialize a time series dataset.
|
|
109
117
|
|
|
110
118
|
The dataset automatically detects whether it's versioned based on column presence:
|
|
119
|
+
|
|
111
120
|
- If horizon_column exists: versioned by forecast horizon
|
|
112
121
|
- If available_at_column exists: versioned by availability time
|
|
113
122
|
- Otherwise: regular time series
|
|
@@ -118,10 +127,12 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
|
|
|
118
127
|
horizon_column: Name of the column storing forecast horizons.
|
|
119
128
|
available_at_column: Name of the column storing availability times.
|
|
120
129
|
is_sorted: Whether the data is sorted by timestamp.
|
|
130
|
+
check_frequency: Whether to check that the data frequency matches sample_interval.
|
|
121
131
|
|
|
122
132
|
Raises:
|
|
123
133
|
TypeError: If data index is not a pandas DatetimeIndex or if versioning
|
|
124
134
|
columns have incorrect types.
|
|
135
|
+
ValueError: If data frequency does not match sample_interval.
|
|
125
136
|
"""
|
|
126
137
|
if not isinstance(data.index, pd.DatetimeIndex):
|
|
127
138
|
raise TypeError("Data index must be a pandas DatetimeIndex.")
|
|
@@ -130,9 +141,15 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
|
|
|
130
141
|
self.horizon_column = horizon_column
|
|
131
142
|
self.available_at_column = available_at_column
|
|
132
143
|
self._sample_interval = sample_interval
|
|
133
|
-
self._internal_columns =
|
|
144
|
+
self._internal_columns = {col for col in data.columns if col.startswith("__")}
|
|
134
145
|
data.index.name = self.index_name
|
|
135
146
|
|
|
147
|
+
# Check input data frequency matches sample_interval, only if there are enough data points to infer frequency
|
|
148
|
+
minimum_required_length = 2
|
|
149
|
+
if check_frequency and len(data) >= minimum_required_length and not self.frequency_matches(data.index):
|
|
150
|
+
msg = f"Data frequency does not match the sample_interval ({sample_interval})."
|
|
151
|
+
raise ValueError(msg)
|
|
152
|
+
|
|
136
153
|
if self.horizon_column in data.columns:
|
|
137
154
|
validate_timedelta_column(data[self.horizon_column])
|
|
138
155
|
self._version_column = self.horizon_column
|
|
@@ -147,7 +164,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
|
|
|
147
164
|
self._horizons = None
|
|
148
165
|
else:
|
|
149
166
|
self._version_column = None
|
|
150
|
-
self._feature_names = data.columns.
|
|
167
|
+
self._feature_names = [col for col in data.columns if col not in self._internal_columns]
|
|
151
168
|
self._horizons = None
|
|
152
169
|
|
|
153
170
|
# Ensure invariants: data is sorted by timestamp
|
|
@@ -257,7 +274,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
|
|
|
257
274
|
if available_at_series is None:
|
|
258
275
|
return self
|
|
259
276
|
|
|
260
|
-
cutoff = self.index
|
|
277
|
+
cutoff = available_at.apply_index(self.index)
|
|
261
278
|
data_filtered = self.data[available_at_series <= cutoff]
|
|
262
279
|
return self._copy_with_data(data=data_filtered)
|
|
263
280
|
|
|
@@ -287,7 +304,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
|
|
|
287
304
|
Returns:
|
|
288
305
|
New dataset containing only rows with timestamps in the mask.
|
|
289
306
|
"""
|
|
290
|
-
data_filtered = self.data.loc[self.index.isin(mask)]
|
|
307
|
+
data_filtered = self.data.loc[self.index.isin(mask)]
|
|
291
308
|
|
|
292
309
|
return self._copy_with_data(data=data_filtered)
|
|
293
310
|
|
|
@@ -303,7 +320,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
|
|
|
303
320
|
if self.horizons is None:
|
|
304
321
|
return self
|
|
305
322
|
|
|
306
|
-
data_selected = self.data[self.lead_time_series == horizon.value]
|
|
323
|
+
data_selected = cast(pd.DataFrame, self.data[self.lead_time_series == horizon.value])
|
|
307
324
|
return self._copy_with_data(data=data_selected)
|
|
308
325
|
|
|
309
326
|
def to_pandas(self) -> pd.DataFrame:
|
|
@@ -378,7 +395,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
|
|
|
378
395
|
available_at_column: str = "available_at",
|
|
379
396
|
horizon_column: str = "horizon",
|
|
380
397
|
) -> Self:
|
|
381
|
-
df = pd.read_parquet(path=path)
|
|
398
|
+
df = pd.read_parquet(path=path)
|
|
382
399
|
if not isinstance(df.index, pd.DatetimeIndex):
|
|
383
400
|
if timestamp_column not in df.columns:
|
|
384
401
|
raise TimeSeriesValidationError(
|
|
@@ -443,6 +460,44 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin): # noqa: PLR0904 - impor
|
|
|
443
460
|
is_sorted=is_sorted,
|
|
444
461
|
)
|
|
445
462
|
|
|
463
|
+
@staticmethod
|
|
464
|
+
def _infer_frequency(index: pd.DatetimeIndex) -> pd.Timedelta:
|
|
465
|
+
"""Infer the frequency of a pandas DatetimeIndex if the freq attribute is not set.
|
|
466
|
+
|
|
467
|
+
This method calculates the most common time difference between consecutive timestamps,
|
|
468
|
+
which is more permissive of missing chunks of data than the pandas infer_freq method.
|
|
469
|
+
|
|
470
|
+
Args:
|
|
471
|
+
index (pd.DatetimeIndex): The datetime index to infer the frequency from.
|
|
472
|
+
|
|
473
|
+
Returns:
|
|
474
|
+
pd.Timedelta: The inferred frequency as a pandas Timedelta.
|
|
475
|
+
|
|
476
|
+
Raises:
|
|
477
|
+
ValueError: If the index has fewer than 2 timestamps.
|
|
478
|
+
"""
|
|
479
|
+
minimum_required_length = 2
|
|
480
|
+
if len(index) < minimum_required_length:
|
|
481
|
+
raise ValueError("Cannot infer frequency from an index with fewer than 2 timestamps.")
|
|
482
|
+
|
|
483
|
+
# Calculate the differences between consecutive timestamps
|
|
484
|
+
deltas = index.to_series().drop_duplicates().sort_values().diff().dropna()
|
|
485
|
+
|
|
486
|
+
# Find the most common difference
|
|
487
|
+
return deltas.mode().iloc[0]
|
|
488
|
+
|
|
489
|
+
def frequency_matches(self, index: pd.DatetimeIndex) -> bool:
|
|
490
|
+
"""Check if the frequency of the data matches the model frequency.
|
|
491
|
+
|
|
492
|
+
Args:
|
|
493
|
+
index (pd.DatetimeIndex): The data to check.
|
|
494
|
+
|
|
495
|
+
Returns:
|
|
496
|
+
bool: True if the frequencies match, False otherwise.
|
|
497
|
+
"""
|
|
498
|
+
input_sample_interval = self._infer_frequency(index) if index.freq is None else index.freq
|
|
499
|
+
return input_sample_interval == self.sample_interval
|
|
500
|
+
|
|
446
501
|
|
|
447
502
|
def validate_horizons_present(dataset: TimeSeriesDataset, horizons: list[LeadTime]) -> None:
|
|
448
503
|
"""Validate that the specified forecast horizons are present in the dataset.
|
|
@@ -457,8 +512,8 @@ def validate_horizons_present(dataset: TimeSeriesDataset, horizons: list[LeadTim
|
|
|
457
512
|
if dataset.horizons is None and len(horizons) == 1:
|
|
458
513
|
return # Non-versioned dataset can satisfy single-horizon requests
|
|
459
514
|
|
|
460
|
-
|
|
461
|
-
missing_horizons = [h for h in horizons if h not in
|
|
515
|
+
available_horizons = set(dataset.horizons or [])
|
|
516
|
+
missing_horizons = [h for h in horizons if h not in available_horizons]
|
|
462
517
|
if missing_horizons:
|
|
463
518
|
raise TimeSeriesValidationError("Missing forecast horizons: " + ", ".join(map(str, missing_horizons)))
|
|
464
519
|
|