openstef-core 4.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
1
+ # SPDX-FileCopyrightText: 2017-2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
2
+ #
3
+ # SPDX-License-Identifier: MPL-2.0
4
+ """Core functionality for OpenSTEF, a framework for short-term energy forecasting."""
@@ -0,0 +1,205 @@
1
+ # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
2
+ #
3
+ # SPDX-License-Identifier: MPL-2.0
4
+
5
+ """Configuration utilities for OpenSTEF Beam.
6
+
7
+ This module provides a `BaseConfig` class extending Pydantic's `BaseModel`
8
+ with convenience helpers for reading from and writing to YAML files. It also
9
+ exposes two helper functions `write_yaml_config` and `read_yaml_config` that
10
+ operate on arbitrary config instances or Pydantic models / adapters.
11
+ """
12
+
13
+ from pathlib import Path
14
+ from typing import Annotated, Any, Self
15
+
16
+ import yaml
17
+ from pydantic import BaseModel as PydanticBaseModel
18
+ from pydantic import (
19
+ BeforeValidator,
20
+ ConfigDict,
21
+ GetCoreSchemaHandler,
22
+ GetJsonSchemaHandler,
23
+ TypeAdapter,
24
+ ValidationInfo,
25
+ )
26
+ from pydantic_core import core_schema
27
+
28
+
29
class BaseModel(PydanticBaseModel):
    """Shared Pydantic base class for OpenSTEF component models.

    The model configuration clears Pydantic's protected namespaces, allows
    fields of arbitrary (non-Pydantic) types, and sets ``ser_json_inf_nan``
    to ``"null"`` for JSON serialization of infinite / NaN floats.
    """

    model_config = ConfigDict(
        protected_namespaces=(),
        arbitrary_types_allowed=True,
        ser_json_inf_nan="null",
    )
33
+
34
+
35
class BaseConfig(PydanticBaseModel):
    """Pydantic base class for configuration objects persisted as YAML.

    The model configuration ignores unknown input keys (``extra="ignore"``),
    disallows arbitrary (non-Pydantic) field types, and clears Pydantic's
    protected namespaces.
    """

    model_config = ConfigDict(
        arbitrary_types_allowed=False,
        extra="ignore",
        protected_namespaces=(),
    )

    def write_yaml(self, path: Path) -> None:
        """Persist this configuration as YAML.

        Delegates to the module-level ``write_yaml_config`` helper.

        Args:
            path: File to write; an existing file is overwritten.
        """
        write_yaml_config(config=self, path=path)

    @classmethod
    def read_yaml(cls, path: Path) -> Self:
        """Load and validate a configuration of this class from YAML.

        Delegates to the module-level ``read_yaml_config`` helper with this
        class as the validation target.

        Args:
            path: YAML file to load.

        Returns:
            A validated instance of ``cls`` built from the file contents.
        """
        return read_yaml_config(path=path, class_type=cls)
66
+
67
+
68
def write_yaml_config(config: BaseConfig, path: Path) -> None:
    """Serialize a configuration object to a YAML file.

    The config is dumped in Pydantic's JSON mode and rendered to YAML text
    *before* the destination is opened, so a serialization failure never
    truncates an existing file. ``yaml.safe_dump`` is used so the output
    contains no Python-specific tags, which ``read_yaml_config`` (built on
    ``yaml.safe_load``) would refuse to load.

    Args:
        config: The configuration object to serialize.
        path: Destination path for the YAML file (will be overwritten).

    Raises:
        OSError: If the destination cannot be written.
        yaml.YAMLError: If the dumped data holds a value that cannot be
            represented as plain YAML.

    Example:
        >>> class MyConfig(BaseConfig):
        ...     foo: int
        >>> cfg = MyConfig(foo=123)
        >>> write_yaml_config(cfg, Path("config.yaml"))  # doctest: +SKIP
    """
    # Finish every step that can fail on bad data before touching the file.
    data = config.model_dump(mode="json")
    text = yaml.safe_dump(data, allow_unicode=True)

    path.write_text(text, encoding="utf-8")
85
+
86
+
87
+ def read_yaml_config[T: BaseConfig, U](path: Path, class_type: type[T] | TypeAdapter[U]) -> T | U:
88
+ """Read a configuration object from a YAML file.
89
+
90
+ This function supports two kinds of targets:
91
+
92
+ * A subclass of `BaseConfig`, in which case Pydantic's `model_validate` is used.
93
+ * A `TypeAdapter` instance for more advanced / non-`BaseModel` schema validation.
94
+
95
+ Args:
96
+ path: Path to the YAML file to read.
97
+ class_type: The target type (a `BaseConfig` subclass) or a `TypeAdapter`.
98
+
99
+ Returns:
100
+ A validated configuration instance (either ``T`` or ``U`` depending on
101
+ the provided ``class_type``).
102
+ """
103
+ with path.open("r", encoding="utf-8") as f:
104
+ data = yaml.safe_load(f)
105
+
106
+ if isinstance(class_type, TypeAdapter):
107
+ return class_type.validate_python(data)
108
+
109
+ return class_type.model_validate(data)
110
+
111
+
112
class PydanticStringPrimitive:
    """Base for value types that Pydantic validates from, and serializes to, strings.

    Subclasses provide ``__str__`` and ``from_string``. This base plugs them
    into Pydantic's core schema and JSON schema hooks, and derives equality
    and hashing from the string form.
    """

    def __str__(self) -> str:
        """Return the string form of the value (must be overridden)."""
        raise NotImplementedError("Subclasses must implement __str__")

    @classmethod
    def from_string(cls, s: str) -> Self:
        """Build an instance from its string form (must be overridden)."""
        raise NotImplementedError("Subclasses must implement from_string")

    @classmethod
    def validate(cls, v: Any, _info: ValidationInfo | None = None) -> Self:  # noqa: ANN401
        """Coerce an input value into an instance of this type.

        Existing instances are returned unchanged and strings are parsed with
        ``from_string``; every other input is rejected.

        Args:
            v: Value to coerce.
            _info: Validation info supplied by Pydantic (not used).

        Returns:
            An instance of this type.

        Raises:
            ValueError: If ``v`` is neither an instance of this type nor a string.
        """
        if isinstance(v, cls):
            return v

        if not isinstance(v, str):
            # Subclasses should handle their specific types
            raise ValueError(f"Cannot convert {v} to {cls.__name__}")

        return cls.from_string(v)

    @classmethod
    def __get_pydantic_core_schema__(
        cls, _source_type: type[Any], _handler: GetCoreSchemaHandler
    ) -> core_schema.CoreSchema:
        """Build the Pydantic core schema for this type.

        Returns:
            A plain validator schema that validates with ``validate`` and
            serializes with ``__str__``.
        """
        to_string = core_schema.plain_serializer_function_ser_schema(cls.__str__)
        return core_schema.with_info_plain_validator_function(
            function=cls.validate,
            serialization=to_string,
        )

    @classmethod
    def __get_pydantic_json_schema__(  # noqa: PLW3201
        cls,
        _schema: core_schema.CoreSchema,
        handler: GetJsonSchemaHandler,
    ) -> dict[str, Any]:
        """Describe this type in JSON schema (OpenAPI / FastAPI compatibility).

        Every string-primitive type is exposed as a plain string.

        Returns:
            The JSON schema ``{"type": "string"}``.
        """
        string_schema: dict[str, Any] = {"type": "string"}
        return string_schema

    def __eq__(self, other: object) -> bool:
        """Compare by string form.

        Returns:
            True when ``other`` is an instance of this object's class and both
            have the same string form; ``NotImplemented`` for other types.
        """
        if isinstance(other, self.__class__):
            return str(self) == str(other)
        return NotImplemented

    def __hash__(self) -> int:
        """Hash the string form, consistent with ``__eq__``."""
        text = str(self)
        return hash(text)
188
+
189
+
190
+ def _convert_none_to_nan(v: float | None) -> float:
191
+ if v is None:
192
+ return float("nan")
193
+ return v
194
+
195
+
196
# Float type whose "before" validator replaces a ``None`` input with NaN.
FloatOrNan = Annotated[
    float,
    BeforeValidator(_convert_none_to_nan),
]

# Names exported by ``from <module> import *``.
__all__ = [
    "BaseConfig",
    "BaseModel",
    "FloatOrNan",
    "PydanticStringPrimitive",
    "read_yaml_config",
    "write_yaml_config",
]
@@ -0,0 +1,10 @@
1
+ # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
2
+ #
3
+ # SPDX-License-Identifier: MPL-2.0
4
+
5
+ """Shared constants for the openstef_core package."""
6
+
7
# Repository id of the Liander 2024 energy forecasting benchmark dataset.
# NOTE(review): the hosting platform is not visible from this module - confirm.
LIANDER_DATASET_REPO_ID = "OpenSTEF/liander2024-energy-forecasting-benchmark"

# Names exported by ``from <module> import *``.
__all__ = [
    "LIANDER_DATASET_REPO_ID",
]
@@ -0,0 +1,37 @@
1
+ # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
2
+ #
3
+ # SPDX-License-Identifier: MPL-2.0
4
+
5
+ """Time series datasets and versioned data access.
6
+
7
+ This module provides core data structures for handling time series data in OpenSTEF forecasting.
8
+ It includes both simple time series datasets and versioned datasets that track data availability
9
+ over time, enabling realistic backtesting, training, and forecasting.
10
+
11
+ The module supports:
12
+
13
+ - Regular time series with consistent sampling intervals
14
+ - Versioned time series that track when data became available
15
+ - Validated datasets with domain-specific constraints
16
+ - Data transformations and validation utilities
17
+ - Feature concatenation and horizon restriction operations
18
+ """
19
+
20
+ from openstef_core.datasets.timeseries_dataset import TimeSeriesDataset, validate_horizons_present
21
+ from openstef_core.datasets.validated_datasets import (
22
+ EnergyComponentDataset,
23
+ EnsembleForecastDataset,
24
+ ForecastDataset,
25
+ ForecastInputDataset,
26
+ )
27
+ from openstef_core.datasets.versioned_timeseries_dataset import VersionedTimeSeriesDataset
28
+
29
# Public API re-exported by the datasets package.
__all__ = [
    # From validated_datasets
    "EnergyComponentDataset",
    "EnsembleForecastDataset",
    "ForecastDataset",
    "ForecastInputDataset",
    # From timeseries_dataset / versioned_timeseries_dataset
    "TimeSeriesDataset",
    "VersionedTimeSeriesDataset",
    # Helper from timeseries_dataset
    "validate_horizons_present",
]
@@ -0,0 +1,221 @@
1
+ # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
2
+ #
3
+ # SPDX-License-Identifier: MPL-2.0
4
+
5
+ """Protocol definitions for time series dataset interfaces.
6
+
7
+ This module provides protocol classes that define the core interfaces for time series
8
+ datasets in OpenSTEF. Protocols enable type checking and documentation of expected
9
+ behavior without requiring inheritance.
10
+
11
+ Key protocols:
12
+
13
+ - TimeSeriesMixin: Core interface for all time series datasets with filtering and versioning
14
+ - DatasetMixin: Interface for dataset persistence operations
15
+ """
16
+
17
+ from abc import abstractmethod
18
+ from collections.abc import Callable
19
+ from datetime import datetime, timedelta
20
+ from typing import TYPE_CHECKING, Concatenate, Protocol, Self
21
+
22
+ from pydantic import FilePath
23
+
24
+ from openstef_core.types import AvailableAt, LeadTime
25
+
26
+ if TYPE_CHECKING:
27
+ import pandas as pd
28
+
29
+ from openstef_core.datasets.timeseries_dataset import TimeSeriesDataset
30
+
31
+
32
class TimeSeriesMixin(Protocol):
    """Protocol describing the common interface of time series datasets.

    Implementations expose feature metadata, temporal properties and the
    dataset's datetime index, plus filtering and version-selection operations.

    An implementing class must provide:

    - The datetime index
    - The sample interval
    - The list of feature names
    - A flag telling whether the dataset is versioned
    - Filters for time ranges, availability, and lead times
    - Version selection for point-in-time data reconstruction
    """

    @property
    @abstractmethod
    def index(self) -> "pd.DatetimeIndex":
        """Datetime index of the dataset.

        Returns:
            DatetimeIndex holding every timestamp in the dataset.
        """

    @property
    @abstractmethod
    def sample_interval(self) -> timedelta:
        """Fixed spacing between consecutive data points.

        Returns:
            The sample interval as a timedelta.
        """

    @property
    @abstractmethod
    def feature_names(self) -> list[str]:
        """Names of the features available in the dataset.

        Returns:
            Feature names only; metadata columns such as timestamp,
            available_at, or horizon are excluded.
        """

    @property
    @abstractmethod
    def is_versioned(self) -> bool:
        """Whether the dataset tracks data availability over time.

        Returns:
            True for a versioned dataset (availability tracked through horizon
            or available_at columns), False for a regular time series.
        """

    @abstractmethod
    def filter_by_range(self, start: datetime | None = None, end: datetime | None = None) -> Self:
        """Keep only the data that falls inside a time range.

        Args:
            start: Inclusive lower bound. None means no lower bound.
            end: Exclusive upper bound. None means no upper bound.

        Returns:
            New instance restricted to the half-open range [start, end).
        """

    @abstractmethod
    def filter_by_available_before(self, available_before: datetime) -> Self:
        """Keep only the data that was available before a given moment.

        Args:
            available_before: Availability cutoff.

        Returns:
            New instance holding only data available before the cutoff.
        """

    @abstractmethod
    def filter_by_available_at(self, available_at: AvailableAt) -> Self:
        """Filter according to realistic daily data availability constraints.

        Args:
            available_at: Specification of when data becomes available.

        Returns:
            New instance filtered by the availability pattern.
        """

    @abstractmethod
    def filter_by_lead_time(self, lead_time: LeadTime) -> Self:
        """Keep only data available at or longer than the given lead time.

        Args:
            lead_time: Minimum gap required between data availability and timestamp.

        Returns:
            New instance holding only data that meets the required lead time.
        """

    @abstractmethod
    def select_version(self) -> "TimeSeriesDataset":
        """Collapse the dataset to one version per timestamp.

        Builds a point-in-time snapshot by taking the latest available version
        for each timestamp, which guards against lookahead bias in backtesting.

        Returns:
            TimeSeriesDataset holding the selected version of the data.
        """

    def calculate_time_coverage(self) -> timedelta:
        """Compute the total duration covered by the dataset.

        The coverage is the number of unique timestamps in the index
        multiplied by the sample interval.

        Returns:
            timedelta: Total time coverage of the dataset.
        """
        distinct_timestamps = self.index.unique()
        return len(distinct_timestamps) * self.sample_interval
153
+
154
+
155
+ class DatasetMixin(Protocol):
156
+ """Abstract base class for dataset persistence operations.
157
+
158
+ This mixin defines the interface for saving and loading datasets to/from
159
+ parquet files. It ensures datasets can be persisted with all their metadata
160
+ and reconstructed exactly as they were saved.
161
+
162
+ Classes implementing this mixin must:
163
+
164
+ - Save all data and metadata necessary for complete reconstruction
165
+ - Store metadata in parquet file attributes using attrs
166
+ - Handle missing metadata gracefully with sensible defaults when loading
167
+
168
+ See Also:
169
+ TimeSeriesDataset: Implementation for standard time series datasets.
170
+ VersionedTimeSeriesDataset: Implementation for versioned dataset segments.
171
+ """
172
+
173
+ @abstractmethod
174
+ def to_parquet(self, path: FilePath) -> None:
175
+ """Save the dataset to a parquet file.
176
+
177
+ Stores both the dataset's data and all necessary metadata for complete
178
+ reconstruction. Metadata should be stored in the parquet file's attrs
179
+ dictionary.
180
+
181
+ Args:
182
+ path: File path where the dataset should be saved.
183
+
184
+ See Also:
185
+ read_parquet: Counterpart method for loading datasets.
186
+ """
187
+
188
+ @classmethod
189
+ @abstractmethod
190
+ def read_parquet(cls, path: FilePath) -> Self:
191
+ """Load a dataset from a parquet file.
192
+
193
+ Reconstructs a dataset from a parquet file created with to_parquet,
194
+ including all data and metadata. Should handle missing metadata
195
+ gracefully with sensible defaults.
196
+
197
+ Args:
198
+ path: Path to the parquet file to load.
199
+
200
+ Returns:
201
+ New dataset instance reconstructed from the file.
202
+
203
+ See Also:
204
+ to_parquet: Counterpart method for saving datasets.
205
+ """
206
+
207
+ def pipe[T, **P](self, func: Callable[Concatenate[Self, P], T], *args: P.args, **kwargs: P.kwargs) -> T:
208
+ """Applies a function to the dataset and returns the result.
209
+
210
+ This method allows for functional-style transformations and operations
211
+ on the dataset, enabling method chaining and cleaner code.
212
+
213
+ Args:
214
+ func: A callable that takes the dataset instance and returns a value of type T.
215
+ *args: Positional arguments to pass to the function.
216
+ **kwargs: Keyword arguments to pass to the function.
217
+
218
+ Returns:
219
+ The result of applying the function to the dataset.
220
+ """
221
+ return func(self, *args, **kwargs)