PyPI - openstef-core - Versions diffs - 4.0.0.dev1__tar.gz → 4.0.1__tar.gz - Mend

openstef-core 4.0.0.dev1__tar.gz → 4.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (54) hide show

{openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/.gitignore RENAMED Viewed

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: 2017-2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com> # noqa E501>
+# SPDX-FileCopyrightText: 2017-2025 Contributors to the OpenSTEF project <openstef@lfenergy.org> # noqa E501>
 # SPDX-License-Identifier: MPL-2.0
 # Core
@@ -67,6 +67,12 @@ dmypy.json
 # Sphinx
 docs/_build/
 docs/source/api/generated/
+docs/source/tutorials/
+docs/source/benchmarks/
+docs/source/user_guide/**/quick_start_tutorial.py
+docs/source/user_guide/**/feature_engineering_tutorial.py
+docs/source/user_guide/**/datasets_tutorial.py
+docs/source/user_guide/**/backtesting_tutorial.py
 # docs/_doctrees/
 # docs/_static_gen/
@@ -124,4 +130,17 @@ certificates/
 *.pkl
 # Benchmark outputs
-benchmark_results*/
+benchmark_results*/
+# Local dataset files
+liander_dataset/
+# Mlflow
+/mlflow
+/mlflow_artifacts_local
+.github/instructions
+# Jupyter notebook cache (myst-nb execution outputs)
+.jupyter_cache/
+docs/build.zip

{openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/PKG-INFO RENAMED Viewed

@@ -1,12 +1,12 @@
 Metadata-Version: 2.4
 Name: openstef-core
-Version: 4.0.0.dev1
+Version: 4.0.1
 Summary: Core functionality for OpenSTEF, a framework for short-term energy forecasting.
 Project-URL: Documentation, https://openstef.github.io/openstef/index.html
 Project-URL: Homepage, https://lfenergy.org/projects/openstef/
 Project-URL: Issues, https://github.com/OpenSTEF/openstef/issues
 Project-URL: Repository, https://github.com/OpenSTEF/openstef
-Author-email: "Alliander N.V" <short.term.energy.forecasts@alliander.com>
+Author-email: "Alliander N.V" <openstef@lfenergy.org>
 License-Expression: MPL-2.0
 Keywords: energy,forecasting,machinelearning
 Classifier: Development Status :: 5 - Production/Stable
@@ -22,12 +22,14 @@ Requires-Dist: pandas<3,>=2.3.1
 Requires-Dist: pyarrow>=21
 Requires-Dist: pydantic-extra-types<3,>=2.10.5
 Requires-Dist: pydantic<3,>=2.12.4
+Provides-Extra: benchmark
+Requires-Dist: huggingface-hub>=1.2.2; extra == 'benchmark'
 Description-Content-Type: text/markdown
 <!--
-SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
+SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
 SPDX-License-Identifier: MPL-2.0
 -->
-# openstef-core
+# openstef-core

{openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/README.md RENAMED Viewed

@@ -1,7 +1,7 @@
 <!--
-SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
+SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
 SPDX-License-Identifier: MPL-2.0
 -->
-# openstef-core
+# openstef-core

{openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/pyproject.toml RENAMED Viewed

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
 #
 # SPDX-License-Identifier: MPL-2.0
@@ -9,13 +9,13 @@ requires = [ "hatchling" ]
 [project]
 name = "openstef-core"
-version = "4.0.0.dev1"
+version = "4.0.1"
 description = "Core functionality for OpenSTEF, a framework for short-term energy forecasting."
 readme = "README.md"
 keywords = [ "energy", "forecasting", "machinelearning" ]
 license = "MPL-2.0"
 authors = [
-  { name = "Alliander N.V", email = "short.term.energy.forecasts@alliander.com" },
+  { name = "Alliander N.V", email = "openstef@lfenergy.org" },
 ]
 requires-python = ">=3.12,<4.0"
 classifiers = [
@@ -36,6 +36,10 @@ dependencies = [
   "pydantic-extra-types>=2.10.5,<3",
 ]
+optional-dependencies.benchmark = [
+  "huggingface-hub>=1.2.2",
+]
 urls.Documentation = "https://openstef.github.io/openstef/index.html"
 urls.Homepage = "https://lfenergy.org/projects/openstef/"
 urls.Issues = "https://github.com/OpenSTEF/openstef/issues"

{openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/__init__.py RENAMED Viewed

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: 2017-2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
+# SPDX-FileCopyrightText: 2017-2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
 #
 # SPDX-License-Identifier: MPL-2.0
 """Core functionality for OpenSTEF, a framework for short-term energy forecasting."""

{openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/base_model.py RENAMED Viewed

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
 #
 # SPDX-License-Identifier: MPL-2.0
@@ -15,7 +15,14 @@ from typing import Annotated, Any, Self
 import yaml
 from pydantic import BaseModel as PydanticBaseModel
-from pydantic import BeforeValidator, ConfigDict, GetCoreSchemaHandler, TypeAdapter
+from pydantic import (
+    BeforeValidator,
+    ConfigDict,
+    GetCoreSchemaHandler,
+    GetJsonSchemaHandler,
+    TypeAdapter,
+    ValidationInfo,
+)
 from pydantic_core import core_schema
@@ -115,7 +122,7 @@ class PydanticStringPrimitive:
         raise NotImplementedError("Subclasses must implement from_string")
     @classmethod
-    def validate(cls, v: Any, _info: Any = None) -> Self:  # noqa: ANN401
+    def validate(cls, v: Any, _info: ValidationInfo | None = None) -> Self:  # noqa: ANN401
         """Validate and convert input to this type.
         Args:
@@ -150,6 +157,21 @@ class PydanticStringPrimitive:
             function=cls.validate, serialization=core_schema.plain_serializer_function_ser_schema(cls.__str__)
         )
+    @classmethod
+    def __get_pydantic_json_schema__(  # noqa: PLW3201
+        cls,
+        _schema: core_schema.CoreSchema,
+        handler: GetJsonSchemaHandler,
+    ) -> dict[str, Any]:
+        """Generate JSON schema for OpenAPI / FastAPI compatibility.
+        All string-primitive types serialise as plain strings.
+        Returns:
+            JSON schema describing the type as a string.
+        """
+        return {"type": "string"}
     def __eq__(self, other: object) -> bool:
         """Check equality based on string representation.

openstef_core-4.0.1/src/openstef_core/constants.py ADDED Viewed

@@ -0,0 +1,10 @@
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
+#
+# SPDX-License-Identifier: MPL-2.0
+"""Shared constants for the openstef_core package."""
+LIANDER_DATASET_REPO_ID = "OpenSTEF/liander2024-energy-forecasting-benchmark"
+__all__ = ["LIANDER_DATASET_REPO_ID"]

{openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/datasets/__init__.py RENAMED Viewed

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
 #
 # SPDX-License-Identifier: MPL-2.0
@@ -9,6 +9,7 @@ It includes both simple time series datasets and versioned datasets that track d
 over time, enabling realistic backtesting and training and forecasting.
 The module supports:
     - Regular time series with consistent sampling intervals
     - Versioned time series that track when data became available
     - Validated datasets with domain-specific constraints
@@ -19,6 +20,7 @@ The module supports:
 from openstef_core.datasets.timeseries_dataset import TimeSeriesDataset, validate_horizons_present
 from openstef_core.datasets.validated_datasets import (
     EnergyComponentDataset,
+    EnsembleForecastDataset,
     ForecastDataset,
     ForecastInputDataset,
 )
@@ -26,6 +28,7 @@ from openstef_core.datasets.versioned_timeseries_dataset import VersionedTimeSer
 __all__ = [
     "EnergyComponentDataset",
+    "EnsembleForecastDataset",
     "ForecastDataset",
     "ForecastInputDataset",
     "TimeSeriesDataset",

{openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/datasets/mixins.py RENAMED Viewed

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
 #
 # SPDX-License-Identifier: MPL-2.0
@@ -9,6 +9,7 @@ datasets in OpenSTEF. Protocols enable type checking and documentation of expect
 behavior without requiring inheritance.
 Key protocols:
     - TimeSeries: Core interface for all time series datasets with filtering and versioning
     - DatasetMixin: Interface for dataset persistence operations
 """
@@ -36,6 +37,7 @@ class TimeSeriesMixin(Protocol):
     the dataset's temporal index, and filtering/versioning capabilities.
     Classes implementing this interface must provide:
         - Access to the datetime index
         - Sample interval information
         - Feature names list
@@ -158,6 +160,7 @@ class DatasetMixin(Protocol):
     and reconstructed exactly as they were saved.
     Classes implementing this mixin must:
     - Save all data and metadata necessary for complete reconstruction
     - Store metadata in parquet file attributes using attrs
     - Handle missing metadata gracefully with sensible defaults when loading

{openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/datasets/timeseries_dataset.py RENAMED Viewed

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
 #
 # SPDX-License-Identifier: MPL-2.0
@@ -42,11 +42,18 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin):  # noqa: PLR0904 - impor
     over time through either a horizon column or an available_at column.
     The dataset automatically detects versioning:
     - If a horizon column exists, data is versioned by forecast horizon
     - If an available_at column exists, data is versioned by availability time
     - Otherwise, data is treated as a regular time series
+    Columns whose names start with a double underscore (``__``) are treated as
+    internal/system columns: they are kept in ``data`` so transforms can pass
+    them along, but are excluded from ``feature_names`` so feature-aware
+    transforms ignore them.
     The dataset guarantees:
         - Data is sorted by timestamp in ascending order
         - Consistent sampling interval across all data points
         - DateTime index for temporal operations
@@ -57,7 +64,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin):  # noqa: PLR0904 - impor
         available_at_column: Name of the column storing availability times (if versioned).
     Example:
-        Create a simple time series dataset:
+        Create a simple time series dataset
         >>> import pandas as pd
         >>> from datetime import timedelta
@@ -71,7 +78,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin):  # noqa: PLR0904 - impor
         >>> dataset.is_versioned
         False
-        Create a versioned dataset with horizons:
+        Create a versioned dataset with horizons
         >>> data_with_horizon = pd.DataFrame({
         ...     'load': [100, 120],
@@ -104,10 +111,12 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin):  # noqa: PLR0904 - impor
         horizon_column: str = "horizon",
         available_at_column: str = "available_at",
         is_sorted: bool = False,
+        check_frequency: bool = False,
     ) -> None:
         """Initialize a time series dataset.
         The dataset automatically detects whether it's versioned based on column presence:
         - If horizon_column exists: versioned by forecast horizon
         - If available_at_column exists: versioned by availability time
         - Otherwise: regular time series
@@ -118,10 +127,12 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin):  # noqa: PLR0904 - impor
             horizon_column: Name of the column storing forecast horizons.
             available_at_column: Name of the column storing availability times.
             is_sorted: Whether the data is sorted by timestamp.
+            check_frequency: Whether to check that the data frequency matches sample_interval.
         Raises:
             TypeError: If data index is not a pandas DatetimeIndex or if versioning
                 columns have incorrect types.
+            ValueError: If data frequency does not match sample_interval.
         """
         if not isinstance(data.index, pd.DatetimeIndex):
             raise TypeError("Data index must be a pandas DatetimeIndex.")
@@ -130,9 +141,15 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin):  # noqa: PLR0904 - impor
         self.horizon_column = horizon_column
         self.available_at_column = available_at_column
         self._sample_interval = sample_interval
-        self._internal_columns = set()
+        self._internal_columns = {col for col in data.columns if col.startswith("__")}
         data.index.name = self.index_name
+        # Check input data frequency matches sample_interval, only if there are enough data points to infer frequency
+        minimum_required_length = 2
+        if check_frequency and len(data) >= minimum_required_length and not self.frequency_matches(data.index):
+            msg = f"Data frequency does not match the sample_interval ({sample_interval})."
+            raise ValueError(msg)
         if self.horizon_column in data.columns:
             validate_timedelta_column(data[self.horizon_column])
             self._version_column = self.horizon_column
@@ -147,7 +164,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin):  # noqa: PLR0904 - impor
             self._horizons = None
         else:
             self._version_column = None
-            self._feature_names = data.columns.to_list()
+            self._feature_names = [col for col in data.columns if col not in self._internal_columns]
             self._horizons = None
         # Ensure invariants: data is sorted by timestamp
@@ -257,7 +274,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin):  # noqa: PLR0904 - impor
         if available_at_series is None:
             return self
-        cutoff = self.index.floor("D") - pd.Timedelta(available_at.lag_from_day)
+        cutoff = available_at.apply_index(self.index)
         data_filtered = self.data[available_at_series <= cutoff]
         return self._copy_with_data(data=data_filtered)
@@ -443,6 +460,44 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin):  # noqa: PLR0904 - impor
             is_sorted=is_sorted,
         )
+    @staticmethod
+    def _infer_frequency(index: pd.DatetimeIndex) -> pd.Timedelta:
+        """Infer the frequency of a pandas DatetimeIndex if the freq attribute is not set.
+        This method calculates the most common time difference between consecutive timestamps,
+        which is more permissive of missing chunks of data than the pandas infer_freq method.
+        Args:
+            index (pd.DatetimeIndex): The datetime index to infer the frequency from.
+        Returns:
+            pd.Timedelta: The inferred frequency as a pandas Timedelta.
+        Raises:
+            ValueError: If the index has fewer than 2 timestamps.
+        """
+        minimum_required_length = 2
+        if len(index) < minimum_required_length:
+            raise ValueError("Cannot infer frequency from an index with fewer than 2 timestamps.")
+        # Calculate the differences between consecutive timestamps
+        deltas = index.to_series().drop_duplicates().sort_values().diff().dropna()
+        # Find the most common difference
+        return deltas.mode().iloc[0]
+    def frequency_matches(self, index: pd.DatetimeIndex) -> bool:
+        """Check if the frequency of the data matches the model frequency.
+        Args:
+            index (pd.DatetimeIndex): The data to check.
+        Returns:
+            bool: True if the frequencies match, False otherwise.
+        """
+        input_sample_interval = self._infer_frequency(index) if index.freq is None else index.freq
+        return input_sample_interval == self.sample_interval
 def validate_horizons_present(dataset: TimeSeriesDataset, horizons: list[LeadTime]) -> None:
     """Validate that the specified forecast horizons are present in the dataset.

{openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/datasets/validated_datasets.py RENAMED Viewed

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
 #
 # SPDX-License-Identifier: MPL-2.0
@@ -12,6 +12,7 @@ validation to catch data quality issues early.
 from datetime import datetime, timedelta
 from typing import Self, override
+import numpy as np
 import pandas as pd
 from openstef_core.datasets.timeseries_dataset import TimeSeriesDataset
@@ -19,6 +20,8 @@ from openstef_core.datasets.validation import validate_required_columns
 from openstef_core.exceptions import MissingColumnsError
 from openstef_core.types import EnergyComponentType, LeadTime, Quantile
+ENSEMBLE_COLUMN_SEP: str = "__"
 class ForecastInputDataset(TimeSeriesDataset):
     """Time series dataset for forecasting with validated target column.
@@ -78,6 +81,7 @@ class ForecastInputDataset(TimeSeriesDataset):
         *,
         horizon_column: str = "horizon",
         available_at_column: str = "available_at",
+        check_frequency: bool = False,
         sample_weight_column: str = "sample_weight",
         target_column: str = "load",
     ) -> None:
@@ -95,6 +99,7 @@ class ForecastInputDataset(TimeSeriesDataset):
             sample_interval=sample_interval,
             horizon_column=horizon_column,
             available_at_column=available_at_column,
+            check_frequency=check_frequency,
         )
         self._internal_columns.add(self.sample_weight_column)
         self._feature_names = [col for col in self.data.columns if col not in self._internal_columns]
@@ -250,12 +255,14 @@ class ForecastDataset(TimeSeriesDataset):
         *,
         horizon_column: str = "horizon",
         available_at_column: str = "available_at",
+        standard_deviation_column: str = "stdev",
     ) -> None:
         if "forecast_start" in data.attrs:
             self.forecast_start = datetime.fromisoformat(data.attrs["forecast_start"])
         else:
             self.forecast_start = forecast_start if forecast_start is not None else data.index.min().to_pydatetime()
         self.target_column = data.attrs.get("target_column", target_column)
+        self.standard_deviation_column = data.attrs.get("standard_deviation_column", standard_deviation_column)
         super().__init__(
             data=data,
@@ -264,12 +271,42 @@ class ForecastDataset(TimeSeriesDataset):
             available_at_column=available_at_column,
         )
-        quantile_feature_names = [col for col in self.feature_names if col != target_column]
+        exclude_columns = {target_column, standard_deviation_column}
+        quantile_feature_names = [col for col in self.feature_names if col not in exclude_columns]
         if not all(Quantile.is_valid_quantile_string(col) for col in quantile_feature_names):
             raise ValueError("All feature names must be valid quantile strings.")
         self.quantiles = [Quantile.parse(col) for col in quantile_feature_names]
+    @classmethod
+    def from_quantile_predictions(
+        cls,
+        predictions: np.ndarray,
+        index: pd.Index,
+        quantiles: list[Quantile],
+        sample_interval: timedelta,
+        *,
+        target_column: str = "load",
+    ) -> "ForecastDataset":
+        """Build a ``ForecastDataset`` from a raw predictions array.
+        Args:
+            predictions: Shape ``(n_samples, n_quantiles)``.
+            index: Time index for the predictions.
+            quantiles: Quantiles the model was trained on, in the same order as columns in *predictions*.
+            sample_interval: Temporal resolution of the dataset.
+            target_column: Name of the target column to attach to the dataset.
+        Returns:
+            ``ForecastDataset`` with quantile columns and the provided time index.
+        """
+        df = pd.DataFrame(
+            data=predictions,
+            index=index,
+            columns=[q.format() for q in quantiles],
+        )
+        return cls(data=df, sample_interval=sample_interval, target_column=target_column)
     @property
     def target_series(self) -> pd.Series | None:
         """Extract the target time series from the dataset.
@@ -296,6 +333,20 @@ class ForecastDataset(TimeSeriesDataset):
             raise MissingColumnsError(missing_columns=[median_col])
         return self.data[median_col]
+    @property
+    def standard_deviation_series(self) -> pd.Series:
+        """Extract the standard deviation series if it exists.
+        Returns:
+            Time series containing standard deviation values with original datetime index.
+        Raises:
+            MissingColumnsError: If the standard deviation column is not found.
+        """
+        if self.standard_deviation_column not in self.data.columns:
+            raise MissingColumnsError(missing_columns=[self.standard_deviation_column])
+        return self.data[self.standard_deviation_column]  # pyright: ignore[reportUnknownVariableType]
     @property
     def quantiles_data(self) -> pd.DataFrame:
         """Extract DataFrame containing only the quantile forecast columns.
@@ -331,6 +382,7 @@ class ForecastDataset(TimeSeriesDataset):
         df = super().to_pandas()
         df.attrs["target_column"] = self.target_column
         df.attrs["forecast_start"] = self.forecast_start.isoformat()
+        df.attrs["standard_deviation_column"] = self.standard_deviation_column
         return df
     @classmethod
@@ -409,8 +461,156 @@ class EnergyComponentDataset(TimeSeriesDataset):
         )
+class EnsembleForecastDataset(TimeSeriesDataset):
+    """First stage output format for ensemble forecasters."""
+    forecast_start: datetime
+    quantiles: list[Quantile]
+    forecaster_names: list[str]
+    target_column: str
+    @override
+    def __init__(
+        self,
+        data: pd.DataFrame,
+        sample_interval: timedelta = timedelta(minutes=15),
+        forecast_start: datetime | None = None,
+        target_column: str = "load",
+        *,
+        horizon_column: str = "horizon",
+        available_at_column: str = "available_at",
+    ) -> None:
+        if "forecast_start" in data.attrs:
+            self.forecast_start = datetime.fromisoformat(data.attrs["forecast_start"])
+        else:
+            self.forecast_start = forecast_start if forecast_start is not None else data.index.min().to_pydatetime()
+        self.target_column = data.attrs.get("target_column", target_column)
+        super().__init__(
+            data=data,
+            sample_interval=sample_interval,
+            horizon_column=horizon_column,
+            available_at_column=available_at_column,
+        )
+        quantile_feature_names = [col for col in self.feature_names if col != target_column]
+        self.forecaster_names, self.quantiles = self.get_learner_and_quantile(pd.Index(quantile_feature_names))
+        for name in self.forecaster_names:
+            if ENSEMBLE_COLUMN_SEP in name:
+                msg = f"Forecaster name '{name}' must not contain separator '{ENSEMBLE_COLUMN_SEP}'."
+                raise ValueError(msg)
+        n_cols = len(self.forecaster_names) * len(self.quantiles)
+        if len(data.columns) not in {n_cols + 1, n_cols}:
+            raise ValueError("Data columns do not match the expected number based on base forecasters and quantiles.")
+    @property
+    def target_series(self) -> pd.Series | None:
+        """Return the target series if available."""
+        if self.target_column in self.data.columns:
+            return self.data[self.target_column]
+        return None
+    @staticmethod
+    def get_learner_and_quantile(feature_names: pd.Index) -> tuple[list[str], list[Quantile]]:
+        """Extract base forecaster names and quantiles from feature names.
+        Column format is ``{learner}{ENSEMBLE_COLUMN_SEP}{quantile.format()}``,
+        e.g. ``lgbm__quantile_P50``.
+        Args:
+            feature_names: Index of feature names in the dataset.
+        Returns:
+            Tuple containing a list of base forecaster names and a list of quantiles.
+        Raises:
+            ValueError: If a column cannot be parsed or has an invalid quantile string.
+        """
+        forecasters: set[str] = set()
+        quantiles: set[Quantile] = set()
+        for feature_name in feature_names:
+            parts = feature_name.split(ENSEMBLE_COLUMN_SEP, maxsplit=1)
+            if len(parts) != 2:  # noqa: PLR2004
+                msg = f"Column missing separator '{ENSEMBLE_COLUMN_SEP}': {feature_name}"
+                raise ValueError(msg)
+            learner_part, quantile_part = parts
+            if not Quantile.is_valid_quantile_string(quantile_part):
+                msg = f"Column has no valid quantile string: {feature_name}"
+                raise ValueError(msg)
+            forecasters.add(learner_part)
+            quantiles.add(Quantile.parse(quantile_part))
+        return list(forecasters), list(quantiles)
+    @classmethod
+    def from_forecast_datasets(
+        cls,
+        datasets: dict[str, ForecastDataset],
+        target_series: pd.Series | None = None,
+        sample_weights: pd.Series | None = None,
+    ) -> Self:
+        """Create an EnsembleForecastDataset from multiple ForecastDatasets.
+        Args:
+            datasets: Dict of ForecastDatasets to combine.
+            target_series: Optional target series to include in the dataset.
+            sample_weights: Optional sample weights series to include in the dataset.
+        Returns:
+            EnsembleForecastDataset combining all input datasets.
+        """
+        ds1 = next(iter(datasets.values()))
+        additional_columns: dict[str, pd.Series] = {}
+        if isinstance(ds1.target_series, pd.Series):
+            additional_columns[ds1.target_column] = ds1.target_series
+        elif target_series is not None:
+            additional_columns[ds1.target_column] = target_series
+        sample_weight_column = "sample_weight"
+        if sample_weights is not None:
+            additional_columns[sample_weight_column] = sample_weights
+        combined_data = pd.DataFrame({
+            f"{learner}{ENSEMBLE_COLUMN_SEP}{q.format()}": ds.data[q.format()]
+            for learner, ds in datasets.items()
+            for q in ds.quantiles
+        }).assign(**additional_columns)
+        return cls(
+            data=combined_data,
+            sample_interval=ds1.sample_interval,
+            forecast_start=ds1.forecast_start,
+            target_column=ds1.target_column,
+        )
+    def get_base_predictions_for_quantile(self, quantile: Quantile) -> ForecastInputDataset:
+        """Get base forecaster predictions for a specific quantile.
+        Args:
+            quantile: Quantile to select.
+        Returns:
+            ForecastInputDataset containing predictions from all base forecasters at the specified quantile.
+        """
+        selected_columns = [f"{learner}{ENSEMBLE_COLUMN_SEP}{quantile.format()}" for learner in self.forecaster_names]
+        selected_columns.append(self.target_column)
+        prediction_data = self.data[selected_columns].copy()
+        prediction_data.columns = [*self.forecaster_names, self.target_column]
+        return ForecastInputDataset(
+            data=prediction_data,
+            sample_interval=self.sample_interval,
+            target_column=self.target_column,
+            forecast_start=self.forecast_start,
+        )
 __all__ = [
+    "ENSEMBLE_COLUMN_SEP",
     "EnergyComponentDataset",
+    "EnsembleForecastDataset",
     "ForecastDataset",
     "ForecastInputDataset",
 ]

{openstef_core-4.0.0.dev1 → openstef_core-4.0.1}/src/openstef_core/datasets/validation.py RENAMED Viewed

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
 #
 # SPDX-License-Identifier: MPL-2.0

openstef-core 4.0.0.dev1__tar.gz → 4.0.1__tar.gz

openstef-core 4.0.0.dev1__tar.gz → 4.0.1__tar.gz