PyPI - openstef-core - Versions diffs - 4.0.0.dev1__tar.gz → 4.1.0__tar.gz - Mend

openstef-core 4.0.0.dev1__tar.gz → 4.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

{openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/.gitignore RENAMED Viewed

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: 2017-2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com> # noqa E501>
+# SPDX-FileCopyrightText: 2017-2025 Contributors to the OpenSTEF project <openstef@lfenergy.org> # noqa E501>
 # SPDX-License-Identifier: MPL-2.0
 # Core
@@ -35,10 +35,6 @@ MANIFEST
 # Ruff
 .ruff_cache/
-# Pyright
-.pyright/
-# pyright-report/
 # Test, coverage, tox
 .pytest_cache/
 .coverage
@@ -67,6 +63,14 @@ dmypy.json
 # Sphinx
 docs/_build/
 docs/source/api/generated/
+docs/source/tutorials/
+docs/source/benchmarks/
+# Community health files materialized from OpenSTEF/.github at build time
+docs/source/contribute/_community/
+docs/source/user_guide/**/quick_start_tutorial.py
+docs/source/user_guide/**/feature_engineering_tutorial.py
+docs/source/user_guide/**/datasets_tutorial.py
+docs/source/user_guide/**/backtesting_tutorial.py
 # docs/_doctrees/
 # docs/_static_gen/
@@ -124,4 +128,20 @@ certificates/
 *.pkl
 # Benchmark outputs
-benchmark_results*/
+benchmark_results*/
+# Local dataset files
+liander_dataset/
+# Deployment example run artifacts (MLflow store, forecasts, dataset, Celery/Airflow state)
+openstef_deployment_runs/
+# Mlflow
+/mlflow
+/mlflow_artifacts_local
+.github/instructions
+# Jupyter notebook cache (myst-nb execution outputs)
+.jupyter_cache/
+docs/build.zip

{openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/PKG-INFO RENAMED Viewed

@@ -1,12 +1,12 @@
 Metadata-Version: 2.4
 Name: openstef-core
-Version: 4.0.0.dev1
+Version: 4.1.0
 Summary: Core functionality for OpenSTEF, a framework for short-term energy forecasting.
 Project-URL: Documentation, https://openstef.github.io/openstef/index.html
 Project-URL: Homepage, https://lfenergy.org/projects/openstef/
 Project-URL: Issues, https://github.com/OpenSTEF/openstef/issues
 Project-URL: Repository, https://github.com/OpenSTEF/openstef
-Author-email: "Alliander N.V" <short.term.energy.forecasts@alliander.com>
+Author-email: "Alliander N.V" <openstef@lfenergy.org>
 License-Expression: MPL-2.0
 Keywords: energy,forecasting,machinelearning
 Classifier: Development Status :: 5 - Production/Stable
@@ -22,12 +22,14 @@ Requires-Dist: pandas<3,>=2.3.1
 Requires-Dist: pyarrow>=21
 Requires-Dist: pydantic-extra-types<3,>=2.10.5
 Requires-Dist: pydantic<3,>=2.12.4
+Provides-Extra: benchmark
+Requires-Dist: huggingface-hub>=1.2.2; extra == 'benchmark'
 Description-Content-Type: text/markdown
 <!--
-SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
+SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
 SPDX-License-Identifier: MPL-2.0
 -->
-# openstef-core
+# openstef-core

{openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/README.md RENAMED Viewed

@@ -1,7 +1,7 @@
 <!--
-SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
+SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
 SPDX-License-Identifier: MPL-2.0
 -->
-# openstef-core
+# openstef-core

{openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/pyproject.toml RENAMED Viewed

@@ -1,21 +1,19 @@
-# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
 #
 # SPDX-License-Identifier: MPL-2.0
 [build-system]
 build-backend = "hatchling.build"
 requires = [ "hatchling" ]
 [project]
 name = "openstef-core"
-version = "4.0.0.dev1"
+version = "4.1.0"
 description = "Core functionality for OpenSTEF, a framework for short-term energy forecasting."
 readme = "README.md"
 keywords = [ "energy", "forecasting", "machinelearning" ]
 license = "MPL-2.0"
 authors = [
-  { name = "Alliander N.V", email = "short.term.energy.forecasts@alliander.com" },
+  { name = "Alliander N.V", email = "openstef@lfenergy.org" },
 ]
 requires-python = ">=3.12,<4.0"
 classifiers = [
@@ -26,20 +24,22 @@ classifiers = [
   "Programming Language :: Python :: 3.13",
   "Programming Language :: Python :: 3.14",
 ]
 dependencies = [
   "joblib>=1,<2",
   "numpy>=2.3.2,<3",
+  # Held at <3 pending the pandas 3.0 Copy-on-Write migration (see migration issue).
   "pandas>=2.3.1,<3",
   "pyarrow>=21",
   "pydantic>=2.12.4,<3",
   "pydantic-extra-types>=2.10.5,<3",
 ]
+optional-dependencies.benchmark = [
+  "huggingface-hub>=1.2.2",
+]
 urls.Documentation = "https://openstef.github.io/openstef/index.html"
 urls.Homepage = "https://lfenergy.org/projects/openstef/"
 urls.Issues = "https://github.com/OpenSTEF/openstef/issues"
 urls.Repository = "https://github.com/OpenSTEF/openstef"
-[tool.hatch.build.targets.wheel]
-packages = [ "src/openstef_core" ]
+[tool.hatch]
+build.targets.wheel.packages = [ "src/openstef_core" ]

{openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/__init__.py RENAMED Viewed

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: 2017-2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
+# SPDX-FileCopyrightText: 2017-2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
 #
 # SPDX-License-Identifier: MPL-2.0
 """Core functionality for OpenSTEF, a framework for short-term energy forecasting."""

{openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/base_model.py RENAMED Viewed

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
 #
 # SPDX-License-Identifier: MPL-2.0
@@ -11,11 +11,18 @@ operate on arbitrary config instances or Pydantic models / adapters.
 """
 from pathlib import Path
-from typing import Annotated, Any, Self
+from typing import Annotated, Any, Self, override
 import yaml
 from pydantic import BaseModel as PydanticBaseModel
-from pydantic import BeforeValidator, ConfigDict, GetCoreSchemaHandler, TypeAdapter
+from pydantic import (
+    BeforeValidator,
+    ConfigDict,
+    GetCoreSchemaHandler,
+    GetJsonSchemaHandler,
+    TypeAdapter,
+    ValidationInfo,
+)
 from pydantic_core import core_schema
@@ -105,6 +112,7 @@ def read_yaml_config[T: BaseConfig, U](path: Path, class_type: type[T] | TypeAda
 class PydanticStringPrimitive:
     """Base class for Pydantic-compatible types with string serialization."""
+    @override
     def __str__(self) -> str:
         """Convert to string representation."""
         raise NotImplementedError("Subclasses must implement __str__")
@@ -115,7 +123,7 @@ class PydanticStringPrimitive:
         raise NotImplementedError("Subclasses must implement from_string")
     @classmethod
-    def validate(cls, v: Any, _info: Any = None) -> Self:  # noqa: ANN401
+    def validate(cls, v: Any, _info: ValidationInfo | None = None) -> Self:  # noqa: ANN401
         """Validate and convert input to this type.
         Args:
@@ -150,6 +158,22 @@ class PydanticStringPrimitive:
             function=cls.validate, serialization=core_schema.plain_serializer_function_ser_schema(cls.__str__)
         )
+    @classmethod
+    def __get_pydantic_json_schema__(
+        cls,
+        _schema: core_schema.CoreSchema,
+        handler: GetJsonSchemaHandler,
+    ) -> dict[str, Any]:
+        """Generate JSON schema for OpenAPI / FastAPI compatibility.
+        All string-primitive types serialise as plain strings.
+        Returns:
+            JSON schema describing the type as a string.
+        """
+        return {"type": "string"}
+    @override
     def __eq__(self, other: object) -> bool:
         """Check equality based on string representation.
@@ -160,6 +184,7 @@ class PydanticStringPrimitive:
             return NotImplemented
         return str(self) == str(other)
+    @override
     def __hash__(self) -> int:
         """Return hash based on string representation."""
         return hash(str(self))

openstef_core-4.1.0/src/openstef_core/constants.py ADDED Viewed

@@ -0,0 +1,10 @@
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
+#
+# SPDX-License-Identifier: MPL-2.0
+"""Shared constants for the openstef_core package."""
+LIANDER_DATASET_REPO_ID = "OpenSTEF/liander2024-energy-forecasting-benchmark"
+__all__ = ["LIANDER_DATASET_REPO_ID"]

{openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/datasets/__init__.py RENAMED Viewed

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
 #
 # SPDX-License-Identifier: MPL-2.0
@@ -9,6 +9,7 @@ It includes both simple time series datasets and versioned datasets that track d
 over time, enabling realistic backtesting and training and forecasting.
 The module supports:
     - Regular time series with consistent sampling intervals
     - Versioned time series that track when data became available
     - Validated datasets with domain-specific constraints
@@ -19,6 +20,7 @@ The module supports:
 from openstef_core.datasets.timeseries_dataset import TimeSeriesDataset, validate_horizons_present
 from openstef_core.datasets.validated_datasets import (
     EnergyComponentDataset,
+    EnsembleForecastDataset,
     ForecastDataset,
     ForecastInputDataset,
 )
@@ -26,6 +28,7 @@ from openstef_core.datasets.versioned_timeseries_dataset import VersionedTimeSer
 __all__ = [
     "EnergyComponentDataset",
+    "EnsembleForecastDataset",
     "ForecastDataset",
     "ForecastInputDataset",
     "TimeSeriesDataset",

{openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/datasets/mixins.py RENAMED Viewed

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
 #
 # SPDX-License-Identifier: MPL-2.0
@@ -9,6 +9,7 @@ datasets in OpenSTEF. Protocols enable type checking and documentation of expect
 behavior without requiring inheritance.
 Key protocols:
     - TimeSeries: Core interface for all time series datasets with filtering and versioning
     - DatasetMixin: Interface for dataset persistence operations
 """
@@ -36,6 +37,7 @@ class TimeSeriesMixin(Protocol):
     the dataset's temporal index, and filtering/versioning capabilities.
     Classes implementing this interface must provide:
         - Access to the datetime index
         - Sample interval information
         - Feature names list
@@ -158,6 +160,7 @@ class DatasetMixin(Protocol):
     and reconstructed exactly as they were saved.
     Classes implementing this mixin must:
     - Save all data and metadata necessary for complete reconstruction
     - Store metadata in parquet file attributes using attrs
     - Handle missing metadata gracefully with sensible defaults when loading

{openstef_core-4.0.0.dev1 → openstef_core-4.1.0}/src/openstef_core/datasets/timeseries_dataset.py RENAMED Viewed

@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
 #
 # SPDX-License-Identifier: MPL-2.0
@@ -33,7 +33,7 @@ from openstef_core.utils.pandas import unsafe_sorted_range_slice_idxs
 _logger = logging.getLogger(__name__)
-class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin):  # noqa: PLR0904 - important utility class, allow too many public methods
+class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin):
     """A time series dataset with regular sampling intervals and optional versioning.
     This class represents time series data with a consistent sampling interval
@@ -42,11 +42,18 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin):  # noqa: PLR0904 - impor
     over time through either a horizon column or an available_at column.
     The dataset automatically detects versioning:
     - If a horizon column exists, data is versioned by forecast horizon
     - If an available_at column exists, data is versioned by availability time
     - Otherwise, data is treated as a regular time series
+    Columns whose names start with a double underscore (``__``) are treated as
+    internal/system columns: they are kept in ``data`` so transforms can pass
+    them along, but are excluded from ``feature_names`` so feature-aware
+    transforms ignore them.
     The dataset guarantees:
         - Data is sorted by timestamp in ascending order
         - Consistent sampling interval across all data points
         - DateTime index for temporal operations
@@ -57,7 +64,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin):  # noqa: PLR0904 - impor
         available_at_column: Name of the column storing availability times (if versioned).
     Example:
-        Create a simple time series dataset:
+        Create a simple time series dataset
         >>> import pandas as pd
         >>> from datetime import timedelta
@@ -71,7 +78,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin):  # noqa: PLR0904 - impor
         >>> dataset.is_versioned
         False
-        Create a versioned dataset with horizons:
+        Create a versioned dataset with horizons
         >>> data_with_horizon = pd.DataFrame({
         ...     'load': [100, 120],
@@ -104,10 +111,12 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin):  # noqa: PLR0904 - impor
         horizon_column: str = "horizon",
         available_at_column: str = "available_at",
         is_sorted: bool = False,
+        check_frequency: bool = False,
     ) -> None:
         """Initialize a time series dataset.
         The dataset automatically detects whether it's versioned based on column presence:
         - If horizon_column exists: versioned by forecast horizon
         - If available_at_column exists: versioned by availability time
         - Otherwise: regular time series
@@ -118,10 +127,12 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin):  # noqa: PLR0904 - impor
             horizon_column: Name of the column storing forecast horizons.
             available_at_column: Name of the column storing availability times.
             is_sorted: Whether the data is sorted by timestamp.
+            check_frequency: Whether to check that the data frequency matches sample_interval.
         Raises:
             TypeError: If data index is not a pandas DatetimeIndex or if versioning
                 columns have incorrect types.
+            ValueError: If data frequency does not match sample_interval.
         """
         if not isinstance(data.index, pd.DatetimeIndex):
             raise TypeError("Data index must be a pandas DatetimeIndex.")
@@ -130,9 +141,15 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin):  # noqa: PLR0904 - impor
         self.horizon_column = horizon_column
         self.available_at_column = available_at_column
         self._sample_interval = sample_interval
-        self._internal_columns = set()
+        self._internal_columns = {col for col in data.columns if col.startswith("__")}
         data.index.name = self.index_name
+        # Check input data frequency matches sample_interval, only if there are enough data points to infer frequency
+        minimum_required_length = 2
+        if check_frequency and len(data) >= minimum_required_length and not self.frequency_matches(data.index):
+            msg = f"Data frequency does not match the sample_interval ({sample_interval})."
+            raise ValueError(msg)
         if self.horizon_column in data.columns:
             validate_timedelta_column(data[self.horizon_column])
             self._version_column = self.horizon_column
@@ -147,7 +164,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin):  # noqa: PLR0904 - impor
             self._horizons = None
         else:
             self._version_column = None
-            self._feature_names = data.columns.to_list()
+            self._feature_names = [col for col in data.columns if col not in self._internal_columns]
             self._horizons = None
         # Ensure invariants: data is sorted by timestamp
@@ -257,7 +274,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin):  # noqa: PLR0904 - impor
         if available_at_series is None:
             return self
-        cutoff = self.index.floor("D") - pd.Timedelta(available_at.lag_from_day)
+        cutoff = available_at.apply_index(self.index)
         data_filtered = self.data[available_at_series <= cutoff]
         return self._copy_with_data(data=data_filtered)
@@ -287,7 +304,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin):  # noqa: PLR0904 - impor
         Returns:
             New dataset containing only rows with timestamps in the mask.
         """
-        data_filtered = self.data.loc[self.index.isin(mask)]  # pyright: ignore[reportUnknownMemberType]
+        data_filtered = self.data.loc[self.index.isin(mask)]
         return self._copy_with_data(data=data_filtered)
@@ -303,7 +320,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin):  # noqa: PLR0904 - impor
         if self.horizons is None:
             return self
-        data_selected = self.data[self.lead_time_series == horizon.value]
+        data_selected = cast(pd.DataFrame, self.data[self.lead_time_series == horizon.value])
         return self._copy_with_data(data=data_selected)
     def to_pandas(self) -> pd.DataFrame:
@@ -378,7 +395,7 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin):  # noqa: PLR0904 - impor
         available_at_column: str = "available_at",
         horizon_column: str = "horizon",
     ) -> Self:
-        df = pd.read_parquet(path=path)  # pyright: ignore[reportUnknownMemberType]
+        df = pd.read_parquet(path=path)
         if not isinstance(df.index, pd.DatetimeIndex):
             if timestamp_column not in df.columns:
                 raise TimeSeriesValidationError(
@@ -443,6 +460,44 @@ class TimeSeriesDataset(TimeSeriesMixin, DatasetMixin):  # noqa: PLR0904 - impor
             is_sorted=is_sorted,
         )
+    @staticmethod
+    def _infer_frequency(index: pd.DatetimeIndex) -> pd.Timedelta:
+        """Infer the frequency of a pandas DatetimeIndex if the freq attribute is not set.
+        This method calculates the most common time difference between consecutive timestamps,
+        which is more permissive of missing chunks of data than the pandas infer_freq method.
+        Args:
+            index (pd.DatetimeIndex): The datetime index to infer the frequency from.
+        Returns:
+            pd.Timedelta: The inferred frequency as a pandas Timedelta.
+        Raises:
+            ValueError: If the index has fewer than 2 timestamps.
+        """
+        minimum_required_length = 2
+        if len(index) < minimum_required_length:
+            raise ValueError("Cannot infer frequency from an index with fewer than 2 timestamps.")
+        # Calculate the differences between consecutive timestamps
+        deltas = index.to_series().drop_duplicates().sort_values().diff().dropna()
+        # Find the most common difference
+        return deltas.mode().iloc[0]
+    def frequency_matches(self, index: pd.DatetimeIndex) -> bool:
+        """Check if the frequency of the data matches the model frequency.
+        Args:
+            index (pd.DatetimeIndex): The data to check.
+        Returns:
+            bool: True if the frequencies match, False otherwise.
+        """
+        input_sample_interval = self._infer_frequency(index) if index.freq is None else index.freq
+        return input_sample_interval == self.sample_interval
 def validate_horizons_present(dataset: TimeSeriesDataset, horizons: list[LeadTime]) -> None:
     """Validate that the specified forecast horizons are present in the dataset.
@@ -457,8 +512,8 @@ def validate_horizons_present(dataset: TimeSeriesDataset, horizons: list[LeadTim
     if dataset.horizons is None and len(horizons) == 1:
         return  # Non-versioned dataset can satisfy single-horizon requests
-    required_horizons = set(horizons or [])
-    missing_horizons = [h for h in horizons if h not in required_horizons]
+    available_horizons = set(dataset.horizons or [])
+    missing_horizons = [h for h in horizons if h not in available_horizons]
     if missing_horizons:
         raise TimeSeriesValidationError("Missing forecast horizons: " + ", ".join(map(str, missing_horizons)))

openstef-core 4.0.0.dev1__tar.gz → 4.1.0__tar.gz

openstef-core 4.0.0.dev1__tar.gz → 4.1.0__tar.gz