openstef-core 4.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openstef_core/__init__.py +4 -0
- openstef_core/base_model.py +205 -0
- openstef_core/constants.py +10 -0
- openstef_core/datasets/__init__.py +37 -0
- openstef_core/datasets/mixins.py +221 -0
- openstef_core/datasets/timeseries_dataset.py +521 -0
- openstef_core/datasets/validated_datasets.py +616 -0
- openstef_core/datasets/validation.py +150 -0
- openstef_core/datasets/versioned_timeseries_dataset.py +338 -0
- openstef_core/exceptions.py +217 -0
- openstef_core/mixins/__init__.py +24 -0
- openstef_core/mixins/param_ranges.py +131 -0
- openstef_core/mixins/predictor.py +307 -0
- openstef_core/mixins/stateful.py +132 -0
- openstef_core/mixins/transform.py +184 -0
- openstef_core/testing.py +306 -0
- openstef_core/transforms/__init__.py +17 -0
- openstef_core/transforms/dataset_transforms.py +82 -0
- openstef_core/types.py +453 -0
- openstef_core/utils/__init__.py +34 -0
- openstef_core/utils/datetime.py +109 -0
- openstef_core/utils/invariants.py +31 -0
- openstef_core/utils/itertools.py +108 -0
- openstef_core/utils/multiprocessing.py +77 -0
- openstef_core/utils/pandas.py +105 -0
- openstef_core/utils/pydantic.py +59 -0
- openstef_core-4.0.0.dist-info/METADATA +35 -0
- openstef_core-4.0.0.dist-info/RECORD +29 -0
- openstef_core-4.0.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: MPL-2.0
|
|
4
|
+
|
|
5
|
+
"""Configuration utilities for OpenSTEF Beam.
|
|
6
|
+
|
|
7
|
+
This module provides a `BaseConfig` class extending Pydantic's `BaseModel`
|
|
8
|
+
with convenience helpers for reading from and writing to YAML files. It also
|
|
9
|
+
exposes two helper functions `write_yaml_config` and `read_yaml_config` that
|
|
10
|
+
operate on arbitrary config instances or Pydantic models / adapters.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Annotated, Any, Self
|
|
15
|
+
|
|
16
|
+
import yaml
|
|
17
|
+
from pydantic import BaseModel as PydanticBaseModel
|
|
18
|
+
from pydantic import (
|
|
19
|
+
BeforeValidator,
|
|
20
|
+
ConfigDict,
|
|
21
|
+
GetCoreSchemaHandler,
|
|
22
|
+
GetJsonSchemaHandler,
|
|
23
|
+
TypeAdapter,
|
|
24
|
+
ValidationInfo,
|
|
25
|
+
)
|
|
26
|
+
from pydantic_core import core_schema
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class BaseModel(PydanticBaseModel):
    """Base model class for OpenSTEF components.

    Subclasses inherit the following Pydantic configuration:

    - ``protected_namespaces=()``: no field-name prefixes are treated as protected.
    - ``arbitrary_types_allowed=True``: fields may use types Pydantic has no built-in schema for.
    - ``ser_json_inf_nan="null"``: infinite and NaN floats are written as ``null`` in JSON output.
    """

    model_config = ConfigDict(protected_namespaces=(), arbitrary_types_allowed=True, ser_json_inf_nan="null")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class BaseConfig(PydanticBaseModel):
    """Base configuration model.

    It configures Pydantic model for safe YAML serialization / deserialization:
    unknown keys in the input are ignored (``extra="ignore"``) and fields must use
    types Pydantic can build a schema for (``arbitrary_types_allowed=False``).
    """

    model_config = ConfigDict(
        protected_namespaces=(),
        extra="ignore",
        arbitrary_types_allowed=False,
    )

    @classmethod
    def read_yaml(cls, path: Path) -> Self:
        """Create an instance from a YAML file.

        Args:
            path: Path to the YAML file to read.

        Returns:
            An instance of the config class populated with the file contents.
        """
        return read_yaml_config(path, class_type=cls)

    def write_yaml(self, path: Path) -> None:
        """Write this configuration to a YAML file.

        Args:
            path: Destination path for the YAML file (will be overwritten).
        """
        write_yaml_config(self, path)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def write_yaml_config(config: BaseConfig, path: Path) -> None:
    """Write the config to a YAML file.

    The config is first dumped with ``model_dump(mode="json")`` so that only
    JSON-compatible values reach the YAML serializer.

    Args:
        config: The configuration object to serialize.
        path: Destination path for the YAML file (will be overwritten).

    Example:
        >>> from pathlib import Path
        >>> class MyConfig(BaseConfig):
        ...     foo: int
        >>> cfg = MyConfig(foo=123)
        >>> write_yaml_config(cfg, Path("/tmp/test.yaml"))
    """
    with path.open("w", encoding="utf-8") as f:
        yaml.dump(config.model_dump(mode="json"), f, allow_unicode=True)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def read_yaml_config[T: BaseConfig, U](path: Path, class_type: type[T] | TypeAdapter[U]) -> T | U:
|
|
88
|
+
"""Read a configuration object from a YAML file.
|
|
89
|
+
|
|
90
|
+
This function supports two kinds of targets:
|
|
91
|
+
|
|
92
|
+
* A subclass of `BaseConfig`, in which case Pydantic's `model_validate` is used.
|
|
93
|
+
* A `TypeAdapter` instance for more advanced / non-`BaseModel` schema validation.
|
|
94
|
+
|
|
95
|
+
Args:
|
|
96
|
+
path: Path to the YAML file to read.
|
|
97
|
+
class_type: The target type (a `BaseConfig` subclass) or a `TypeAdapter`.
|
|
98
|
+
|
|
99
|
+
Returns:
|
|
100
|
+
A validated configuration instance (either ``T`` or ``U`` depending on
|
|
101
|
+
the provided ``class_type``).
|
|
102
|
+
"""
|
|
103
|
+
with path.open("r", encoding="utf-8") as f:
|
|
104
|
+
data = yaml.safe_load(f)
|
|
105
|
+
|
|
106
|
+
if isinstance(class_type, TypeAdapter):
|
|
107
|
+
return class_type.validate_python(data)
|
|
108
|
+
|
|
109
|
+
return class_type.model_validate(data)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
class PydanticStringPrimitive:
    """Base class for Pydantic-compatible types with string serialization.

    Subclasses implement ``__str__`` and ``from_string``. In return they get
    validation (from an existing instance or from a string), serialization
    through ``__str__``, a plain ``string`` JSON schema, and equality / hashing
    based on the string representation.
    """

    def __str__(self) -> str:
        """Convert to string representation."""
        raise NotImplementedError("Subclasses must implement __str__")

    @classmethod
    def from_string(cls, s: str) -> Self:
        """Create an instance from string representation."""
        raise NotImplementedError("Subclasses must implement from_string")

    @classmethod
    def validate(cls, v: Any, _info: ValidationInfo | None = None) -> Self:  # noqa: ANN401
        """Validate and convert input to this type.

        Args:
            v: Input value to validate.
            _info: Additional validation info (unused).

        Returns:
            Validated instance of this type.

        Raises:
            ValueError: If input cannot be converted to this type.
        """
        if isinstance(v, cls):
            return v
        if isinstance(v, str):
            return cls.from_string(v)

        # Subclasses should handle their specific types
        error_message = f"Cannot convert {v} to {cls.__name__}"
        raise ValueError(error_message)

    @classmethod
    def __get_pydantic_core_schema__(
        cls, _source_type: type[Any], _handler: GetCoreSchemaHandler
    ) -> core_schema.CoreSchema:
        """Define Pydantic validation and serialization behavior.

        Validation is delegated to ``validate``; serialization calls ``__str__``.

        Returns:
            Core schema for Pydantic validation and serialization.
        """
        return core_schema.with_info_plain_validator_function(
            function=cls.validate, serialization=core_schema.plain_serializer_function_ser_schema(cls.__str__)
        )

    @classmethod
    def __get_pydantic_json_schema__(  # noqa: PLW3201
        cls,
        _schema: core_schema.CoreSchema,
        handler: GetJsonSchemaHandler,
    ) -> dict[str, Any]:
        """Generate JSON schema for OpenAPI / FastAPI compatibility.

        All string-primitive types serialise as plain strings.

        Returns:
            JSON schema describing the type as a string.
        """
        return {"type": "string"}

    def __eq__(self, other: object) -> bool:
        """Check equality based on string representation.

        Returns:
            True if both objects have the same string representation, False otherwise.
            ``NotImplemented`` is returned when ``other`` is not an instance of this
            object's class.
        """
        if not isinstance(other, self.__class__):
            return NotImplemented
        return str(self) == str(other)

    def __hash__(self) -> int:
        """Return hash based on string representation."""
        return hash(str(self))
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def _convert_none_to_nan(v: float | None) -> float:
|
|
191
|
+
if v is None:
|
|
192
|
+
return float("nan")
|
|
193
|
+
return v
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
# Float field type whose validator replaces a ``None`` input with NaN before float validation runs.
FloatOrNan = Annotated[float, BeforeValidator(_convert_none_to_nan)]

__all__ = [
    "BaseConfig",
    "BaseModel",
    "FloatOrNan",
    "PydanticStringPrimitive",
    "read_yaml_config",
    "write_yaml_config",
]
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: MPL-2.0
|
|
4
|
+
|
|
5
|
+
"""Shared constants for the openstef_core package."""
|
|
6
|
+
|
|
7
|
+
# Repository identifier of the Liander 2024 energy forecasting benchmark dataset.
# NOTE(review): presumably an "<organisation>/<dataset>" id on a dataset hub; confirm against the code that loads it.
LIANDER_DATASET_REPO_ID = "OpenSTEF/liander2024-energy-forecasting-benchmark"


__all__ = ["LIANDER_DATASET_REPO_ID"]
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: MPL-2.0
|
|
4
|
+
|
|
5
|
+
"""Time series datasets and versioned data access.
|
|
6
|
+
|
|
7
|
+
This module provides core data structures for handling time series data in OpenSTEF forecasting.
|
|
8
|
+
It includes both simple time series datasets and versioned datasets that track data availability
|
|
9
|
+
over time, enabling realistic backtesting, training, and forecasting.
|
|
10
|
+
|
|
11
|
+
The module supports:
|
|
12
|
+
|
|
13
|
+
- Regular time series with consistent sampling intervals
|
|
14
|
+
- Versioned time series that track when data became available
|
|
15
|
+
- Validated datasets with domain-specific constraints
|
|
16
|
+
- Data transformations and validation utilities
|
|
17
|
+
- Feature concatenation and horizon restriction operations
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from openstef_core.datasets.timeseries_dataset import TimeSeriesDataset, validate_horizons_present
|
|
21
|
+
from openstef_core.datasets.validated_datasets import (
|
|
22
|
+
EnergyComponentDataset,
|
|
23
|
+
EnsembleForecastDataset,
|
|
24
|
+
ForecastDataset,
|
|
25
|
+
ForecastInputDataset,
|
|
26
|
+
)
|
|
27
|
+
from openstef_core.datasets.versioned_timeseries_dataset import VersionedTimeSeriesDataset
|
|
28
|
+
|
|
29
|
+
# Public API of the datasets package, re-exported from its submodules.
__all__ = [
    "EnergyComponentDataset",
    "EnsembleForecastDataset",
    "ForecastDataset",
    "ForecastInputDataset",
    "TimeSeriesDataset",
    "VersionedTimeSeriesDataset",
    "validate_horizons_present",
]
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: MPL-2.0
|
|
4
|
+
|
|
5
|
+
"""Protocol definitions for time series dataset interfaces.
|
|
6
|
+
|
|
7
|
+
This module provides protocol classes that define the core interfaces for time series
|
|
8
|
+
datasets in OpenSTEF. Protocols enable type checking and documentation of expected
|
|
9
|
+
behavior without requiring inheritance.
|
|
10
|
+
|
|
11
|
+
Key protocols:
|
|
12
|
+
|
|
13
|
+
- TimeSeriesMixin: Core interface for all time series datasets with filtering and versioning
|
|
14
|
+
- DatasetMixin: Interface for dataset persistence operations
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from abc import abstractmethod
|
|
18
|
+
from collections.abc import Callable
|
|
19
|
+
from datetime import datetime, timedelta
|
|
20
|
+
from typing import TYPE_CHECKING, Concatenate, Protocol, Self
|
|
21
|
+
|
|
22
|
+
from pydantic import FilePath
|
|
23
|
+
|
|
24
|
+
from openstef_core.types import AvailableAt, LeadTime
|
|
25
|
+
|
|
26
|
+
if TYPE_CHECKING:
|
|
27
|
+
import pandas as pd
|
|
28
|
+
|
|
29
|
+
from openstef_core.datasets.timeseries_dataset import TimeSeriesDataset
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class TimeSeriesMixin(Protocol):
    """Protocol defining the interface for time series datasets.

    This interface defines the essential operations that all time series datasets
    must implement. It provides access to feature metadata, temporal properties,
    the dataset's temporal index, and filtering/versioning capabilities.

    Classes implementing this interface must provide:

    - Access to the datetime index
    - Sample interval information
    - Feature names list
    - Versioning status indicator
    - Filtering methods for time ranges, availability, and lead times
    - Version selection for point-in-time data reconstruction
    """

    @property
    @abstractmethod
    def index(self) -> "pd.DatetimeIndex":
        """Get the datetime index of the dataset.

        Returns:
            DatetimeIndex representing all timestamps in the dataset.
        """

    @property
    @abstractmethod
    def sample_interval(self) -> timedelta:
        """Get the fixed time interval between consecutive data points.

        Returns:
            The sample interval as a timedelta.
        """

    @property
    @abstractmethod
    def feature_names(self) -> list[str]:
        """Get the names of all available features in the dataset.

        Returns:
            List of feature names, excluding metadata columns like timestamp,
            available_at, or horizon.
        """

    @property
    @abstractmethod
    def is_versioned(self) -> bool:
        """Check if the dataset tracks data availability over time.

        Returns:
            True if the dataset is versioned (tracks availability via horizon
            or available_at columns), False for regular time series.
        """

    @abstractmethod
    def filter_by_range(self, start: datetime | None = None, end: datetime | None = None) -> Self:
        """Filter the dataset to include only data within the specified time range.

        Args:
            start: Inclusive start time of the range. If None, no start boundary applied.
            end: Exclusive end time of the range. If None, no end boundary applied.

        Returns:
            New instance containing only data within [start, end).
        """

    @abstractmethod
    def filter_by_available_before(self, available_before: datetime) -> Self:
        """Filter to include only data available before the specified timestamp.

        Args:
            available_before: Cutoff time for data availability.

        Returns:
            New instance containing only data available before the cutoff.
        """

    @abstractmethod
    def filter_by_available_at(self, available_at: AvailableAt) -> Self:
        """Filter based on realistic daily data availability constraints.

        Args:
            available_at: Specification defining when data becomes available.

        Returns:
            New instance with data filtered by availability pattern.
        """

    @abstractmethod
    def filter_by_lead_time(self, lead_time: LeadTime) -> Self:
        """Filter to include only data available at or longer than the specified lead time.

        Args:
            lead_time: Minimum time gap required between data availability and timestamp.

        Returns:
            New instance containing only data available with the required lead time.
        """

    @abstractmethod
    def select_version(self) -> "TimeSeriesDataset":
        """Select a specific version of the dataset based on data availability.

        Creates a point-in-time snapshot by selecting the latest available version
        for each timestamp. Essential for preventing lookahead bias in backtesting.

        Returns:
            TimeSeriesDataset containing the selected version of the data.
        """

    def calculate_time_coverage(self) -> timedelta:
        """Calculates the total time span covered by the dataset.

        This method computes the total duration represented by the dataset
        based on its unique timestamps and sample interval: the number of
        distinct timestamps in ``index`` multiplied by ``sample_interval``.

        Returns:
            timedelta: Total time coverage of the dataset.
        """
        return len(self.index.unique()) * self.sample_interval
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
class DatasetMixin(Protocol):
|
|
156
|
+
"""Abstract base class for dataset persistence operations.
|
|
157
|
+
|
|
158
|
+
This mixin defines the interface for saving and loading datasets to/from
|
|
159
|
+
parquet files. It ensures datasets can be persisted with all their metadata
|
|
160
|
+
and reconstructed exactly as they were saved.
|
|
161
|
+
|
|
162
|
+
Classes implementing this mixin must:
|
|
163
|
+
|
|
164
|
+
- Save all data and metadata necessary for complete reconstruction
|
|
165
|
+
- Store metadata in parquet file attributes using attrs
|
|
166
|
+
- Handle missing metadata gracefully with sensible defaults when loading
|
|
167
|
+
|
|
168
|
+
See Also:
|
|
169
|
+
TimeSeriesDataset: Implementation for standard time series datasets.
|
|
170
|
+
VersionedTimeSeriesDataset: Implementation for versioned dataset segments.
|
|
171
|
+
"""
|
|
172
|
+
|
|
173
|
+
@abstractmethod
|
|
174
|
+
def to_parquet(self, path: FilePath) -> None:
|
|
175
|
+
"""Save the dataset to a parquet file.
|
|
176
|
+
|
|
177
|
+
Stores both the dataset's data and all necessary metadata for complete
|
|
178
|
+
reconstruction. Metadata should be stored in the parquet file's attrs
|
|
179
|
+
dictionary.
|
|
180
|
+
|
|
181
|
+
Args:
|
|
182
|
+
path: File path where the dataset should be saved.
|
|
183
|
+
|
|
184
|
+
See Also:
|
|
185
|
+
read_parquet: Counterpart method for loading datasets.
|
|
186
|
+
"""
|
|
187
|
+
|
|
188
|
+
@classmethod
|
|
189
|
+
@abstractmethod
|
|
190
|
+
def read_parquet(cls, path: FilePath) -> Self:
|
|
191
|
+
"""Load a dataset from a parquet file.
|
|
192
|
+
|
|
193
|
+
Reconstructs a dataset from a parquet file created with to_parquet,
|
|
194
|
+
including all data and metadata. Should handle missing metadata
|
|
195
|
+
gracefully with sensible defaults.
|
|
196
|
+
|
|
197
|
+
Args:
|
|
198
|
+
path: Path to the parquet file to load.
|
|
199
|
+
|
|
200
|
+
Returns:
|
|
201
|
+
New dataset instance reconstructed from the file.
|
|
202
|
+
|
|
203
|
+
See Also:
|
|
204
|
+
to_parquet: Counterpart method for saving datasets.
|
|
205
|
+
"""
|
|
206
|
+
|
|
207
|
+
def pipe[T, **P](self, func: Callable[Concatenate[Self, P], T], *args: P.args, **kwargs: P.kwargs) -> T:
|
|
208
|
+
"""Applies a function to the dataset and returns the result.
|
|
209
|
+
|
|
210
|
+
This method allows for functional-style transformations and operations
|
|
211
|
+
on the dataset, enabling method chaining and cleaner code.
|
|
212
|
+
|
|
213
|
+
Args:
|
|
214
|
+
func: A callable that takes the dataset instance and returns a value of type T.
|
|
215
|
+
*args: Positional arguments to pass to the function.
|
|
216
|
+
**kwargs: Keyword arguments to pass to the function.
|
|
217
|
+
|
|
218
|
+
Returns:
|
|
219
|
+
The result of applying the function to the dataset.
|
|
220
|
+
"""
|
|
221
|
+
return func(self, *args, **kwargs)
|