openstef 3.4.56__py3-none-any.whl → 4.0.0a3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openstef-4.0.0a3.dist-info/METADATA +177 -0
- openstef-4.0.0a3.dist-info/RECORD +4 -0
- {openstef-3.4.56.dist-info → openstef-4.0.0a3.dist-info}/WHEEL +1 -2
- openstef/__init__.py +0 -14
- openstef/__main__.py +0 -3
- openstef/app_settings.py +0 -19
- openstef/data/NL_terrestrial_radiation.csv +0 -25585
- openstef/data/NL_terrestrial_radiation.csv.license +0 -3
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z +0 -0
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z.license +0 -3
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md +0 -18
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md.license +0 -3
- openstef/data/dutch_holidays.csv +0 -1759
- openstef/data/dutch_holidays.csv.license +0 -3
- openstef/data/pv_single_coefs.csv +0 -601
- openstef/data/pv_single_coefs.csv.license +0 -3
- openstef/data_classes/__init__.py +0 -3
- openstef/data_classes/data_prep.py +0 -99
- openstef/data_classes/model_specifications.py +0 -30
- openstef/data_classes/prediction_job.py +0 -135
- openstef/data_classes/split_function.py +0 -97
- openstef/enums.py +0 -140
- openstef/exceptions.py +0 -74
- openstef/feature_engineering/__init__.py +0 -3
- openstef/feature_engineering/apply_features.py +0 -138
- openstef/feature_engineering/bidding_zone_to_country_mapping.py +0 -106
- openstef/feature_engineering/cyclic_features.py +0 -161
- openstef/feature_engineering/data_preparation.py +0 -152
- openstef/feature_engineering/feature_adder.py +0 -206
- openstef/feature_engineering/feature_applicator.py +0 -202
- openstef/feature_engineering/general.py +0 -141
- openstef/feature_engineering/holiday_features.py +0 -231
- openstef/feature_engineering/lag_features.py +0 -165
- openstef/feature_engineering/missing_values_transformer.py +0 -141
- openstef/feature_engineering/rolling_features.py +0 -58
- openstef/feature_engineering/weather_features.py +0 -492
- openstef/metrics/__init__.py +0 -3
- openstef/metrics/figure.py +0 -303
- openstef/metrics/metrics.py +0 -486
- openstef/metrics/reporter.py +0 -222
- openstef/model/__init__.py +0 -3
- openstef/model/basecase.py +0 -82
- openstef/model/confidence_interval_applicator.py +0 -242
- openstef/model/fallback.py +0 -77
- openstef/model/metamodels/__init__.py +0 -3
- openstef/model/metamodels/feature_clipper.py +0 -90
- openstef/model/metamodels/grouped_regressor.py +0 -222
- openstef/model/metamodels/missing_values_handler.py +0 -138
- openstef/model/model_creator.py +0 -214
- openstef/model/objective.py +0 -426
- openstef/model/objective_creator.py +0 -65
- openstef/model/regressors/__init__.py +0 -3
- openstef/model/regressors/arima.py +0 -197
- openstef/model/regressors/custom_regressor.py +0 -64
- openstef/model/regressors/dazls.py +0 -116
- openstef/model/regressors/flatliner.py +0 -95
- openstef/model/regressors/gblinear_quantile.py +0 -334
- openstef/model/regressors/lgbm.py +0 -29
- openstef/model/regressors/linear.py +0 -90
- openstef/model/regressors/linear_quantile.py +0 -305
- openstef/model/regressors/regressor.py +0 -114
- openstef/model/regressors/xgb.py +0 -52
- openstef/model/regressors/xgb_multioutput_quantile.py +0 -261
- openstef/model/regressors/xgb_quantile.py +0 -228
- openstef/model/serializer.py +0 -431
- openstef/model/standard_deviation_generator.py +0 -81
- openstef/model_selection/__init__.py +0 -3
- openstef/model_selection/model_selection.py +0 -311
- openstef/monitoring/__init__.py +0 -3
- openstef/monitoring/performance_meter.py +0 -92
- openstef/monitoring/teams.py +0 -203
- openstef/pipeline/__init__.py +0 -3
- openstef/pipeline/create_basecase_forecast.py +0 -133
- openstef/pipeline/create_component_forecast.py +0 -168
- openstef/pipeline/create_forecast.py +0 -171
- openstef/pipeline/optimize_hyperparameters.py +0 -317
- openstef/pipeline/train_create_forecast_backtest.py +0 -163
- openstef/pipeline/train_model.py +0 -561
- openstef/pipeline/utils.py +0 -52
- openstef/postprocessing/__init__.py +0 -3
- openstef/postprocessing/postprocessing.py +0 -275
- openstef/preprocessing/__init__.py +0 -3
- openstef/preprocessing/preprocessing.py +0 -42
- openstef/settings.py +0 -15
- openstef/tasks/__init__.py +0 -3
- openstef/tasks/calculate_kpi.py +0 -324
- openstef/tasks/create_basecase_forecast.py +0 -118
- openstef/tasks/create_components_forecast.py +0 -162
- openstef/tasks/create_forecast.py +0 -145
- openstef/tasks/create_solar_forecast.py +0 -420
- openstef/tasks/create_wind_forecast.py +0 -80
- openstef/tasks/optimize_hyperparameters.py +0 -135
- openstef/tasks/split_forecast.py +0 -273
- openstef/tasks/train_model.py +0 -224
- openstef/tasks/utils/__init__.py +0 -3
- openstef/tasks/utils/dependencies.py +0 -107
- openstef/tasks/utils/predictionjobloop.py +0 -243
- openstef/tasks/utils/taskcontext.py +0 -160
- openstef/validation/__init__.py +0 -3
- openstef/validation/validation.py +0 -322
- openstef-3.4.56.dist-info/METADATA +0 -154
- openstef-3.4.56.dist-info/RECORD +0 -102
- openstef-3.4.56.dist-info/top_level.txt +0 -1
- /openstef-3.4.56.dist-info/LICENSE → /openstef-4.0.0a3.dist-info/licenses/LICENSE.md +0 -0
```diff
--- a/openstef/feature_engineering/holiday_features.py
+++ /dev/null
@@ -1,231 +0,0 @@
-# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501>
-#
-# SPDX-License-Identifier: MPL-2.0
-"""This module contains all holiday related features."""
-from datetime import datetime, timedelta
-
-import holidays
-import numpy as np
-import pandas as pd
-
-from openstef import PROJECT_ROOT
-
-HOLIDAY_CSV_PATH: str = PROJECT_ROOT / "openstef" / "data" / "dutch_holidays.csv"
-
-
-def generate_holiday_feature_functions(
-    country_code: str = "NL",
-    years: list[int] | None = None,
-    path_to_school_holidays_csv: str = HOLIDAY_CSV_PATH,
-) -> dict:
-    """Generates functions for creating holiday feature.
-
-    This improves forecast accuracy. Examples of features that are
-    added are: 2020-01-01 is 'Nieuwjaarsdag'.
-
-        2022-12-24 - 2023-01-08 is the 'Kerstvakantie'
-        2022-10-15 - 2022-10-23 is the 'HerfstvakantieNoord'
-
-
-    The holidays are based on a manually generated csv file.
-    The information is collected using:
-    https://www.schoolvakanties-nederland.nl/ and the python holiday function
-    The official following official ducth holidays are included untill 2023:
-        - Kerstvakantie
-        - Meivakantie
-        - Herstvakantie
-        - Bouwvak
-        - Zomervakantie
-        - Voorjaarsvakantie
-        - Nieuwjaarsdag
-        - Pasen
-        - Koningsdag
-        - Hemelvaart
-        - Pinksteren
-        - Kerst
-
-
-    The 'Brugdagen' are updated untill dec 2020. (Generated using agenda)
-
-    Args:
-        country_code: Country for which to create holiday features.
-        years: years for which to create holiday features. If None,
-            the last 4 years, the current and next year are used.
-        path_to_school_holidays_csv: Filepath to csv with school holidays.
-
-    NOTE: Dutch holidays csv file is only until January 2026.
-
-    Returns:
-        Dictionary with functions that check if a given date is a holiday, keys
-        consist of "Is" + the_name_of_the_holiday_to_be_checked
-
-    """
-    if years is None:
-        now = datetime.now()
-        years = [
-            now.year - 4,
-            now.year - 3,
-            now.year - 2,
-            now.year - 1,
-            now.year,
-            now.year + 1,
-        ]
-
-    country_holidays = holidays.country_holidays(country_code, years=years)
-
-    # Make holiday function dict
-    holiday_functions = {}
-    # Add check function that includes all holidays of the provided csv
-    holiday_functions.update(
-        {
-            "is_national_holiday": lambda x: np.isin(
-                x.index.date, np.array(list(country_holidays))
-            )
-        }
-    )
-    # Define empty list to keep track of bridgedays
-    bridge_days = []
-    # Loop over list of holidays names
-    for date, holiday_name in sorted(country_holidays.items()):
-        # Define function explicitely to mitigate 'late binding' problem
-        def make_holiday_func(requested_date):
-            return lambda x: np.isin(x.index.date, np.array([requested_date]))
-
-        # Create lag function for each holiday
-        holiday_functions.update(
-            {"is_" + holiday_name.replace(" ", "_").lower(): make_holiday_func(date)}
-        )
-
-        # Check for bridge day
-        holiday_functions, bridge_days = check_for_bridge_day(
-            date, holiday_name, country_code, years, holiday_functions, bridge_days
-        )
-
-    # Add feature function that includes all bridgedays
-    holiday_functions.update(
-        {"is_bridgeday": lambda x: np.isin(x.index.date, np.array(list(bridge_days)))}
-    )
-
-    # Add school holidays if country is NL
-    if country_code == "NL":
-        # Manully generated csv including all dutch schoolholidays for different regions
-        df_holidays = pd.read_csv(path_to_school_holidays_csv, index_col=None)
-        df_holidays["datum"] = pd.to_datetime(df_holidays.datum).apply(
-            lambda x: x.date()
-        )
-
-        # Add check function that includes all holidays of the provided csv
-        holiday_functions.update(
-            {
-                "is_schoolholiday": lambda x: np.isin(
-                    x.index.date, df_holidays.datum.values
-                )
-            }
-        )
-
-        # Loop over list of holidays names
-        for holiday_name in list(set(df_holidays.name)):
-            # Define function explicitely to mitigate 'late binding' problem
-            def make_holiday_func(holidayname=holiday_name):
-                return lambda x: np.isin(
-                    x.index.date,
-                    df_holidays.datum[df_holidays.name == holidayname].values,
-                )
-
-            # Create lag function for each holiday
-            holiday_functions.update(
-                {
-                    "is_"
-                    + holiday_name.replace(" ", "_").lower(): make_holiday_func(
-                        holidayname=holiday_name
-                    )
-                }
-            )
-
-    return holiday_functions
-
-
-# Check for bridgedays
-def check_for_bridge_day(
-    date: datetime,
-    holiday_name: str,
-    country: str,
-    years: list,
-    holiday_functions: dict,
-    bridge_days: list,
-) -> tuple[dict, list]:
-    """Checks for bridgedays associated to a specific holiday with date (date).
-
-    Any found bridgedays are appende dto the bridgedays list. Also a specific feature
-    function for the bridgeday is added to the general holidayfuncitons dictionary.
-
-    Args:
-        date: Date of holiday to check for associated bridgedays.
-        holiday_name: Name of the holiday.
-        country: Country for which to detect the bridgedays.
-        years: List of years for which to detect bridgedays.
-        holiday_functions: Dictionary to which the featurefunction has to be appended to in case of a bridgeday.
-        bridge_days: List of bridgedays to which any found bridgedays have to be appended.
-
-    Returns:
-        - Dict with holiday feature functions
-        - List of bridgedays
-
-    """
-    country_holidays = holidays.country_holidays(country, years=years)
-
-    # if the date is a holiday, it is not a bridgeday
-    if date in country_holidays:
-        return holiday_functions, bridge_days
-
-    # Define function explicitely to mitigate 'late binding' problem
-    def make_holiday_func(requested_date):
-        return lambda x: np.isin(x.index.date, np.array([requested_date]))
-
-    # Looking forward: If day after tomorow is a national holiday or
-    # a saturday check if tomorow is not a national holiday
-
-    is_saturday_in_two_days = (date + timedelta(days=2)).weekday() == 5
-    is_holiday_in_two_days = (date + timedelta(days=2)) in country_holidays
-
-    is_holiday_tommorow = (date + timedelta(days=1)) in country_holidays
-    is_weekend_tommorrow = (date + timedelta(days=1)).weekday() in [5, 6]
-
-    if (
-        (is_holiday_in_two_days or is_saturday_in_two_days)
-        and (not is_holiday_tommorow and not is_weekend_tommorrow)
-        and date not in country_holidays
-    ):
-        # Create feature function for each holiday
-        holiday_functions.update(
-            {
-                "is_bridgeday"
-                + holiday_name.replace(" ", "_").lower(): make_holiday_func(
-                    (date + timedelta(days=1))
-                )
-            }
-        )
-        bridge_days.append((date + timedelta(days=1)))
-
-    # Looking backward: If the day before is a national holiday
-    # or a sunday check if yesterday is a national holiday
-    is_saturday_two_days_ago = (date - timedelta(days=2)).weekday() == 6
-    is_holiday_two_days_ago = (date - timedelta(days=2)) in country_holidays
-    is_holiday_yesterday = (date - timedelta(days=1)) in country_holidays
-    is_weekend_yesterday = (date - timedelta(days=1)).weekday() in [5, 6]
-
-    if (is_saturday_two_days_ago or is_holiday_two_days_ago) and (
-        not is_holiday_yesterday and not is_weekend_yesterday
-    ):
-        # Create featurefunction for the bridge function
-        holiday_functions.update(
-            {
-                "is_bridgeday"
-                + holiday_name.replace(" ", "_").lower(): make_holiday_func(
-                    (date - timedelta(days=1))
-                )
-            }
-        )
-        bridge_days.append((date - timedelta(days=1)))
-
-    return holiday_functions, bridge_days
```
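For context, the deleted `generate_holiday_feature_functions` returned a dict mapping feature names (`is_national_holiday`, `is_bridgeday`, `is_<holiday>`, ...) to callables that take a DatetimeIndex-ed object. A minimal usage sketch against the 3.4.56 API; the DataFrame below is illustrative, not part of the package:

```python
import pandas as pd

from openstef.feature_engineering.holiday_features import (
    generate_holiday_feature_functions,
)

# Illustrative data: a quarter-hourly load trace with a DatetimeIndex.
index = pd.date_range("2023-04-25", periods=96, freq="15min")
df = pd.DataFrame({"load": 0.0}, index=index)

# Each returned callable yields a boolean mask aligned with df's dates.
for name, func in generate_holiday_feature_functions(country_code="NL").items():
    df[name] = func(df)

print([c for c in df.columns if c.startswith("is_")])
```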
```diff
--- a/openstef/feature_engineering/lag_features.py
+++ /dev/null
@@ -1,165 +0,0 @@
-# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501>
-#
-# SPDX-License-Identifier: MPL-2.0
-"""This module contains all lag features."""
-import re
-
-import numpy as np
-import pandas as pd
-import scipy.signal
-
-
-def generate_lag_feature_functions(
-    feature_names: list[str] = None, horizon: float = 24.0
-) -> dict:
-    """Creates functions to generate lag features in a dataset.
-
-    Args:
-        feature_names: minute lagtimes that where used during training
-            of the model. If empty a new set will be automatically generated.
-        horizon: Forecast horizon limit in hours.
-
-    Returns:
-        Lag functions.
-
-    Example:
-
-    .. code-block:: py
-
-        lag_functions = generate_lag_functions(data,minute_list,h_ahead)
-
-    """
-    # Use extracted lag features if provided.
-    if feature_names is not None:
-        lag_times_minutes, lag_time_days_list = extract_lag_features(
-            feature_names, horizon
-        )
-    else:
-        # Generate available lag_times if no features are provided
-        lag_times_minutes, lag_time_days_list = generate_trivial_lag_features(horizon)
-
-    # Empty dict to store all generated lag functions
-    lag_functions = {}
-    for minutes in lag_times_minutes:  # Add intraday-lag functions (lags in minutes)
-
-        def func(x, shift=minutes):
-            return x.shift(freq="min", periods=1 * shift)
-
-        new = {"T-" + str(int(minutes)) + "min": func}
-        lag_functions.update(new)
-
-    # Add day lag functions:
-    for day in lag_time_days_list:
-
-        def func(x, shift=day):
-            return x.shift(freq="1d", periods=1 * shift)
-
-        new = {"T-" + str(int(day)) + "d": func}
-        lag_functions.update(new)
-    return lag_functions
-
-
-def extract_lag_features(
-    feature_names: list[str], horizon: float = 24.0
-) -> tuple[list, list]:
-    """Creates a list of lag minutes and a list of lag days that were used during the training of the input model.
-
-    Args:
-        feature_names: All requested lag features
-        horizon: Forecast horizon limit in hours.
-
-    Returns:
-        - List of minute lags that were used as features during training.
-        - List of days lags that were used as features during training.
-
-    """
-    # Prepare empty lists to append on
-    minutes_list = []
-    days_list = []
-
-    for lag_feature in feature_names:
-        # Select the number of days or the number of minutes by matching with a regular expression
-        number_of_minutes = re.search(r"T-(\d+)min", lag_feature)
-        number_of_days = re.search(r"T-(\d+)d", lag_feature)
-
-        # Append to the appropriate list
-        if number_of_minutes is not None:
-            minutes_list.append(int(number_of_minutes[1]))
-        elif number_of_days is not None:
-            days_list.append(int(number_of_days[1]))
-
-    # Discard lag times that are not available for the specified horizon
-    minutes_list = list(set([i for i in minutes_list if i >= horizon * 60]))
-    days_list = list(set([i for i in days_list if i >= horizon / 24]))
-
-    return minutes_list, days_list
-
-
-def generate_trivial_lag_features(horizon: float) -> tuple[list, list]:
-    """Generates relevant lag times for lag feature function creation.
-
-    This function is mostly used during training of models and not during predicting.
-
-    Args:
-        horizon: Forecast horizon limit in hours.
-
-    Returns:
-        - List of minute lags that were used as features during training.
-        - List of days lags that were used as features during training.
-
-    """
-    mindays = min(int(np.ceil(horizon / 24)), 15)
-    lag_time_days_list = list(np.linspace(mindays, 14, 15 - mindays))
-
-    # Make list of trivial lag times
-    trivial_lag_minutes_list = np.linspace(60, 23 * 60, 23).tolist() + [15, 30, 45]
-
-    # Discard lag times that are not available for the specified horizon
-    trivial_lag_times_minutes = list(
-        set([i for i in trivial_lag_minutes_list if i >= horizon * 60])
-    )
-
-    return trivial_lag_times_minutes, lag_time_days_list
-
-
-def generate_non_trivial_lag_times(
-    data: pd.DataFrame, height_threshold: float = 0.1
-) -> list[int]:
-    """Calculate an autocorrelation curve of the load trace.
-
-    This curve is subsequently used to add additional lag times as features.
-
-    Args:
-        data: Dataframe with input data in the form pd.DataFrame(index = datetime,
-            columns = [label, predictor_1,..., predictor_n])
-        height_threshold: Minimal autocorrelation value to be recognized as a peak.
-
-    Returns:
-        Aditional non-trivial minute lags
-
-    """
-
-    def autocorr(x: np.array, lags: range) -> np.array:
-        """Make an autocorrelation curve."""
-        mean = x.mean()
-        var = np.var(x)
-        xp = x - mean
-        corr = np.correlate(xp, xp, "full")[len(x) - 1 :] / var / len(x)
-
-        return corr[: len(lags)]
-
-    try:
-        # Get rid of nans as the autocorrelation handles these values badly
-        data = data[data.columns[0]].dropna()  # First column contains the load
-        # Get autocorrelation curve
-        y = autocorr(data, range(10000))
-        # Determine the peaks (positive and negative) larger than a specified threshold
-        peaks = scipy.signal.find_peaks(np.abs(y), height=height_threshold)
-        peaks = peaks[0]
-        # Convert peaks to lag times in minutes
-        peaks = peaks[peaks < (60 * 4)]
-        additional_minute_space = peaks * 15
-    except Exception:
-        return []
-    # Return list of additional minute lags to be procceses by apply features
-    return list(additional_minute_space)
```
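A short sketch of how the deleted lag-feature API was consumed; the series and the 0.25 h horizon below are illustrative:

```python
import pandas as pd

from openstef.feature_engineering.lag_features import generate_lag_feature_functions

# For a 0.25 h horizon every trivial lag is allowed: T-15min ... T-1380min
# plus the day lags T-1d ... T-14d.
lag_functions = generate_lag_feature_functions(horizon=0.25)

index = pd.date_range("2023-01-01", periods=8, freq="15min")
load = pd.Series(range(8), index=index, dtype=float)

# Applying a lag function shifts the series forward in time, so the value
# observed 15 minutes ago becomes available as a feature at each timestamp.
lagged = lag_functions["T-15min"](load)
```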
```diff
--- a/openstef/feature_engineering/missing_values_transformer.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501>
-#
-# SPDX-License-Identifier: MPL-2.0
-from typing import Union, List, Optional
-
-import numpy as np
-import pandas as pd
-from sklearn.impute import SimpleImputer
-from sklearn.preprocessing import FunctionTransformer
-from sklearn.utils.validation import check_array, check_is_fitted
-
-
-class MissingValuesTransformer:
-    """MissingValuesTransformer handles missing values in data by imputing them with a given strategy.
-
-    It also removes columns that are always null from the data.
-
-    """
-
-    in_feature_names: Optional[List[str]] = None
-    _n_in_features: Optional[int] = None
-
-    non_null_feature_names: List[str] = None
-
-    def __init__(
-        self,
-        missing_values: Union[int, float, str, None] = np.nan,
-        imputation_strategy: str = None,
-        fill_value: Union[str, int, float] = None,
-        no_fill_future_values_features: List[str] = None,
-    ):
-        """Initialize missing values handler.
-
-        Args:
-            missing_values: The placeholder for the missing values. All occurrences of
-                `missing_values` will be imputed.
-            imputation_strategy: The imputation strategy to use
-                Can be one of "mean", "median", "most_frequent", "constant" or None.
-            fill_value: When strategy == "constant", fill_value is used to replace all
-                occurrences of missing_values.
-            no_fill_future_values_features: The features for which it does not make sense
-                to fill future values. Rows that contain trailing null values for these
-                features will be removed from the data.
-
-        """
-        self.missing_values = missing_values
-        self.imputation_strategy = imputation_strategy
-        self.fill_value = fill_value
-        self.no_fill_future_values_features = no_fill_future_values_features or []
-        self.is_fitted_ = False
-
-        # Build the proper imputation transformer
-        # - Identity function if strategy is None
-        # - SimpleImputer with the dedicated strategy
-        if self.imputation_strategy is None:
-            self.imputer_ = FunctionTransformer(func=self._identity)
-        else:
-            self.imputer_ = SimpleImputer(
-                missing_values=self.missing_values,
-                strategy=self.imputation_strategy,
-                fill_value=self.fill_value,
-            ).set_output(transform="pandas")
-        self.imputer_._validate_params()
-
-    @staticmethod
-    def _determine_trailing_null_rows(x: pd.DataFrame) -> pd.Series:
-        """Determine rows with trailing null values in a DataFrame."""
-        return ~x.bfill().isnull().any(axis="columns")
-
-    def fit(self, x, y=None):
-        """Fit the imputer on the input data."""
-        _ = check_array(x, force_all_finite="allow-nan")
-        if not isinstance(x, pd.DataFrame):
-            x = pd.DataFrame(np.asarray(x))
-
-        self.in_feature_names = list(x.columns)
-        self._n_in_features = x.shape[1]
-
-        # Remove always null columns
-        is_column_null = x.isnull().all(axis="index")
-        self.non_null_feature_names = list(x.columns[~is_column_null])
-        x = x[self.non_null_feature_names]
-
-        # Remove trailing null rows for features that should
-        # not be imputed in the future
-        trailing_null_rows = self._determine_trailing_null_rows(
-            x[self.no_fill_future_values_features]
-        )
-        x = x.loc[trailing_null_rows]
-
-        # Imputers do not support labels
-        self.imputer_.fit(X=x, y=None)
-        self.is_fitted_ = True
-
-    def transform(self, x) -> pd.DataFrame:
-        """Transform the input data by imputing missing values."""
-        check_is_fitted(self)
-        _ = check_array(x, force_all_finite="allow-nan")
-        if not isinstance(x, pd.DataFrame):
-            x = pd.DataFrame(np.asarray(x))
-
-        x = x[self.non_null_feature_names]
-
-        transformed = self.imputer_.transform(x)
-
-        return transformed
-
-    def fit_transform(self, x, y=None) -> tuple[pd.DataFrame, Optional[pd.Series]]:
-        """Fit the imputer on the input data and transform it.
-
-        Returns:
-            The data with missing values imputed.
-
-        """
-        self.fit(x, y)
-
-        if not isinstance(x, pd.DataFrame):
-            x = pd.DataFrame(np.asarray(x))
-
-        x = x[self.non_null_feature_names]
-
-        # Remove trailing null rows for features that should
-        # not be imputed in the future
-        non_trailing_null_rows = self._determine_trailing_null_rows(
-            x[self.no_fill_future_values_features]
-        )
-        x = x.loc[non_trailing_null_rows]
-
-        x = self.transform(x)
-
-        if y is not None:
-            y = y.loc[non_trailing_null_rows]
-
-        return x, y
-
-    @classmethod
-    def _identity(cls, x):
-        return x
-
-    def __sklearn_is_fitted__(self) -> bool:
-        return self.in_feature_names is not None
```
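A minimal sketch of the deleted transformer's behaviour, with illustrative data: the always-null column is dropped and the remaining gaps are imputed with the column mean:

```python
import numpy as np
import pandas as pd

from openstef.feature_engineering.missing_values_transformer import (
    MissingValuesTransformer,
)

X = pd.DataFrame(
    {
        "radiation": [1.0, np.nan, 3.0],
        "windspeed": [4.0, 5.0, np.nan],
        "always_null": [np.nan, np.nan, np.nan],
    },
    index=pd.date_range("2023-01-01", periods=3, freq="h"),
)

transformer = MissingValuesTransformer(imputation_strategy="mean")
X_clean, _ = transformer.fit_transform(X)

print(transformer.non_null_feature_names)  # ['radiation', 'windspeed']
print(X_clean.isnull().sum().sum())        # 0: gaps filled with column means
```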
```diff
--- a/openstef/feature_engineering/rolling_features.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501>
-#
-# SPDX-License-Identifier: MPL-2.0
-from datetime import timedelta
-
-import pandas as pd
-
-from openstef.data_classes.prediction_job import PredictionJobDataClass
-from pydantic import TypeAdapter
-
-
-def convert_timedelta_to_isoformat(td: timedelta) -> str:
-    """
-    Converts a timedelta to an ISO 8601 formatted period string.
-
-    Args:
-        td: timedelta object to convert.
-
-    Returns:
-        ISO 8601 formatted period string.
-    """
-    timedelta_adapter = TypeAdapter(timedelta)
-    return timedelta_adapter.dump_python(td, mode="json")
-
-
-def add_rolling_aggregate_features(
-    data: pd.DataFrame,
-    pj: PredictionJobDataClass,
-    rolling_window: timedelta = timedelta(hours=24),
-) -> pd.DataFrame:
-    """
-    Adds rolling aggregate features to the input dataframe.
-
-    These features are calculated with an aggregation over a rolling window of the data.
-    A list of requested features is used to determine whether to add the rolling features
-    or not.
-
-    Args:
-        data: Input dataframe to which the rolling features will be added.
-        pj: Prediction job data.
-        rolling_window: Rolling window size for the aggregation.
-
-    Returns:
-        DataFrame with added rolling features.
-    """
-    # Ensure the index is a DatetimeIndex
-    if not isinstance(data.index, pd.DatetimeIndex):
-        raise ValueError("The DataFrame index must be a DatetimeIndex.")
-
-    if "load" not in data.columns:
-        raise ValueError("The DataFrame must contain a 'load' column.")
-    rolling_window_load = data["load"].rolling(window=rolling_window)
-
-    for aggregate_func in pj["rolling_aggregate_features"]:
-        data[
-            f"rolling_{aggregate_func.value}_load_{convert_timedelta_to_isoformat(rolling_window)}"
-        ] = rolling_window_load.aggregate(aggregate_func.value)
-    return data
```
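A sketch of how the deleted helper was driven by a prediction job. The dict and the `AggregateFunction` enum below are illustrative stand-ins; the real job object and enum live in `openstef.data_classes.prediction_job` and `openstef.enums`:

```python
from datetime import timedelta
from enum import Enum

import numpy as np
import pandas as pd

from openstef.feature_engineering.rolling_features import add_rolling_aggregate_features

# Illustrative stand-in for the aggregate-function enum used by prediction jobs.
class AggregateFunction(Enum):
    MEAN = "mean"
    MAX = "max"

# Illustrative stand-in for the prediction job; only the key read by the
# helper is provided here.
pj = {"rolling_aggregate_features": [AggregateFunction.MEAN, AggregateFunction.MAX]}

index = pd.date_range("2023-06-01", periods=96, freq="15min")
data = pd.DataFrame({"load": np.random.rand(96)}, index=index)

enriched = add_rolling_aggregate_features(data, pj, rolling_window=timedelta(hours=24))
print([c for c in enriched.columns if c.startswith("rolling_")])
# e.g. ['rolling_mean_load_P1D', 'rolling_max_load_P1D']
```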