openstef 3.4.29__py3-none-any.whl → 3.4.44__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z +0 -0
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md +18 -0
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md.license +3 -0
- openstef/data/dutch_holidays.csv +1759 -0
- openstef/data/dutch_holidays.csv.license +3 -0
- openstef/data_classes/prediction_job.py +3 -1
- openstef/enums.py +105 -2
- openstef/feature_engineering/apply_features.py +26 -1
- openstef/feature_engineering/bidding_zone_to_country_mapping.py +106 -0
- openstef/feature_engineering/cyclic_features.py +102 -0
- openstef/feature_engineering/holiday_features.py +35 -26
- openstef/feature_engineering/missing_values_transformer.py +57 -15
- openstef/model/model_creator.py +24 -20
- openstef/model/objective.py +7 -7
- openstef/model/objective_creator.py +11 -11
- openstef/model/regressors/flatliner.py +4 -9
- openstef/model/regressors/linear_quantile.py +58 -9
- openstef/model/regressors/xgb.py +23 -0
- openstef/model_selection/model_selection.py +1 -1
- openstef/pipeline/create_component_forecast.py +13 -6
- openstef/pipeline/train_model.py +8 -5
- openstef/tasks/calculate_kpi.py +3 -3
- openstef/tasks/create_basecase_forecast.py +2 -2
- openstef/tasks/create_components_forecast.py +4 -4
- openstef/tasks/create_forecast.py +4 -4
- openstef/tasks/create_solar_forecast.py +4 -4
- openstef/tasks/optimize_hyperparameters.py +2 -2
- openstef/tasks/split_forecast.py +2 -2
- openstef/tasks/train_model.py +2 -2
- openstef/validation/validation.py +1 -1
- {openstef-3.4.29.dist-info → openstef-3.4.44.dist-info}/METADATA +38 -26
- {openstef-3.4.29.dist-info → openstef-3.4.44.dist-info}/RECORD +36 -30
- {openstef-3.4.29.dist-info → openstef-3.4.44.dist-info}/WHEEL +1 -1
- openstef/data/dutch_holidays_2020-2022.csv +0 -831
- /openstef/data/{dutch_holidays_2020-2022.csv.license → dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z.license} +0 -0
- {openstef-3.4.29.dist-info → openstef-3.4.44.dist-info}/LICENSE +0 -0
- {openstef-3.4.29.dist-info → openstef-3.4.44.dist-info}/top_level.txt +0 -0
@@ -9,7 +9,7 @@ from pydantic.v1 import BaseModel
|
|
9
9
|
from openstef.data_classes.data_prep import DataPrepDataClass
|
10
10
|
from openstef.data_classes.model_specifications import ModelSpecificationDataClass
|
11
11
|
from openstef.data_classes.split_function import SplitFuncDataClass
|
12
|
-
from openstef.enums import PipelineType
|
12
|
+
from openstef.enums import PipelineType, BiddingZone
|
13
13
|
|
14
14
|
|
15
15
|
class PredictionJobDataClass(BaseModel):
|
@@ -54,6 +54,8 @@ class PredictionJobDataClass(BaseModel):
|
|
54
54
|
lon: Optional[float] = 5.291266
|
55
55
|
"""Longitude of the forecasted location in degrees. Used for fetching weather data in tasks, calculating derrived features and component splitting."""
|
56
56
|
name: str
|
57
|
+
"""Bidding zone is used to determine the electricity price. It is also used to determine the holidays that should be used. Currently only ENTSO-E bidding zones are supported."""
|
58
|
+
electricity_bidding_zone: Optional[BiddingZone] = BiddingZone.NL
|
57
59
|
"""Name of the forecast, e.g. the location name."""
|
58
60
|
train_components: Optional[bool]
|
59
61
|
"""Whether splitting the forecasts in wind, solar, rest is desired."""
|
openstef/enums.py
CHANGED
@@ -4,8 +4,111 @@
|
|
4
4
|
from enum import Enum
|
5
5
|
|
6
6
|
|
7
|
-
|
8
|
-
|
7
|
+
class BiddingZone(Enum):
    """Electricity bidding zone identifiers.

    Every member's value is the same string as its name, so a zone can be
    looked up either by attribute (``BiddingZone.NL``) or by value
    (``BiddingZone("NL")``).
    """

    DE_50HZ = "DE_50HZ"
    AL = "AL"
    DE_AMPRION = "DE_AMPRION"
    AT = "AT"
    BY = "BY"
    BE = "BE"
    BA = "BA"
    BG = "BG"
    CZ_DE_SK = "CZ_DE_SK"
    HR = "HR"
    CWE = "CWE"
    CY = "CY"
    CZ = "CZ"
    DE_AT_LU = "DE_AT_LU"
    DE_LU = "DE_LU"
    DK = "DK"
    DK_1 = "DK_1"
    DK_1_NO_1 = "DK_1_NO_1"
    DK_2 = "DK_2"
    DK_CA = "DK_CA"
    EE = "EE"
    FI = "FI"
    MK = "MK"
    FR = "FR"
    DE = "DE"
    GR = "GR"
    HU = "HU"
    IS = "IS"
    IE_SEM = "IE_SEM"
    IE = "IE"
    IT = "IT"
    IT_SACO_AC = "IT_SACO_AC"
    IT_CALA = "IT_CALA"
    IT_SACO_DC = "IT_SACO_DC"
    IT_BRNN = "IT_BRNN"
    IT_CNOR = "IT_CNOR"
    IT_CSUD = "IT_CSUD"
    IT_FOGN = "IT_FOGN"
    IT_GR = "IT_GR"
    IT_MACRO_NORTH = "IT_MACRO_NORTH"
    IT_MACRO_SOUTH = "IT_MACRO_SOUTH"
    IT_MALTA = "IT_MALTA"
    IT_NORD = "IT_NORD"
    IT_NORD_AT = "IT_NORD_AT"
    IT_NORD_CH = "IT_NORD_CH"
    IT_NORD_FR = "IT_NORD_FR"
    IT_NORD_SI = "IT_NORD_SI"
    IT_PRGP = "IT_PRGP"
    IT_ROSN = "IT_ROSN"
    IT_SARD = "IT_SARD"
    IT_SICI = "IT_SICI"
    IT_SUD = "IT_SUD"
    RU_KGD = "RU_KGD"
    LV = "LV"
    LT = "LT"
    LU = "LU"
    LU_BZN = "LU_BZN"
    MT = "MT"
    ME = "ME"
    GB = "GB"
    GE = "GE"
    GB_IFA = "GB_IFA"
    GB_IFA2 = "GB_IFA2"
    GB_ELECLINK = "GB_ELECLINK"
    UK = "UK"
    NL = "NL"
    NO_1 = "NO_1"
    NO_1A = "NO_1A"
    NO_2 = "NO_2"
    NO_2_NSL = "NO_2_NSL"
    NO_2A = "NO_2A"
    NO_3 = "NO_3"
    NO_4 = "NO_4"
    NO_5 = "NO_5"
    NO = "NO"
    PL_CZ = "PL_CZ"
    PL = "PL"
    PT = "PT"
    MD = "MD"
    RO = "RO"
    RU = "RU"
    SE_1 = "SE_1"
    SE_2 = "SE_2"
    SE_3 = "SE_3"
    SE_4 = "SE_4"
    RS = "RS"
    SK = "SK"
    SI = "SI"
    GB_NIR = "GB_NIR"
    ES = "ES"
    SE = "SE"
    CH = "CH"
    DE_TENNET = "DE_TENNET"
    DE_TRANSNET = "DE_TRANSNET"
    TR = "TR"
    UA = "UA"
    UA_DOBTPP = "UA_DOBTPP"
    UA_BEI = "UA_BEI"
    UA_IPS = "UA_IPS"
    XK = "XK"
    DE_AMP_LU = "DE_AMP_LU"
|
109
|
+
|
110
|
+
|
111
|
+
class ModelType(Enum):
|
9
112
|
XGB = "xgb"
|
10
113
|
XGB_QUANTILE = "xgb_quantile"
|
11
114
|
XGB_MULTIOUTPUT_QUANTILE = "xgb_multioutput_quantile"
|
@@ -14,16 +14,25 @@ Examples of features that are added:
|
|
14
14
|
import pandas as pd
|
15
15
|
|
16
16
|
from openstef.data_classes.prediction_job import PredictionJobDataClass
|
17
|
+
from openstef.enums import BiddingZone
|
17
18
|
from openstef.feature_engineering.holiday_features import (
|
18
19
|
generate_holiday_feature_functions,
|
19
20
|
)
|
20
21
|
from openstef.feature_engineering.lag_features import generate_lag_feature_functions
|
22
|
+
from openstef.feature_engineering.bidding_zone_to_country_mapping import (
|
23
|
+
BIDDING_ZONE_TO_COUNTRY_CODE_MAPPING,
|
24
|
+
)
|
21
25
|
from openstef.feature_engineering.weather_features import (
|
22
26
|
add_additional_solar_features,
|
23
27
|
add_additional_wind_features,
|
24
28
|
add_humidity_features,
|
25
29
|
)
|
26
30
|
|
31
|
+
from openstef.feature_engineering.cyclic_features import (
|
32
|
+
add_seasonal_cyclic_features,
|
33
|
+
add_time_cyclic_features,
|
34
|
+
)
|
35
|
+
|
27
36
|
|
28
37
|
def apply_features(
|
29
38
|
data: pd.DataFrame,
|
@@ -58,6 +67,7 @@ def apply_features(
|
|
58
67
|
|
59
68
|
import pandas as pd
|
60
69
|
import numpy as np
|
70
|
+
from geopy.geocoders import Nominatim
|
61
71
|
index = pd.date_range(start = "2017-01-01 09:00:00",
|
62
72
|
freq = '15T', periods = 200)
|
63
73
|
data = pd.DataFrame(index = index,
|
@@ -66,6 +76,9 @@ def apply_features(
|
|
66
76
|
np.random.uniform(0.7,1.7, 200)))
|
67
77
|
|
68
78
|
"""
|
79
|
+
if pj is None:
|
80
|
+
pj = {"electricity_bidding_zone": BiddingZone.NL}
|
81
|
+
|
69
82
|
# Get lag feature functions
|
70
83
|
feature_functions = generate_lag_feature_functions(feature_names, horizon)
|
71
84
|
|
@@ -80,8 +93,14 @@ def apply_features(
|
|
80
93
|
}
|
81
94
|
)
|
82
95
|
|
96
|
+
# Get country code from bidding zone if available
|
97
|
+
electricity_bidding_zone = pj.get("electricity_bidding_zone", BiddingZone.NL)
|
98
|
+
country_code = BIDDING_ZONE_TO_COUNTRY_CODE_MAPPING[electricity_bidding_zone.name]
|
99
|
+
|
83
100
|
# Get holiday feature functions
|
84
|
-
feature_functions.update(
|
101
|
+
feature_functions.update(
|
102
|
+
generate_holiday_feature_functions(country_code=country_code)
|
103
|
+
)
|
85
104
|
|
86
105
|
# Add the features to the dataframe using previously defined feature functions
|
87
106
|
for key, featfunc in feature_functions.items():
|
@@ -99,5 +118,11 @@ def apply_features(
|
|
99
118
|
# Add solar features; when pj is unavailable a default location is used.
|
100
119
|
data = add_additional_solar_features(data, pj, feature_names)
|
101
120
|
|
121
|
+
# Adds cyclical features to capture seasonal and periodic patterns in time-based data.
|
122
|
+
data = add_seasonal_cyclic_features(data)
|
123
|
+
|
124
|
+
# Adds polar time features (sine and cosine) to capture periodic patterns based on the timestamp index.
|
125
|
+
data = add_time_cyclic_features(data)
|
126
|
+
|
102
127
|
# Return dataframe including all requested features
|
103
128
|
return data
|
@@ -0,0 +1,106 @@
|
|
1
|
+
# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501>
|
2
|
+
#
|
3
|
+
# SPDX-License-Identifier: MPL-2.0
|
4
|
+
# Lookup table from a bidding zone name (string key) to a two-letter country
# code (string value). Zones that are sub-regions or cross-border couplings
# are mapped onto a single country, e.g. every "IT_*" zone maps to "IT" and
# every "DE_*" zone maps to "DE".
# NOTE(review): presumably the values are fed to a holiday-calendar lookup;
# "CWE" maps to itself and does not look like a country code — confirm the
# consumer can handle it.
BIDDING_ZONE_TO_COUNTRY_CODE_MAPPING = {
    "DE_50HZ": "DE",
    "AL": "AL",
    "DE_AMPRION": "DE",
    "AT": "AT",
    "BY": "BY",
    "BE": "BE",
    "BA": "BA",
    "BG": "BG",
    "CZ_DE_SK": "CZ",
    "HR": "HR",
    "CWE": "CWE",
    "CY": "CY",
    "CZ": "CZ",
    "DE_AT_LU": "DE",
    "DE_LU": "DE",
    "DK": "DK",
    "DK_1": "DK",
    "DK_1_NO_1": "DK",
    "DK_2": "DK",
    "DK_CA": "DK",
    "EE": "EE",
    "FI": "FI",
    "MK": "MK",
    "FR": "FR",
    "DE": "DE",
    "GR": "GR",
    "HU": "HU",
    "IS": "IS",
    "IE_SEM": "IE",
    "IE": "IE",
    "IT": "IT",
    "IT_SACO_AC": "IT",
    "IT_CALA": "IT",
    "IT_SACO_DC": "IT",
    "IT_BRNN": "IT",
    "IT_CNOR": "IT",
    "IT_CSUD": "IT",
    "IT_FOGN": "IT",
    "IT_GR": "IT",
    "IT_MACRO_NORTH": "IT",
    "IT_MACRO_SOUTH": "IT",
    "IT_MALTA": "IT",
    "IT_NORD": "IT",
    "IT_NORD_AT": "IT",
    "IT_NORD_CH": "IT",
    "IT_NORD_FR": "IT",
    "IT_NORD_SI": "IT",
    "IT_PRGP": "IT",
    "IT_ROSN": "IT",
    "IT_SARD": "IT",
    "IT_SICI": "IT",
    "IT_SUD": "IT",
    "RU_KGD": "RU",
    "LV": "LV",
    "LT": "LT",
    "LU": "LU",
    "LU_BZN": "LU",
    "MT": "MT",
    "ME": "ME",
    "GB": "GB",
    "GE": "GE",
    "GB_IFA": "GB",
    "GB_IFA2": "GB",
    "GB_ELECLINK": "GB",
    "UK": "UK",
    "NL": "NL",
    "NO_1": "NO",
    "NO_1A": "NO",
    "NO_2": "NO",
    "NO_2_NSL": "NO",
    "NO_2A": "NO",
    "NO_3": "NO",
    "NO_4": "NO",
    "NO_5": "NO",
    "NO": "NO",
    "PL_CZ": "PL",
    "PL": "PL",
    "PT": "PT",
    "MD": "MD",
    "RO": "RO",
    "RU": "RU",
    "SE_1": "SE",
    "SE_2": "SE",
    "SE_3": "SE",
    "SE_4": "SE",
    "RS": "RS",
    "SK": "SK",
    "SI": "SI",
    "GB_NIR": "GB",
    "ES": "ES",
    "SE": "SE",
    "CH": "CH",
    "DE_TENNET": "DE",
    "DE_TRANSNET": "DE",
    "TR": "TR",
    "UA": "UA",
    "UA_DOBTPP": "UA",
    "UA_BEI": "UA",
    "UA_IPS": "UA",
    "XK": "XK",
    "DE_AMP_LU": "DE",
}
|
@@ -0,0 +1,102 @@
|
|
1
|
+
# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501>
|
2
|
+
#
|
3
|
+
# SPDX-License-Identifier: MPL-2.0
|
4
|
+
|
5
|
+
# Module for adding temporal cyclic features to time-based data for capturing seasonality and periodic patterns.
|
6
|
+
# Features include yearly, weekly, and monthly seasonality, as well as time-of-day periodicity.
|
7
|
+
|
8
|
+
|
9
|
+
import numpy as np
|
10
|
+
import pandas as pd
|
11
|
+
|
12
|
+
import structlog
|
13
|
+
import logging
|
14
|
+
|
15
|
+
from openstef.settings import Settings
|
16
|
+
|
17
|
+
# Configure structlog at import time with a filtering bound logger whose
# threshold is derived from Settings.log_level.
# NOTE(review): assumes Settings.log_level is a level *name* (e.g. "INFO"), so
# that logging.getLevelName returns the matching numeric level — confirm.
structlog.configure(
    wrapper_class=structlog.make_filtering_bound_logger(
        logging.getLevelName(Settings.log_level)
    )
)
# Module-level logger named after this module.
logger = structlog.get_logger(__name__)
|
23
|
+
|
24
|
+
|
25
|
+
# Length of one full day-cycle, used to scale seconds-since-midnight to [0, 2*pi).
NUM_SECONDS_IN_A_DAY = 24 * 60 * 60


def add_time_cyclic_features(
    data: pd.DataFrame,
) -> pd.DataFrame:
    """Append sine/cosine encodings of the time of day to a copy of ``data``.

    The position within the day is taken from the index (hour, minute and
    second), expressed as an angle over a 24-hour cycle, and stored in the
    columns ``time0fday_sine`` and ``time0fday_cosine``.

    Args:
        data: Dataframe indexed by a ``pd.DatetimeIndex``.

    Returns:
        A copy of ``data`` with the two extra time-of-day columns; the input
        dataframe itself is left untouched.

    Raises:
        ValueError: If the index of ``data`` is not a ``pd.DatetimeIndex``.
    """
    if not isinstance(data.index, pd.DatetimeIndex):
        raise ValueError("Index should be a pandas DatetimeIndex")

    timestamps = data.index
    seconds_since_midnight = (
        timestamps.hour * 60 * 60 + timestamps.minute * 60 + timestamps.second
    )
    angle = 2 * np.pi * seconds_since_midnight / NUM_SECONDS_IN_A_DAY

    # Work on a copy so the caller's dataframe is never mutated.
    enriched = data.copy()
    enriched["time0fday_sine"] = np.sin(angle)
    enriched["time0fday_cosine"] = np.cos(angle)
    return enriched
|
55
|
+
|
56
|
+
|
57
|
+
def add_seasonal_cyclic_features(
    data: pd.DataFrame, compute_features: list = None
) -> pd.DataFrame:
    """Append sine/cosine encodings of calendar cycles to a copy of ``data``.

    Three cycles are available, each producing a ``*_sine`` and ``*_cosine``
    column derived from the datetime index:

    - ``"season"``: day of the year over a 365.25-day cycle
      (``season_sine`` / ``season_cosine``).
    - ``"dayofweek"``: day of the week over a 7-day cycle
      (``day0fweek_sine`` / ``day0fweek_cosine``).
    - ``"month"``: month number over a 12-month cycle
      (``month_sine`` / ``month_cosine``).

    Args:
        data: DataFrame with a ``pd.DatetimeIndex``.
        compute_features: Optional list with any of ``"season"``,
            ``"dayofweek"`` and ``"month"``. ``None`` or an empty list selects
            all three; names outside this set are ignored.

    Returns:
        A copy of ``data`` with the requested cyclic columns added.

    Raises:
        ValueError: If the index of ``data`` is not a ``pd.DatetimeIndex``.

    Example:
        >>> data = pd.DataFrame(index=pd.date_range(start='2023-01-01', periods=365, freq='D'))
        >>> data_with_features = add_seasonal_cyclic_features(data)
        >>> print(data_with_features.head())
    """
    if not isinstance(data.index, pd.DatetimeIndex):
        raise ValueError("The DataFrame index must be a DatetimeIndex.")

    # Falsy selections (None, empty list) fall back to the complete feature set.
    requested = (
        compute_features if compute_features else ["season", "dayofweek", "month"]
    )

    timestamps = data.index
    days_in_year = 365.25  # Average year length, accounts for leap years

    # (feature key, sine column, cosine column, position in cycle, cycle length)
    encodings = (
        ("season", "season_sine", "season_cosine", timestamps.dayofyear, days_in_year),
        ("dayofweek", "day0fweek_sine", "day0fweek_cosine", timestamps.day_of_week, 7),
        ("month", "month_sine", "month_cosine", timestamps.month, 12),
    )

    # Work on a copy so the caller's dataframe is never mutated.
    enriched = data.copy()
    for feature, sine_column, cosine_column, position, period in encodings:
        if feature not in requested:
            continue
        enriched[sine_column] = np.sin(2 * np.pi * position / period)
        enriched[cosine_column] = np.cos(2 * np.pi * position / period)

    return enriched
|
@@ -10,13 +10,11 @@ import pandas as pd
|
|
10
10
|
|
11
11
|
from openstef import PROJECT_ROOT
|
12
12
|
|
13
|
-
HOLIDAY_CSV_PATH: str =
|
14
|
-
PROJECT_ROOT / "openstef" / "data" / "dutch_holidays_2020-2022.csv"
|
15
|
-
)
|
13
|
+
HOLIDAY_CSV_PATH: str = PROJECT_ROOT / "openstef" / "data" / "dutch_holidays.csv"
|
16
14
|
|
17
15
|
|
18
16
|
def generate_holiday_feature_functions(
|
19
|
-
|
17
|
+
country_code: str = "NL",
|
20
18
|
years: list = None,
|
21
19
|
path_to_school_holidays_csv: str = HOLIDAY_CSV_PATH,
|
22
20
|
) -> dict:
|
@@ -46,12 +44,14 @@ def generate_holiday_feature_functions(
|
|
46
44
|
- Pinksteren
|
47
45
|
- Kerst
|
48
46
|
|
47
|
+
|
49
48
|
The 'Brugdagen' are updated until Dec 2020. (Generated using agenda)
|
50
49
|
|
51
50
|
Args:
|
52
51
|
country_code: Country code for which to create holiday features.
|
53
52
|
years: years for which to create holiday features.
|
54
53
|
path_to_school_holidays_csv: Filepath to csv with school holidays.
|
54
|
+
NOTE: Dutch holidays csv file is only until January 2026.
|
55
55
|
|
56
56
|
Returns:
|
57
57
|
Dictionary with functions that check if a given date is a holiday, keys
|
@@ -69,7 +69,7 @@ def generate_holiday_feature_functions(
|
|
69
69
|
now.year + 1,
|
70
70
|
]
|
71
71
|
|
72
|
-
country_holidays = holidays.country_holidays(
|
72
|
+
country_holidays = holidays.country_holidays(country_code, years=years)
|
73
73
|
|
74
74
|
# Make holiday function dict
|
75
75
|
holiday_functions = {}
|
@@ -96,7 +96,7 @@ def generate_holiday_feature_functions(
|
|
96
96
|
|
97
97
|
# Check for bridge day
|
98
98
|
holiday_functions, bridge_days = check_for_bridge_day(
|
99
|
-
date, holiday_name,
|
99
|
+
date, holiday_name, country_code, years, holiday_functions, bridge_days
|
100
100
|
)
|
101
101
|
|
102
102
|
# Add feature function that includes all bridgedays
|
@@ -104,33 +104,42 @@ def generate_holiday_feature_functions(
|
|
104
104
|
{"is_bridgeday": lambda x: np.isin(x.index.date, np.array(list(bridge_days)))}
|
105
105
|
)
|
106
106
|
|
107
|
-
#
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
)
|
115
|
-
|
116
|
-
# Loop over list of holidays names
|
117
|
-
for holiday_name in list(set(df_holidays.name)):
|
118
|
-
# Define function explicitely to mitigate 'late binding' problem
|
119
|
-
def make_holiday_func(holidayname=holiday_name):
|
120
|
-
return lambda x: np.isin(
|
121
|
-
x.index.date, df_holidays.datum[df_holidays.name == holidayname].values
|
122
|
-
)
|
107
|
+
# Add school holidays if country is NL
|
108
|
+
if country_code == "NL":
|
109
|
+
# Manually generated csv including all Dutch school holidays for different regions
|
110
|
+
df_holidays = pd.read_csv(path_to_school_holidays_csv, index_col=None)
|
111
|
+
df_holidays["datum"] = pd.to_datetime(df_holidays.datum).apply(
|
112
|
+
lambda x: x.date()
|
113
|
+
)
|
123
114
|
|
124
|
-
#
|
115
|
+
# Add check function that includes all holidays of the provided csv
|
125
116
|
holiday_functions.update(
|
126
117
|
{
|
127
|
-
"
|
128
|
-
|
129
|
-
holidayname=holiday_name
|
118
|
+
"is_schoolholiday": lambda x: np.isin(
|
119
|
+
x.index.date, df_holidays.datum.values
|
130
120
|
)
|
131
121
|
}
|
132
122
|
)
|
133
123
|
|
124
|
+
# Loop over list of holidays names
|
125
|
+
for holiday_name in list(set(df_holidays.name)):
|
126
|
+
# Define function explicitly to mitigate 'late binding' problem
|
127
|
+
def make_holiday_func(holidayname=holiday_name):
|
128
|
+
return lambda x: np.isin(
|
129
|
+
x.index.date,
|
130
|
+
df_holidays.datum[df_holidays.name == holidayname].values,
|
131
|
+
)
|
132
|
+
|
133
|
+
# Create lag function for each holiday
|
134
|
+
holiday_functions.update(
|
135
|
+
{
|
136
|
+
"is_"
|
137
|
+
+ holiday_name.replace(" ", "_").lower(): make_holiday_func(
|
138
|
+
holidayname=holiday_name
|
139
|
+
)
|
140
|
+
}
|
141
|
+
)
|
142
|
+
|
134
143
|
return holiday_functions
|
135
144
|
|
136
145
|
|
@@ -7,7 +7,7 @@ import numpy as np
|
|
7
7
|
import pandas as pd
|
8
8
|
from sklearn.impute import SimpleImputer
|
9
9
|
from sklearn.preprocessing import FunctionTransformer
|
10
|
-
from sklearn.utils.validation import check_array
|
10
|
+
from sklearn.utils.validation import check_array, check_is_fitted
|
11
11
|
|
12
12
|
|
13
13
|
class MissingValuesTransformer:
|
@@ -27,6 +27,7 @@ class MissingValuesTransformer:
|
|
27
27
|
missing_values: Union[int, float, str, None] = np.nan,
|
28
28
|
imputation_strategy: str = None,
|
29
29
|
fill_value: Union[str, int, float] = None,
|
30
|
+
no_fill_future_values_features: List[str] = None,
|
30
31
|
):
|
31
32
|
"""Initialize missing values handler.
|
32
33
|
|
@@ -37,11 +38,34 @@ class MissingValuesTransformer:
|
|
37
38
|
Can be one of "mean", "median", "most_frequent", "constant" or None.
|
38
39
|
fill_value: When strategy == "constant", fill_value is used to replace all
|
39
40
|
occurrences of missing_values.
|
41
|
+
no_fill_future_values_features: The features for which it does not make sense
|
42
|
+
to fill future values. Rows that contain trailing null values for these
|
43
|
+
features will be removed from the data.
|
40
44
|
|
41
45
|
"""
|
42
46
|
self.missing_values = missing_values
|
43
47
|
self.imputation_strategy = imputation_strategy
|
44
48
|
self.fill_value = fill_value
|
49
|
+
self.no_fill_future_values_features = no_fill_future_values_features or []
|
50
|
+
self.is_fitted_ = False
|
51
|
+
|
52
|
+
# Build the proper imputation transformer
|
53
|
+
# - Identity function if strategy is None
|
54
|
+
# - SimpleImputer with the dedicated strategy
|
55
|
+
if self.imputation_strategy is None:
|
56
|
+
self.imputer_ = FunctionTransformer(func=self._identity)
|
57
|
+
else:
|
58
|
+
self.imputer_ = SimpleImputer(
|
59
|
+
missing_values=self.missing_values,
|
60
|
+
strategy=self.imputation_strategy,
|
61
|
+
fill_value=self.fill_value,
|
62
|
+
).set_output(transform="pandas")
|
63
|
+
self.imputer_._validate_params()
|
64
|
+
|
65
|
+
@staticmethod
|
66
|
+
def _determine_trailing_null_rows(x: pd.DataFrame) -> pd.Series:
|
67
|
+
"""Determine rows with trailing null values in a DataFrame."""
|
68
|
+
return ~x.bfill().isnull().any(axis="columns")
|
45
69
|
|
46
70
|
def fit(self, x, y=None):
|
47
71
|
"""Fit the imputer on the input data."""
|
@@ -55,33 +79,33 @@ class MissingValuesTransformer:
|
|
55
79
|
# Remove always null columns
|
56
80
|
is_column_null = x.isnull().all(axis="index")
|
57
81
|
self.non_null_feature_names = list(x.columns[~is_column_null])
|
82
|
+
x = x[self.non_null_feature_names]
|
58
83
|
|
59
|
-
#
|
60
|
-
#
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
self.imputer_ = SimpleImputer(
|
66
|
-
missing_values=self.missing_values,
|
67
|
-
strategy=self.imputation_strategy,
|
68
|
-
fill_value=self.fill_value,
|
69
|
-
).set_output(transform="pandas")
|
84
|
+
# Remove trailing null rows for features that should
|
85
|
+
# not be imputed in the future
|
86
|
+
trailing_null_rows = self._determine_trailing_null_rows(
|
87
|
+
x[self.no_fill_future_values_features]
|
88
|
+
)
|
89
|
+
x = x.loc[trailing_null_rows]
|
70
90
|
|
71
91
|
# Imputers do not support labels
|
72
92
|
self.imputer_.fit(X=x, y=None)
|
93
|
+
self.is_fitted_ = True
|
73
94
|
|
74
95
|
def transform(self, x) -> pd.DataFrame:
    """Transform the input data by imputing missing values.

    Args:
        x: Feature data. Anything that is not a ``pd.DataFrame`` is converted
            with ``pd.DataFrame(np.asarray(x))`` before the columns kept
            during ``fit`` are selected.

    Returns:
        The output of the underlying imputer applied to the columns that were
        not entirely null during ``fit``.

    Raises:
        sklearn.exceptions.NotFittedError: If ``fit`` has not been called yet.
    """
    # ``imputer_`` and ``is_fitted_`` are already assigned in ``__init__``, so
    # a bare ``check_is_fitted(self)`` always succeeds. Checking the attribute
    # that only ``fit`` sets makes an unfitted transformer fail with a clear
    # NotFittedError instead of an AttributeError further down.
    check_is_fitted(self, "non_null_feature_names")

    # Validation only: NaNs are allowed here because they are imputed below.
    _ = check_array(x, force_all_finite="allow-nan")
    if not isinstance(x, pd.DataFrame):
        x = pd.DataFrame(np.asarray(x))

    # Drop the columns that were entirely null when the transformer was fitted.
    x = x[self.non_null_feature_names]

    transformed = self.imputer_.transform(x)

    return transformed
|
107
|
+
|
108
|
+
def fit_transform(self, x, y=None) -> tuple[pd.DataFrame, Optional[pd.Series]]:
|
85
109
|
"""Fit the imputer on the input data and transform it.
|
86
110
|
|
87
111
|
Returns:
|
@@ -89,7 +113,25 @@ class MissingValuesTransformer:
|
|
89
113
|
|
90
114
|
"""
|
91
115
|
self.fit(x, y)
|
92
|
-
|
116
|
+
|
117
|
+
if not isinstance(x, pd.DataFrame):
|
118
|
+
x = pd.DataFrame(np.asarray(x))
|
119
|
+
|
120
|
+
x = x[self.non_null_feature_names]
|
121
|
+
|
122
|
+
# Remove trailing null rows for features that should
|
123
|
+
# not be imputed in the future
|
124
|
+
non_trailing_null_rows = self._determine_trailing_null_rows(
|
125
|
+
x[self.no_fill_future_values_features]
|
126
|
+
)
|
127
|
+
x = x.loc[non_trailing_null_rows]
|
128
|
+
|
129
|
+
x = self.transform(x)
|
130
|
+
|
131
|
+
if y is not None:
|
132
|
+
y = y.loc[non_trailing_null_rows]
|
133
|
+
|
134
|
+
return x, y
|
93
135
|
|
94
136
|
@classmethod
|
95
137
|
def _identity(cls, x):
|