openstef 3.4.56__py3-none-any.whl → 4.0.0a3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. openstef-4.0.0a3.dist-info/METADATA +177 -0
  2. openstef-4.0.0a3.dist-info/RECORD +4 -0
  3. {openstef-3.4.56.dist-info → openstef-4.0.0a3.dist-info}/WHEEL +1 -2
  4. openstef/__init__.py +0 -14
  5. openstef/__main__.py +0 -3
  6. openstef/app_settings.py +0 -19
  7. openstef/data/NL_terrestrial_radiation.csv +0 -25585
  8. openstef/data/NL_terrestrial_radiation.csv.license +0 -3
  9. openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z +0 -0
  10. openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z.license +0 -3
  11. openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md +0 -18
  12. openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md.license +0 -3
  13. openstef/data/dutch_holidays.csv +0 -1759
  14. openstef/data/dutch_holidays.csv.license +0 -3
  15. openstef/data/pv_single_coefs.csv +0 -601
  16. openstef/data/pv_single_coefs.csv.license +0 -3
  17. openstef/data_classes/__init__.py +0 -3
  18. openstef/data_classes/data_prep.py +0 -99
  19. openstef/data_classes/model_specifications.py +0 -30
  20. openstef/data_classes/prediction_job.py +0 -135
  21. openstef/data_classes/split_function.py +0 -97
  22. openstef/enums.py +0 -140
  23. openstef/exceptions.py +0 -74
  24. openstef/feature_engineering/__init__.py +0 -3
  25. openstef/feature_engineering/apply_features.py +0 -138
  26. openstef/feature_engineering/bidding_zone_to_country_mapping.py +0 -106
  27. openstef/feature_engineering/cyclic_features.py +0 -161
  28. openstef/feature_engineering/data_preparation.py +0 -152
  29. openstef/feature_engineering/feature_adder.py +0 -206
  30. openstef/feature_engineering/feature_applicator.py +0 -202
  31. openstef/feature_engineering/general.py +0 -141
  32. openstef/feature_engineering/holiday_features.py +0 -231
  33. openstef/feature_engineering/lag_features.py +0 -165
  34. openstef/feature_engineering/missing_values_transformer.py +0 -141
  35. openstef/feature_engineering/rolling_features.py +0 -58
  36. openstef/feature_engineering/weather_features.py +0 -492
  37. openstef/metrics/__init__.py +0 -3
  38. openstef/metrics/figure.py +0 -303
  39. openstef/metrics/metrics.py +0 -486
  40. openstef/metrics/reporter.py +0 -222
  41. openstef/model/__init__.py +0 -3
  42. openstef/model/basecase.py +0 -82
  43. openstef/model/confidence_interval_applicator.py +0 -242
  44. openstef/model/fallback.py +0 -77
  45. openstef/model/metamodels/__init__.py +0 -3
  46. openstef/model/metamodels/feature_clipper.py +0 -90
  47. openstef/model/metamodels/grouped_regressor.py +0 -222
  48. openstef/model/metamodels/missing_values_handler.py +0 -138
  49. openstef/model/model_creator.py +0 -214
  50. openstef/model/objective.py +0 -426
  51. openstef/model/objective_creator.py +0 -65
  52. openstef/model/regressors/__init__.py +0 -3
  53. openstef/model/regressors/arima.py +0 -197
  54. openstef/model/regressors/custom_regressor.py +0 -64
  55. openstef/model/regressors/dazls.py +0 -116
  56. openstef/model/regressors/flatliner.py +0 -95
  57. openstef/model/regressors/gblinear_quantile.py +0 -334
  58. openstef/model/regressors/lgbm.py +0 -29
  59. openstef/model/regressors/linear.py +0 -90
  60. openstef/model/regressors/linear_quantile.py +0 -305
  61. openstef/model/regressors/regressor.py +0 -114
  62. openstef/model/regressors/xgb.py +0 -52
  63. openstef/model/regressors/xgb_multioutput_quantile.py +0 -261
  64. openstef/model/regressors/xgb_quantile.py +0 -228
  65. openstef/model/serializer.py +0 -431
  66. openstef/model/standard_deviation_generator.py +0 -81
  67. openstef/model_selection/__init__.py +0 -3
  68. openstef/model_selection/model_selection.py +0 -311
  69. openstef/monitoring/__init__.py +0 -3
  70. openstef/monitoring/performance_meter.py +0 -92
  71. openstef/monitoring/teams.py +0 -203
  72. openstef/pipeline/__init__.py +0 -3
  73. openstef/pipeline/create_basecase_forecast.py +0 -133
  74. openstef/pipeline/create_component_forecast.py +0 -168
  75. openstef/pipeline/create_forecast.py +0 -171
  76. openstef/pipeline/optimize_hyperparameters.py +0 -317
  77. openstef/pipeline/train_create_forecast_backtest.py +0 -163
  78. openstef/pipeline/train_model.py +0 -561
  79. openstef/pipeline/utils.py +0 -52
  80. openstef/postprocessing/__init__.py +0 -3
  81. openstef/postprocessing/postprocessing.py +0 -275
  82. openstef/preprocessing/__init__.py +0 -3
  83. openstef/preprocessing/preprocessing.py +0 -42
  84. openstef/settings.py +0 -15
  85. openstef/tasks/__init__.py +0 -3
  86. openstef/tasks/calculate_kpi.py +0 -324
  87. openstef/tasks/create_basecase_forecast.py +0 -118
  88. openstef/tasks/create_components_forecast.py +0 -162
  89. openstef/tasks/create_forecast.py +0 -145
  90. openstef/tasks/create_solar_forecast.py +0 -420
  91. openstef/tasks/create_wind_forecast.py +0 -80
  92. openstef/tasks/optimize_hyperparameters.py +0 -135
  93. openstef/tasks/split_forecast.py +0 -273
  94. openstef/tasks/train_model.py +0 -224
  95. openstef/tasks/utils/__init__.py +0 -3
  96. openstef/tasks/utils/dependencies.py +0 -107
  97. openstef/tasks/utils/predictionjobloop.py +0 -243
  98. openstef/tasks/utils/taskcontext.py +0 -160
  99. openstef/validation/__init__.py +0 -3
  100. openstef/validation/validation.py +0 -322
  101. openstef-3.4.56.dist-info/METADATA +0 -154
  102. openstef-3.4.56.dist-info/RECORD +0 -102
  103. openstef-3.4.56.dist-info/top_level.txt +0 -1
  104. /openstef-3.4.56.dist-info/LICENSE → /openstef-4.0.0a3.dist-info/licenses/LICENSE.md +0 -0
openstef/feature_engineering/holiday_features.py
@@ -1,231 +0,0 @@
- # SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501>
- #
- # SPDX-License-Identifier: MPL-2.0
- """This module contains all holiday related features."""
- from datetime import datetime, timedelta
-
- import holidays
- import numpy as np
- import pandas as pd
-
- from openstef import PROJECT_ROOT
-
- HOLIDAY_CSV_PATH: str = PROJECT_ROOT / "openstef" / "data" / "dutch_holidays.csv"
-
-
- def generate_holiday_feature_functions(
-     country_code: str = "NL",
-     years: list[int] | None = None,
-     path_to_school_holidays_csv: str = HOLIDAY_CSV_PATH,
- ) -> dict:
-     """Generates functions for creating holiday features.
-
-     This improves forecast accuracy. Examples of features that are added:
-     2020-01-01 is 'Nieuwjaarsdag',
-     2022-12-24 - 2023-01-08 is the 'Kerstvakantie',
-     2022-10-15 - 2022-10-23 is the 'HerfstvakantieNoord'.
-
-     The holidays are based on a manually generated csv file.
-     The information is collected using
-     https://www.schoolvakanties-nederland.nl/ and the Python holidays package.
-     The following official Dutch holidays are included until 2023:
-     - Kerstvakantie
-     - Meivakantie
-     - Herfstvakantie
-     - Bouwvak
-     - Zomervakantie
-     - Voorjaarsvakantie
-     - Nieuwjaarsdag
-     - Pasen
-     - Koningsdag
-     - Hemelvaart
-     - Pinksteren
-     - Kerst
-
-     The 'Brugdagen' are updated until Dec 2020. (Generated using agenda)
-
-     Args:
-         country_code: Country for which to create holiday features.
-         years: Years for which to create holiday features. If None,
-             the last 4 years, the current and next year are used.
-         path_to_school_holidays_csv: Filepath to csv with school holidays.
-
-     NOTE: The Dutch school holidays csv file only covers dates up to January 2026.
-
-     Returns:
-         Dictionary with functions that check if a given date is a holiday; keys
-         consist of "is_" + the_name_of_the_holiday_to_be_checked.
-
-     """
-     if years is None:
-         now = datetime.now()
-         years = [
-             now.year - 4,
-             now.year - 3,
-             now.year - 2,
-             now.year - 1,
-             now.year,
-             now.year + 1,
-         ]
-
-     country_holidays = holidays.country_holidays(country_code, years=years)
-
-     # Make holiday function dict
-     holiday_functions = {}
-     # Add check function that includes all national holidays
-     holiday_functions.update(
-         {
-             "is_national_holiday": lambda x: np.isin(
-                 x.index.date, np.array(list(country_holidays))
-             )
-         }
-     )
-     # Define empty list to keep track of bridgedays
-     bridge_days = []
-     # Loop over list of holiday names
-     for date, holiday_name in sorted(country_holidays.items()):
-         # Define function explicitly to mitigate the 'late binding' problem
-         def make_holiday_func(requested_date):
-             return lambda x: np.isin(x.index.date, np.array([requested_date]))
-
-         # Create a check function for each holiday
-         holiday_functions.update(
-             {"is_" + holiday_name.replace(" ", "_").lower(): make_holiday_func(date)}
-         )
-
-         # Check for bridge day
-         holiday_functions, bridge_days = check_for_bridge_day(
-             date, holiday_name, country_code, years, holiday_functions, bridge_days
-         )
-
-     # Add feature function that includes all bridgedays
-     holiday_functions.update(
-         {"is_bridgeday": lambda x: np.isin(x.index.date, np.array(list(bridge_days)))}
-     )
-
-     # Add school holidays if country is NL
-     if country_code == "NL":
-         # Manually generated csv including all Dutch school holidays for different regions
-         df_holidays = pd.read_csv(path_to_school_holidays_csv, index_col=None)
-         df_holidays["datum"] = pd.to_datetime(df_holidays.datum).apply(
-             lambda x: x.date()
-         )
-
-         # Add check function that includes all school holidays of the provided csv
-         holiday_functions.update(
-             {
-                 "is_schoolholiday": lambda x: np.isin(
-                     x.index.date, df_holidays.datum.values
-                 )
-             }
-         )
-
-         # Loop over list of holiday names
-         for holiday_name in list(set(df_holidays.name)):
-             # Define function explicitly to mitigate the 'late binding' problem
-             def make_holiday_func(holidayname=holiday_name):
-                 return lambda x: np.isin(
-                     x.index.date,
-                     df_holidays.datum[df_holidays.name == holidayname].values,
-                 )
-
-             # Create a check function for each holiday
-             holiday_functions.update(
-                 {
-                     "is_"
-                     + holiday_name.replace(" ", "_").lower(): make_holiday_func(
-                         holidayname=holiday_name
-                     )
-                 }
-             )
-
-     return holiday_functions
-
-
- # Check for bridgedays
- def check_for_bridge_day(
-     date: datetime,
-     holiday_name: str,
-     country: str,
-     years: list,
-     holiday_functions: dict,
-     bridge_days: list,
- ) -> tuple[dict, list]:
-     """Checks for bridgedays associated with the holiday on the given date.
-
-     Any found bridgedays are appended to the bridgedays list. A specific feature
-     function for each bridgeday is also added to the general holiday_functions
-     dictionary.
-
-     Args:
-         date: Date of holiday to check for associated bridgedays.
-         holiday_name: Name of the holiday.
-         country: Country for which to detect the bridgedays.
-         years: List of years for which to detect bridgedays.
-         holiday_functions: Dictionary to which the feature function is added when a bridgeday is found.
-         bridge_days: List of bridgedays to which any found bridgedays are appended.
-
-     Returns:
-         - Dict with holiday feature functions
-         - List of bridgedays
-
-     """
-     country_holidays = holidays.country_holidays(country, years=years)
-
-     # If the date is a holiday, it is not a bridgeday
-     if date in country_holidays:
-         return holiday_functions, bridge_days
-
-     # Define function explicitly to mitigate the 'late binding' problem
-     def make_holiday_func(requested_date):
-         return lambda x: np.isin(x.index.date, np.array([requested_date]))
-
-     # Looking forward: if the day after tomorrow is a national holiday or
-     # a Saturday, check that tomorrow is neither a national holiday nor a weekend day
-     is_saturday_in_two_days = (date + timedelta(days=2)).weekday() == 5
-     is_holiday_in_two_days = (date + timedelta(days=2)) in country_holidays
-
-     is_holiday_tomorrow = (date + timedelta(days=1)) in country_holidays
-     is_weekend_tomorrow = (date + timedelta(days=1)).weekday() in [5, 6]
-
-     if (
-         (is_holiday_in_two_days or is_saturday_in_two_days)
-         and (not is_holiday_tomorrow and not is_weekend_tomorrow)
-         and date not in country_holidays
-     ):
-         # Create feature function for the bridgeday
-         holiday_functions.update(
-             {
-                 "is_bridgeday"
-                 + holiday_name.replace(" ", "_").lower(): make_holiday_func(
-                     (date + timedelta(days=1))
-                 )
-             }
-         )
-         bridge_days.append((date + timedelta(days=1)))
-
-     # Looking backward: if the day before yesterday is a national holiday
-     # or a Sunday, check that yesterday is neither a national holiday nor a weekend day
-     is_sunday_two_days_ago = (date - timedelta(days=2)).weekday() == 6
-     is_holiday_two_days_ago = (date - timedelta(days=2)) in country_holidays
-     is_holiday_yesterday = (date - timedelta(days=1)) in country_holidays
-     is_weekend_yesterday = (date - timedelta(days=1)).weekday() in [5, 6]
-
-     if (is_sunday_two_days_ago or is_holiday_two_days_ago) and (
-         not is_holiday_yesterday and not is_weekend_yesterday
-     ):
-         # Create feature function for the bridgeday
-         holiday_functions.update(
-             {
-                 "is_bridgeday"
-                 + holiday_name.replace(" ", "_").lower(): make_holiday_func(
-                     (date - timedelta(days=1))
-                 )
-             }
-         )
-         bridge_days.append((date - timedelta(days=1)))
-
-     return holiday_functions, bridge_days
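For reference, the removed holiday module was typically consumed by applying each returned check function to a datetime-indexed DataFrame. A minimal sketch of that pattern, run against 3.4.56 where the module still exists (the frame and its "load" column are illustrative, not from the package):

    import pandas as pd

    from openstef.feature_engineering.holiday_features import (
        generate_holiday_feature_functions,
    )

    # Illustrative datetime-indexed input frame
    index = pd.date_range("2023-12-20", "2024-01-05", freq="h")
    df = pd.DataFrame({"load": 0.0}, index=index)

    # Each dict value maps the index dates to a boolean mask
    holiday_functions = generate_holiday_feature_functions(country_code="NL")
    for name, func in holiday_functions.items():
        df[name] = func(df)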
openstef/feature_engineering/lag_features.py
@@ -1,165 +0,0 @@
- # SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501>
- #
- # SPDX-License-Identifier: MPL-2.0
- """This module contains all lag features."""
- import re
-
- import numpy as np
- import pandas as pd
- import scipy.signal
-
-
- def generate_lag_feature_functions(
-     feature_names: list[str] = None, horizon: float = 24.0
- ) -> dict:
-     """Creates functions to generate lag features in a dataset.
-
-     Args:
-         feature_names: Lag feature names that were used during training
-             of the model. If None, a new set is generated automatically.
-         horizon: Forecast horizon limit in hours.
-
-     Returns:
-         Lag functions.
-
-     Example:
-
-     .. code-block:: py
-
-         lag_functions = generate_lag_feature_functions(feature_names, horizon)
-
-     """
-     # Use extracted lag features if provided.
-     if feature_names is not None:
-         lag_times_minutes, lag_time_days_list = extract_lag_features(
-             feature_names, horizon
-         )
-     else:
-         # Generate available lag_times if no features are provided
-         lag_times_minutes, lag_time_days_list = generate_trivial_lag_features(horizon)
-
-     # Empty dict to store all generated lag functions
-     lag_functions = {}
-     for minutes in lag_times_minutes:  # Add intraday-lag functions (lags in minutes)
-
-         def func(x, shift=minutes):
-             return x.shift(freq="min", periods=1 * shift)
-
-         new = {"T-" + str(int(minutes)) + "min": func}
-         lag_functions.update(new)
-
-     # Add day lag functions:
-     for day in lag_time_days_list:
-
-         def func(x, shift=day):
-             return x.shift(freq="1d", periods=1 * shift)
-
-         new = {"T-" + str(int(day)) + "d": func}
-         lag_functions.update(new)
-     return lag_functions
-
-
- def extract_lag_features(
-     feature_names: list[str], horizon: float = 24.0
- ) -> tuple[list, list]:
-     """Creates a list of lag minutes and a list of lag days that were used during the training of the input model.
-
-     Args:
-         feature_names: All requested lag features.
-         horizon: Forecast horizon limit in hours.
-
-     Returns:
-         - List of minute lags that were used as features during training.
-         - List of day lags that were used as features during training.
-
-     """
-     # Prepare empty lists to append to
-     minutes_list = []
-     days_list = []
-
-     for lag_feature in feature_names:
-         # Select the number of days or the number of minutes by matching with a regular expression
-         number_of_minutes = re.search(r"T-(\d+)min", lag_feature)
-         number_of_days = re.search(r"T-(\d+)d", lag_feature)
-
-         # Append to the appropriate list
-         if number_of_minutes is not None:
-             minutes_list.append(int(number_of_minutes[1]))
-         elif number_of_days is not None:
-             days_list.append(int(number_of_days[1]))
-
-     # Discard lag times that are not available for the specified horizon
-     minutes_list = list(set([i for i in minutes_list if i >= horizon * 60]))
-     days_list = list(set([i for i in days_list if i >= horizon / 24]))
-
-     return minutes_list, days_list
-
-
- def generate_trivial_lag_features(horizon: float) -> tuple[list, list]:
-     """Generates relevant lag times for lag feature function creation.
-
-     This function is mostly used during training of models and not during predicting.
-
-     Args:
-         horizon: Forecast horizon limit in hours.
-
-     Returns:
-         - List of minute lags that were used as features during training.
-         - List of day lags that were used as features during training.
-
-     """
-     mindays = min(int(np.ceil(horizon / 24)), 15)
-     lag_time_days_list = list(np.linspace(mindays, 14, 15 - mindays))
-
-     # Make list of trivial lag times
-     trivial_lag_minutes_list = np.linspace(60, 23 * 60, 23).tolist() + [15, 30, 45]
-
-     # Discard lag times that are not available for the specified horizon
-     trivial_lag_times_minutes = list(
-         set([i for i in trivial_lag_minutes_list if i >= horizon * 60])
-     )
-
-     return trivial_lag_times_minutes, lag_time_days_list
-
-
- def generate_non_trivial_lag_times(
-     data: pd.DataFrame, height_threshold: float = 0.1
- ) -> list[int]:
-     """Calculate an autocorrelation curve of the load trace.
-
-     This curve is subsequently used to add additional lag times as features.
-
-     Args:
-         data: Dataframe with input data in the form pd.DataFrame(index = datetime,
-             columns = [label, predictor_1,..., predictor_n])
-         height_threshold: Minimal autocorrelation value to be recognized as a peak.
-
-     Returns:
-         Additional non-trivial minute lags.
-
-     """
-
-     def autocorr(x: np.array, lags: range) -> np.array:
-         """Make an autocorrelation curve."""
-         mean = x.mean()
-         var = np.var(x)
-         xp = x - mean
-         corr = np.correlate(xp, xp, "full")[len(x) - 1 :] / var / len(x)
-
-         return corr[: len(lags)]
-
-     try:
-         # Get rid of nans as the autocorrelation handles these values badly
-         data = data[data.columns[0]].dropna()  # First column contains the load
-         # Get autocorrelation curve
-         y = autocorr(data, range(10000))
-         # Determine the peaks (positive and negative) larger than a specified threshold
-         peaks = scipy.signal.find_peaks(np.abs(y), height=height_threshold)
-         peaks = peaks[0]
-         # Convert peaks to lag times in minutes
-         peaks = peaks[peaks < (60 * 4)]
-         additional_minute_space = peaks * 15
-     except Exception:
-         return []
-     # Return list of additional minute lags to be processed by apply_features
-     return list(additional_minute_space)
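The removed lag-feature API pairs feature-name parsing (the `T-…min` / `T-…d` patterns) with shift functions that are applied to a datetime-indexed series. A minimal sketch against the 3.4.56 API, assuming an illustrative 15-minute load series:

    import pandas as pd

    from openstef.feature_engineering.lag_features import (
        generate_lag_feature_functions,
    )

    index = pd.date_range("2024-01-01", periods=8 * 96, freq="15min")
    df = pd.DataFrame({"load": range(len(index))}, index=index)

    # horizon=0.25h keeps all three requested lags; larger horizons discard
    # lags that would not be available at forecast time
    lag_functions = generate_lag_feature_functions(
        feature_names=["T-15min", "T-60min", "T-1d"], horizon=0.25
    )
    for name, func in lag_functions.items():
        # e.g. the "T-15min" column holds the load observed 15 minutes earlier
        df[name] = func(df["load"])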
openstef/feature_engineering/missing_values_transformer.py
@@ -1,141 +0,0 @@
- # SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501>
- #
- # SPDX-License-Identifier: MPL-2.0
- from typing import Union, List, Optional
-
- import numpy as np
- import pandas as pd
- from sklearn.impute import SimpleImputer
- from sklearn.preprocessing import FunctionTransformer
- from sklearn.utils.validation import check_array, check_is_fitted
-
-
- class MissingValuesTransformer:
-     """MissingValuesTransformer handles missing values in data by imputing them with a given strategy.
-
-     It also removes columns that are always null from the data.
-
-     """
-
-     in_feature_names: Optional[List[str]] = None
-     _n_in_features: Optional[int] = None
-
-     non_null_feature_names: List[str] = None
-
-     def __init__(
-         self,
-         missing_values: Union[int, float, str, None] = np.nan,
-         imputation_strategy: str = None,
-         fill_value: Union[str, int, float] = None,
-         no_fill_future_values_features: List[str] = None,
-     ):
-         """Initialize missing values handler.
-
-         Args:
-             missing_values: The placeholder for the missing values. All occurrences of
-                 `missing_values` will be imputed.
-             imputation_strategy: The imputation strategy to use.
-                 Can be one of "mean", "median", "most_frequent", "constant" or None.
-             fill_value: When strategy == "constant", fill_value is used to replace all
-                 occurrences of missing_values.
-             no_fill_future_values_features: The features for which it does not make sense
-                 to fill future values. Rows that contain trailing null values for these
-                 features will be removed from the data.
-
-         """
-         self.missing_values = missing_values
-         self.imputation_strategy = imputation_strategy
-         self.fill_value = fill_value
-         self.no_fill_future_values_features = no_fill_future_values_features or []
-         self.is_fitted_ = False
-
-         # Build the proper imputation transformer:
-         # - identity function if strategy is None
-         # - SimpleImputer with the dedicated strategy otherwise
-         if self.imputation_strategy is None:
-             self.imputer_ = FunctionTransformer(func=self._identity)
-         else:
-             self.imputer_ = SimpleImputer(
-                 missing_values=self.missing_values,
-                 strategy=self.imputation_strategy,
-                 fill_value=self.fill_value,
-             ).set_output(transform="pandas")
-             self.imputer_._validate_params()
-
-     @staticmethod
-     def _determine_trailing_null_rows(x: pd.DataFrame) -> pd.Series:
-         """Determine rows with trailing null values in a DataFrame."""
-         return ~x.bfill().isnull().any(axis="columns")
-
-     def fit(self, x, y=None):
-         """Fit the imputer on the input data."""
-         _ = check_array(x, force_all_finite="allow-nan")
-         if not isinstance(x, pd.DataFrame):
-             x = pd.DataFrame(np.asarray(x))
-
-         self.in_feature_names = list(x.columns)
-         self._n_in_features = x.shape[1]
-
-         # Remove always null columns
-         is_column_null = x.isnull().all(axis="index")
-         self.non_null_feature_names = list(x.columns[~is_column_null])
-         x = x[self.non_null_feature_names]
-
-         # Remove trailing null rows for features that should
-         # not be imputed in the future
-         trailing_null_rows = self._determine_trailing_null_rows(
-             x[self.no_fill_future_values_features]
-         )
-         x = x.loc[trailing_null_rows]
-
-         # Imputers do not support labels
-         self.imputer_.fit(X=x, y=None)
-         self.is_fitted_ = True
-
-     def transform(self, x) -> pd.DataFrame:
-         """Transform the input data by imputing missing values."""
-         check_is_fitted(self)
-         _ = check_array(x, force_all_finite="allow-nan")
-         if not isinstance(x, pd.DataFrame):
-             x = pd.DataFrame(np.asarray(x))
-
-         x = x[self.non_null_feature_names]
-
-         transformed = self.imputer_.transform(x)
-
-         return transformed
-
-     def fit_transform(self, x, y=None) -> tuple[pd.DataFrame, Optional[pd.Series]]:
-         """Fit the imputer on the input data and transform it.
-
-         Returns:
-             The data with missing values imputed.
-
-         """
-         self.fit(x, y)
-
-         if not isinstance(x, pd.DataFrame):
-             x = pd.DataFrame(np.asarray(x))
-
-         x = x[self.non_null_feature_names]
-
-         # Remove trailing null rows for features that should
-         # not be imputed in the future
-         non_trailing_null_rows = self._determine_trailing_null_rows(
-             x[self.no_fill_future_values_features]
-         )
-         x = x.loc[non_trailing_null_rows]
-
-         x = self.transform(x)
-
-         if y is not None:
-             y = y.loc[non_trailing_null_rows]
-
-         return x, y
-
-     @classmethod
-     def _identity(cls, x):
-         return x
-
-     def __sklearn_is_fitted__(self) -> bool:
-         return self.in_feature_names is not None
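The removed transformer combines three behaviours: always-null columns are dropped, rows with trailing nulls in `no_fill_future_values_features` are removed, and remaining gaps are imputed. A minimal sketch against the 3.4.56 API, with illustrative column names and values:

    import numpy as np
    import pandas as pd

    from openstef.feature_engineering.missing_values_transformer import (
        MissingValuesTransformer,
    )

    X = pd.DataFrame(
        {
            "radiation": [0.1, np.nan, 0.3, np.nan],  # trailing NaN: last row dropped
            "windspeed": [5.0, np.nan, 6.0, 7.0],     # interior NaN: imputed
            "unused": [np.nan] * 4,                   # always-null column: removed
        }
    )
    y = pd.Series([1.0, 2.0, 3.0, 4.0])

    transformer = MissingValuesTransformer(
        imputation_strategy="mean",
        no_fill_future_values_features=["radiation"],
    )
    # X_clean keeps 3 rows and 2 columns; y_clean is aligned to the kept rows
    X_clean, y_clean = transformer.fit_transform(X, y)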
openstef/feature_engineering/rolling_features.py
@@ -1,58 +0,0 @@
- # SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501>
- #
- # SPDX-License-Identifier: MPL-2.0
- from datetime import timedelta
-
- import pandas as pd
-
- from openstef.data_classes.prediction_job import PredictionJobDataClass
- from pydantic import TypeAdapter
-
-
- def convert_timedelta_to_isoformat(td: timedelta) -> str:
-     """
-     Converts a timedelta to an ISO 8601 formatted period string.
-
-     Args:
-         td: timedelta object to convert.
-
-     Returns:
-         ISO 8601 formatted period string.
-     """
-     timedelta_adapter = TypeAdapter(timedelta)
-     return timedelta_adapter.dump_python(td, mode="json")
-
-
- def add_rolling_aggregate_features(
-     data: pd.DataFrame,
-     pj: PredictionJobDataClass,
-     rolling_window: timedelta = timedelta(hours=24),
- ) -> pd.DataFrame:
-     """
-     Adds rolling aggregate features to the input dataframe.
-
-     These features are calculated with an aggregation over a rolling window of the data.
-     A list of requested features is used to determine whether to add the rolling features
-     or not.
-
-     Args:
-         data: Input dataframe to which the rolling features will be added.
-         pj: Prediction job data.
-         rolling_window: Rolling window size for the aggregation.
-
-     Returns:
-         DataFrame with added rolling features.
-     """
-     # Ensure the index is a DatetimeIndex
-     if not isinstance(data.index, pd.DatetimeIndex):
-         raise ValueError("The DataFrame index must be a DatetimeIndex.")
-
-     if "load" not in data.columns:
-         raise ValueError("The DataFrame must contain a 'load' column.")
-     rolling_window_load = data["load"].rolling(window=rolling_window)
-
-     for aggregate_func in pj["rolling_aggregate_features"]:
-         data[
-             f"rolling_{aggregate_func.value}_load_{convert_timedelta_to_isoformat(rolling_window)}"
-         ] = rolling_window_load.aggregate(aggregate_func.value)
-     return data
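The removed helper only reads `pj["rolling_aggregate_features"]` and takes `.value` from each entry, so a minimal stand-in enum and mapping suffice to sketch its use. Both stand-ins below are assumptions for illustration, not package API; the real prediction job defines its own aggregate-function enum:

    from datetime import timedelta
    from enum import Enum

    import pandas as pd

    from openstef.feature_engineering.rolling_features import (
        add_rolling_aggregate_features,
    )

    # Stand-in for the package's aggregate-function enum; only .value is used
    class Agg(Enum):
        MEAN = "mean"
        MAX = "max"

    index = pd.date_range("2024-01-01", periods=48, freq="h")
    data = pd.DataFrame({"load": range(48)}, index=index)

    # A plain mapping stands in for PredictionJobDataClass here
    pj = {"rolling_aggregate_features": [Agg.MEAN, Agg.MAX]}

    out = add_rolling_aggregate_features(data, pj, rolling_window=timedelta(hours=24))
    # Column names embed the window as an ISO 8601 duration,
    # e.g. "rolling_mean_load_P1D" for a 24-hour window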