openstef 3.4.56__py3-none-any.whl → 4.0.0a3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. openstef-4.0.0a3.dist-info/METADATA +177 -0
  2. openstef-4.0.0a3.dist-info/RECORD +4 -0
  3. {openstef-3.4.56.dist-info → openstef-4.0.0a3.dist-info}/WHEEL +1 -2
  4. openstef/__init__.py +0 -14
  5. openstef/__main__.py +0 -3
  6. openstef/app_settings.py +0 -19
  7. openstef/data/NL_terrestrial_radiation.csv +0 -25585
  8. openstef/data/NL_terrestrial_radiation.csv.license +0 -3
  9. openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z +0 -0
  10. openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z.license +0 -3
  11. openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md +0 -18
  12. openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md.license +0 -3
  13. openstef/data/dutch_holidays.csv +0 -1759
  14. openstef/data/dutch_holidays.csv.license +0 -3
  15. openstef/data/pv_single_coefs.csv +0 -601
  16. openstef/data/pv_single_coefs.csv.license +0 -3
  17. openstef/data_classes/__init__.py +0 -3
  18. openstef/data_classes/data_prep.py +0 -99
  19. openstef/data_classes/model_specifications.py +0 -30
  20. openstef/data_classes/prediction_job.py +0 -135
  21. openstef/data_classes/split_function.py +0 -97
  22. openstef/enums.py +0 -140
  23. openstef/exceptions.py +0 -74
  24. openstef/feature_engineering/__init__.py +0 -3
  25. openstef/feature_engineering/apply_features.py +0 -138
  26. openstef/feature_engineering/bidding_zone_to_country_mapping.py +0 -106
  27. openstef/feature_engineering/cyclic_features.py +0 -161
  28. openstef/feature_engineering/data_preparation.py +0 -152
  29. openstef/feature_engineering/feature_adder.py +0 -206
  30. openstef/feature_engineering/feature_applicator.py +0 -202
  31. openstef/feature_engineering/general.py +0 -141
  32. openstef/feature_engineering/holiday_features.py +0 -231
  33. openstef/feature_engineering/lag_features.py +0 -165
  34. openstef/feature_engineering/missing_values_transformer.py +0 -141
  35. openstef/feature_engineering/rolling_features.py +0 -58
  36. openstef/feature_engineering/weather_features.py +0 -492
  37. openstef/metrics/__init__.py +0 -3
  38. openstef/metrics/figure.py +0 -303
  39. openstef/metrics/metrics.py +0 -486
  40. openstef/metrics/reporter.py +0 -222
  41. openstef/model/__init__.py +0 -3
  42. openstef/model/basecase.py +0 -82
  43. openstef/model/confidence_interval_applicator.py +0 -242
  44. openstef/model/fallback.py +0 -77
  45. openstef/model/metamodels/__init__.py +0 -3
  46. openstef/model/metamodels/feature_clipper.py +0 -90
  47. openstef/model/metamodels/grouped_regressor.py +0 -222
  48. openstef/model/metamodels/missing_values_handler.py +0 -138
  49. openstef/model/model_creator.py +0 -214
  50. openstef/model/objective.py +0 -426
  51. openstef/model/objective_creator.py +0 -65
  52. openstef/model/regressors/__init__.py +0 -3
  53. openstef/model/regressors/arima.py +0 -197
  54. openstef/model/regressors/custom_regressor.py +0 -64
  55. openstef/model/regressors/dazls.py +0 -116
  56. openstef/model/regressors/flatliner.py +0 -95
  57. openstef/model/regressors/gblinear_quantile.py +0 -334
  58. openstef/model/regressors/lgbm.py +0 -29
  59. openstef/model/regressors/linear.py +0 -90
  60. openstef/model/regressors/linear_quantile.py +0 -305
  61. openstef/model/regressors/regressor.py +0 -114
  62. openstef/model/regressors/xgb.py +0 -52
  63. openstef/model/regressors/xgb_multioutput_quantile.py +0 -261
  64. openstef/model/regressors/xgb_quantile.py +0 -228
  65. openstef/model/serializer.py +0 -431
  66. openstef/model/standard_deviation_generator.py +0 -81
  67. openstef/model_selection/__init__.py +0 -3
  68. openstef/model_selection/model_selection.py +0 -311
  69. openstef/monitoring/__init__.py +0 -3
  70. openstef/monitoring/performance_meter.py +0 -92
  71. openstef/monitoring/teams.py +0 -203
  72. openstef/pipeline/__init__.py +0 -3
  73. openstef/pipeline/create_basecase_forecast.py +0 -133
  74. openstef/pipeline/create_component_forecast.py +0 -168
  75. openstef/pipeline/create_forecast.py +0 -171
  76. openstef/pipeline/optimize_hyperparameters.py +0 -317
  77. openstef/pipeline/train_create_forecast_backtest.py +0 -163
  78. openstef/pipeline/train_model.py +0 -561
  79. openstef/pipeline/utils.py +0 -52
  80. openstef/postprocessing/__init__.py +0 -3
  81. openstef/postprocessing/postprocessing.py +0 -275
  82. openstef/preprocessing/__init__.py +0 -3
  83. openstef/preprocessing/preprocessing.py +0 -42
  84. openstef/settings.py +0 -15
  85. openstef/tasks/__init__.py +0 -3
  86. openstef/tasks/calculate_kpi.py +0 -324
  87. openstef/tasks/create_basecase_forecast.py +0 -118
  88. openstef/tasks/create_components_forecast.py +0 -162
  89. openstef/tasks/create_forecast.py +0 -145
  90. openstef/tasks/create_solar_forecast.py +0 -420
  91. openstef/tasks/create_wind_forecast.py +0 -80
  92. openstef/tasks/optimize_hyperparameters.py +0 -135
  93. openstef/tasks/split_forecast.py +0 -273
  94. openstef/tasks/train_model.py +0 -224
  95. openstef/tasks/utils/__init__.py +0 -3
  96. openstef/tasks/utils/dependencies.py +0 -107
  97. openstef/tasks/utils/predictionjobloop.py +0 -243
  98. openstef/tasks/utils/taskcontext.py +0 -160
  99. openstef/validation/__init__.py +0 -3
  100. openstef/validation/validation.py +0 -322
  101. openstef-3.4.56.dist-info/METADATA +0 -154
  102. openstef-3.4.56.dist-info/RECORD +0 -102
  103. openstef-3.4.56.dist-info/top_level.txt +0 -1
  104. /openstef-3.4.56.dist-info/LICENSE → /openstef-4.0.0a3.dist-info/licenses/LICENSE.md +0 -0
@@ -1,561 +0,0 @@
1
- # SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501>
2
- #
3
- # SPDX-License-Identifier: MPL-2.0
4
- import logging
5
- import os
6
- from typing import Optional, Union, Tuple
7
-
8
- import pandas as pd
9
- import structlog
10
-
11
- from openstef.data_classes.model_specifications import ModelSpecificationDataClass
12
- from openstef.data_classes.prediction_job import PredictionJobDataClass
13
- from openstef.exceptions import (
14
- InputDataInsufficientError,
15
- InputDataWrongColumnOrderError,
16
- OldModelHigherScoreError,
17
- SkipSaveTrainingForecasts,
18
- )
19
- from openstef.feature_engineering.feature_applicator import TrainFeatureApplicator
20
- from openstef.metrics.reporter import Report, Reporter
21
- from openstef.model.model_creator import ModelCreator
22
- from openstef.model.regressors.regressor import OpenstfRegressor
23
- from openstef.model.serializer import MLflowSerializer
24
- from openstef.model.standard_deviation_generator import StandardDeviationGenerator
25
- from openstef.model_selection.model_selection import split_data_train_validation_test
26
- from openstef.settings import Settings
27
- from openstef.validation import validation
28
-
29
# Horizons (in hours) used for training when the prediction job does not
# provide `train_horizons_minutes`.
DEFAULT_TRAIN_HORIZONS_HOURS: list[float] = [0.25, 47.0]
# An existing model whose age is below this value is not retrained when the
# age check is enabled (the skip message reports this value in days).
MAXIMUM_MODEL_AGE: int = 7

# Value set for `early_stopping_rounds` on models that expose that parameter.
DEFAULT_EARLY_STOPPING_ROUNDS: int = 10
# The old model only wins the comparison when its score exceeds the new
# model's score multiplied by this factor.
PENALTY_FACTOR_OLD_MODEL: float = 1.2

# Configure structlog with a filtering logger based on the level name taken
# from the application settings.
structlog.configure(
    wrapper_class=structlog.make_filtering_bound_logger(
        logging.getLevelName(Settings.log_level)
    )
)
# Module-level logger used by the pipeline steps in this file.
logger = structlog.get_logger(__name__)
41
-
42
-
43
def train_model_pipeline(
    pj: PredictionJobDataClass,
    input_data: pd.DataFrame,
    check_old_model_age: bool,
    mlflow_tracking_uri: str,
    artifact_folder: str,
) -> Optional[tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]]:
    """Middle level pipeline that takes care of all persistent storage dependencies.

    Loads the previous model (if any), trains a new one with the core pipeline,
    stores the new model and its report, and removes older models.

    Expected prediction jobs keys: "id",
    "model", "hyper_params", "feature_names".

    Args:
        pj: Prediction job
        input_data: Raw training input data
        check_old_model_age: Check if training should be skipped because the model is too young
        mlflow_tracking_uri: Tracking URI for MLFlow
        artifact_folder: Path where artifacts, such as trained models, are stored.
            When falsy, the report is not written to disk.

    Returns:
        If pj.save_train_forecasts is False, None is returned
        Otherwise:
        - The train dataset with forecasts
        - The validation dataset with forecasts
        - The test dataset with forecasts

    Raises:
        InputDataInsufficientError: when input data is insufficient.
        InputDataWrongColumnOrderError: when input data has an invalid column order.
            'load' column should be first and 'horizon' column last.
        SkipSaveTrainingForecasts: If pj.save_train_forecasts is set and training
            is skipped because the old model is younger than `MAXIMUM_MODEL_AGE`
            or scores better than the new model. No model is saved in that case.

    """
    # Initialize serializer
    serializer = MLflowSerializer(mlflow_tracking_uri=mlflow_tracking_uri)

    # Get old model and age
    old_model, model_specs, old_model_age = train_pipeline_step_load_model(
        pj, serializer
    )

    # Check old model age and continue yes/no
    if (old_model_age < MAXIMUM_MODEL_AGE) and check_old_model_age:
        logger.warning(
            f"Old model is younger than {MAXIMUM_MODEL_AGE} days, skip training"
        )
        if pj.save_train_forecasts:
            raise SkipSaveTrainingForecasts
        return None

    # Horizons are configured in minutes on the prediction job, the core
    # pipeline expects hours.
    if pj.train_horizons_minutes is None:
        horizons = DEFAULT_TRAIN_HORIZONS_HOURS
    else:
        horizons = [
            horizon_minutes / 60 for horizon_minutes in pj.train_horizons_minutes
        ]

    # Train model with core pipeline
    try:
        model, report, model_specs_updated, data_sets = train_model_pipeline_core(
            pj,
            model_specs,
            input_data,
            old_model,
            horizons=horizons,
        )
    except OldModelHigherScoreError as old_model_error:
        logger.error(
            "Old model is better than new model",
            pid=pj["id"],
            exc_info=old_model_error,
        )
        if pj.save_train_forecasts:
            raise SkipSaveTrainingForecasts from old_model_error
        return None

    except InputDataInsufficientError as insufficient_error:
        logger.error(
            "Input data is insufficient after validation and cleaning",
            pid=pj["id"],
            exc_info=insufficient_error,
        )
        # Re-raise the caught exception itself so the original traceback and
        # arguments are preserved instead of being wrapped in a new instance.
        raise

    except InputDataWrongColumnOrderError as column_order_error:
        logger.error(
            "Wrong column order, 'load' column should be first and 'horizon' column"
            " last.",
            pid=pj["id"],
            exc_info=column_order_error,
        )
        # Same reasoning as above: keep the original exception intact.
        raise

    # Save model and report. Report is always saved to MLFlow and optionally to disk
    serializer.save_model(
        model=model,
        experiment_name=str(pj["id"]),
        model_type=pj["model"],
        model_specs=model_specs_updated,
        report=report,
    )
    if artifact_folder:
        report_folder = os.path.join(artifact_folder, str(pj["id"]))
        Reporter.write_report_to_disk(report=report, report_folder=report_folder)

    # Clean up older models
    serializer.remove_old_models(experiment_name=str(pj["id"]))

    if pj.save_train_forecasts:
        return data_sets
    return None
150
-
151
-
152
def train_model_pipeline_core(
    pj: PredictionJobDataClass,
    model_specs: ModelSpecificationDataClass,
    input_data: pd.DataFrame,
    old_model: Optional[OpenstfRegressor] = None,
    horizons: list[float] = DEFAULT_TRAIN_HORIZONS_HOURS,
) -> Tuple[
    OpenstfRegressor,
    Report,
    ModelSpecificationDataClass,
    tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame],
]:
    """Train model core pipeline.

    Trains a new model given a prediction job, input data and compares it to an old model.
    This pipeline has no database or persistent storage dependencies.

    Args:
        pj: Prediction job
        model_specs: Dataclass containing model specifications. Its
            `feature_names` attribute is overwritten with the columns of the
            training data.
        input_data: Input data
        old_model: Old model to compare to. Defaults to None, in which case no
            comparison is made.
        horizons: Horizons to train on in hours, relevant for feature engineering.

    Raises:
        InputDataInsufficientError: when input data is insufficient.
        InputDataWrongColumnOrderError: when input data has an invalid column order.
        OldModelHigherScoreError: When old model is better than new model.
        InputDataOngoingZeroFlatlinerError: when all recent load measurements are zero.

    Returns:
        - Fitted_model (OpenstfRegressor)
        - Report (Report)
        - Modelspecs (ModelSpecificationDataClass)
        - Datasets (tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]): The train, validation and test sets

    """
    # Call common pipeline
    (
        model,
        report,
        train_data,
        validation_data,
        test_data,
        operational_score_data,
    ) = train_pipeline_common(
        pj,
        model_specs,
        input_data,
        horizons,
    )
    model_specs.feature_names = list(train_data.columns)

    # Check if new model is better than old model
    if old_model is not None:
        # First column is the target, last column is excluded from the features.
        x_data, y_data = (
            operational_score_data.iloc[:, 1:-1],
            operational_score_data.iloc[:, 0],
        )

        # Score method always returns R^2
        score_new_model = model.score(x_data, y_data)

        # Try to compare new model to old model.
        # If this does not succeed, for example since the feature names of the
        # old model differ from the new model, the new model is considered better
        try:
            score_old_model = old_model.score(x_data, y_data)

            # Check if R^2 is better for old model
            # NOTE(review): when the new score is negative, multiplying by the
            # penalty factor lowers the threshold instead of raising it -
            # confirm this is the intended behavior.
            if score_old_model > score_new_model * PENALTY_FACTOR_OLD_MODEL:
                raise OldModelHigherScoreError(
                    f"Old model is better than new model for {pj['id']}."
                )

            logger.info(
                "New model is better than old model, continuing with training procces"
            )
        except ValueError as e:
            logger.info("Could not compare to old model", pid=pj["id"], exc_info=e)

    return model, report, model_specs, (train_data, validation_data, test_data)
239
-
240
-
241
def train_pipeline_common(
    pj: PredictionJobDataClass,
    model_specs: ModelSpecificationDataClass,
    input_data: pd.DataFrame,
    horizons: list[float],
    test_fraction: float = 0.0,
    backtest: bool = False,
    test_data_predefined: pd.DataFrame = pd.DataFrame(),
) -> tuple[
    OpenstfRegressor, Report, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame
]:
    """Run the steps shared by operational training and backtest training.

    The steps are: compute features, split the data, train the model and
    generate a report. When `pj.save_train_forecasts` is set, a "forecast"
    column is added to the train, validation and test data.

    Args:
        pj: Prediction job
        model_specs: Dataclass containing model specifications
        input_data: Input data
        horizons: horizons to train on in hours.
        test_fraction: fraction of data to use for testing
        backtest: boolean if we need to do a backtest
        test_data_predefined: Predefined test data frame to be used in the pipeline
            (empty data frame by default)

    Returns:
        - The trained model
        - Report
        - The train data
        - The validation data
        - The test data
        - The operational score data as returned by the split step

    Raises:
        InputDataInsufficientError: when input data is insufficient.
        InputDataWrongColumnOrderError: when input data has an invalid column order.
            'load' column should be first and 'horizon' column last.
        InputDataOngoingZeroFlatlinerError: when all recent load measurements are zero.

    """
    # Step 1: feature engineering and consistency checks
    featured_data = train_pipeline_step_compute_features(
        pj=pj,
        model_specs=model_specs,
        input_data=input_data,
        horizons=horizons,
    )

    # Step 2: train / validation / test split
    split_result = train_pipeline_step_split_data(
        data_with_features=featured_data,
        pj=pj,
        test_fraction=test_fraction,
        backtest=backtest,
        test_data_predefined=test_data_predefined,
    )
    train_data, validation_data, test_data, operational_score_data = split_result

    # Step 3: fit the model
    model = train_pipeline_step_train_model(
        pj=pj,
        model_specs=model_specs,
        train_data=train_data,
        validation_data=validation_data,
    )

    # Step 4: report about the training process
    report = Reporter(
        train_data, validation_data, test_data, pj.quantiles
    ).generate_report(model)

    # Optionally attach forecasts; the first and last columns are left out of
    # the model input.
    if pj.save_train_forecasts:
        for data_set in (train_data, validation_data, test_data):
            data_set["forecast"] = model.predict(data_set.iloc[:, 1:-1])

    return model, report, train_data, validation_data, test_data, operational_score_data
315
-
316
-
317
def train_pipeline_step_load_model(
    pj: PredictionJobDataClass, serializer: MLflowSerializer
) -> Tuple[
    Optional[OpenstfRegressor], ModelSpecificationDataClass, Union[int, float]
]:
    """Load the stored model for a prediction job, falling back to no model.

    Args:
        pj: Prediction job
        serializer: Serializer used to load the model of experiment `str(pj.id)`

    Returns:
        - The old model, or None when it could not be loaded
        - The model specifications: those stored with the old model, otherwise
          the default model specs of the prediction job, otherwise a basic
          specification that only carries the prediction job id
        - The age of the old model, or infinity when there is no old model

    Raises:
        RuntimeError: when the default model specs of the prediction job carry
            a different id than the prediction job itself.

    """
    try:
        old_model, model_specs = serializer.load_model(experiment_name=str(pj.id))
        old_model_age = old_model.age  # Age attribute is openstef specific
    except (AttributeError, FileNotFoundError, LookupError):
        logger.warning("No old model found, training new model", pid=pj.id)
    except Exception:
        # Deliberate best effort: any other loading problem is logged and a
        # new model is trained instead.
        logger.exception("Old model could not be loaded, training new model", pid=pj.id)
    else:
        return old_model, model_specs, old_model_age

    # No usable old model: fall back to default or basic model specifications.
    default_model_specs = pj["default_modelspecs"]
    if default_model_specs is not None:
        if default_model_specs.id != pj.id:
            raise RuntimeError(
                "The id of the prediction job and its default model_specs do not"
                " match."
            )
        model_specs = default_model_specs
    else:
        # create basic model_specs
        model_specs = ModelSpecificationDataClass(id=pj["id"])

    return None, model_specs, float("inf")
343
-
344
-
345
def train_pipeline_step_compute_features(
    pj: PredictionJobDataClass,
    model_specs: ModelSpecificationDataClass,
    input_data: pd.DataFrame,
    horizons: Union[list[float], str] = DEFAULT_TRAIN_HORIZONS_HOURS,
) -> pd.DataFrame:
    """Compute features and perform consistency checks.

    Args:
        pj: Prediction job
        model_specs: Dataclass containing model specifications
        input_data: Input data
        horizons: horizons to train on in hours, or the name of a column in the
            input data that holds the horizon. Defaults to
            `DEFAULT_TRAIN_HORIZONS_HOURS`.

    Returns:
        The dataframe with features needed to train the model

    Raises:
        InputDataInsufficientError: when input data is insufficient.
        InputDataWrongColumnOrderError: when the 'load' column is missing from
            the input data.
        ValueError: when the horizon is a string and the corresponding column is not in the input data
        InputDataOngoingZeroFlatlinerError: when all recent load measurements are zero.

    """
    if input_data.empty:
        raise InputDataInsufficientError("Input dataframe is empty")
    if "load" not in input_data.columns:
        raise InputDataWrongColumnOrderError(
            "Missing the load column in the input dataframe"
        )

    if isinstance(horizons, str):
        if horizons not in set(input_data.columns):
            raise ValueError(
                f"The horizon parameter specifies a column name ({horizons}) missing in"
                " the input data."
            )
        # sort data to avoid same date repeated multiple time
        input_data = input_data.sort_values(horizons)

    # Validate and clean data
    validated_data = validation.drop_target_na(
        validation.validate(
            pj["id"],
            input_data,
            pj["flatliner_threshold_minutes"],
            pj["resolution_minutes"],
        )
    )
    # Check if sufficient data is left after cleaning
    if not validation.is_data_sufficient(
        validated_data,
        pj["completeness_threshold"],
        pj["minimal_table_length"],
    ):
        raise InputDataInsufficientError(
            "Input data is insufficient, after validation and cleaning"
        )

    # Custom data prep or legacy behavior
    if pj.data_prep_class:
        data_prep_class, data_prep_args = pj.data_prep_class.load()
        return data_prep_class(
            pj=pj,
            model_specs=model_specs,
            horizons=horizons,
            **data_prep_args,
        ).prepare_train_data(validated_data)

    return TrainFeatureApplicator(
        horizons=horizons,
        feature_names=model_specs.feature_names,
        feature_modules=model_specs.feature_modules,
    ).add_features(validated_data, pj=pj)
421
-
422
-
423
def train_pipeline_step_train_model(
    pj: PredictionJobDataClass,
    model_specs: ModelSpecificationDataClass,
    train_data: pd.DataFrame,
    validation_data: pd.DataFrame,
) -> OpenstfRegressor:
    """Train the model.

    Args:
        pj: Prediction job
        model_specs: Dataclass containing model specifications
        train_data: The training data
        validation_data: The validation data, used as second evaluation set
            during fitting and for the standard deviation determination

    Returns:
        The trained model

    Raises:
        NotImplementedError: When using invalid model type in the prediction job.
        InputDataWrongColumnOrderError: When 'load' column is not first and 'horizon' column is not last.

    """
    # Test if first column is "load" and last column is "horizon"
    if train_data.columns[0] != "load" or train_data.columns[-1] != "horizon":
        raise InputDataWrongColumnOrderError(
            f"Wrong column order for {pj['id']} "
            "'load' column should be first and 'horizon' column last."
        )

    # Create relevant model
    model = ModelCreator.create_model(
        pj["model"],
        quantiles=pj["quantiles"],
        **(pj.model_kwargs or {}),
    )

    # split x and y data
    train_x, train_y = train_data.iloc[:, 1:-1], train_data.iloc[:, 0]
    validation_x, validation_y = (
        validation_data.iloc[:, 1:-1],
        validation_data.iloc[:, 0],
    )

    # Configure evals for early stopping
    eval_set = [(train_x, train_y), (validation_x, validation_y)]

    # Set relevant hyperparameters
    # Look up the parameters supported by the model once instead of once per key.
    supported_params = model.get_params()
    # define protected hyperparams which are derived from prediction_job
    protected_hyperparams = ["quantiles"]
    valid_hyper_parameters = {
        key: value
        for key, value in model_specs.hyper_params.items()
        if key in supported_params and key not in protected_hyperparams
    }

    # Add early stopping to set_params if this is supported by the model
    if "early_stopping_rounds" in supported_params:
        valid_hyper_parameters["early_stopping_rounds"] = DEFAULT_EARLY_STOPPING_ROUNDS

    # Temporary fix to allow xgboost version upgrade -> set n_estimators if present and None
    if not valid_hyper_parameters.get("n_estimators", True):
        valid_hyper_parameters["n_estimators"] = 100
        logger.info("Deprecation warning: n_estimators=None found, overwriting.")

    model.set_params(**valid_hyper_parameters)
    model.fit(
        train_x,
        train_y,
        eval_set=eval_set,
        verbose=False,
    )
    # Gets the feature importance df or None if we don't have feature importance
    model.feature_importance_dataframe = model.set_feature_importance()

    logger.info("Fitted a new model, not yet stored")

    # Do confidence interval determination
    model = StandardDeviationGenerator(
        validation_data
    ).generate_standard_deviation_data(model)

    return model
507
-
508
-
509
def train_pipeline_step_split_data(
    data_with_features: pd.DataFrame,
    pj: PredictionJobDataClass,
    test_fraction: float,
    backtest: bool = False,
    test_data_predefined: pd.DataFrame = pd.DataFrame(),
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split the data into train, validation and test sets (the default way).

    Uses the split function configured on the prediction job, or the default
    split function when none is configured.

    Args:
        data_with_features: Input data
        pj: Prediction job
        test_fraction: fraction of data to use for testing
        backtest: boolean if we need to do a backtest
        test_data_predefined: Predefined test data frame to be used in the pipeline
            (empty data frame by default)

    Returns:
        - Train dataset
        - Validation dataset
        - Test dataset
        - Operational score dataset, as returned by the split function

    """
    # With a predefined test set, rows sharing its index are set apart and the
    # split is applied to the remaining rows only.
    if not test_data_predefined.empty:
        in_predefined = data_with_features.index.isin(test_data_predefined.index)
        test_data_predefined = data_with_features[in_predefined].sort_index()
        data_with_features = data_with_features[~in_predefined].sort_index()

    # Select the split function and its extra arguments
    custom_split = pj.train_split_func
    if custom_split is not None:
        split_func, split_args = custom_split.load(
            required_arguments=["data", "test_fraction"]
        )
    else:
        split_func = split_data_train_validation_test
        split_args = {
            "stratification_min_max": True,
            "back_test": backtest,
        }

    train_data, validation_data, test_data, operational_score_data = split_func(
        data_with_features, test_fraction, **split_args
    )

    # The (re-selected) predefined test rows take precedence over the test
    # data returned by the split function, unless none were found.
    if not test_data_predefined.empty:
        test_data = test_data_predefined

    return train_data, validation_data, test_data, operational_score_data
@@ -1,52 +0,0 @@
1
- # SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501>
2
- #
3
- # SPDX-License-Identifier: MPL-2.0
4
- from datetime import datetime
5
-
6
- import pandas as pd
7
- import scipy.ndimage as mnts
8
-
9
-
10
def generate_forecast_datetime_range(
    forecast_data: pd.DataFrame,
) -> tuple[datetime, datetime]:
    """Generate forecast range based on last cluster of null values in first target column of forecast data.

    Example:
        A forecast dataset with data between 2021-11-05 and 2021-11-19, and the
        target column 'load' as first column is given as input to this function. The first
        column 'load' has null values between 2021-11-17 04:00:00 and 2021-11-19 05:00:00.
        The null values at the end of the column indicate when forecasts are needed.
        Therefore this function sets starting time of forecasts as 2021-11-17 04:00:00 and
        end time of forecasts as 2021-11-19 05:00:00.

    Args:
        forecast_data: The forecast dataframe. The first column is the target
            column; the index entries must offer `to_pydatetime()`.

    Returns:
        Start and end datetimes of the forecast range. The start is the first
        index of the last cluster of consecutive null values, the end is always
        the last index of the forecast data.

    Raises:
        ValueError: If the target column does not have null values.

    """
    # By labeling the True/False values (based on the isnull() statement) as clusters,
    # we find what True value belongs to what cluster and the amount of True clusters.
    target_is_null = forecast_data.iloc[:, 0].isnull().values
    label_clusters, n_clusters = mnts.label(target_is_null)

    # If there are zero true clusters, it means the target column does not have nulls
    if n_clusters == 0:
        raise ValueError(
            "Forecast target column must have null values to indicate "
            "when forecast starts and ends."
        )

    # Clusters are numbered from 1, so the highest label marks the last cluster.
    # Only the index is masked; there is no need to copy the matching rows.
    in_last_cluster = label_clusters == n_clusters
    forecast_start_dt = forecast_data.index[in_last_cluster][0].to_pydatetime()

    # Forecast end is based on last datetime of given forecast data
    forecast_end_dt = forecast_data.index[-1].to_pydatetime()
    return forecast_start_dt, forecast_end_dt
@@ -1,3 +0,0 @@
1
- # SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501>
2
- #
3
- # SPDX-License-Identifier: MPL-2.0