openstef 3.4.56__py3-none-any.whl → 4.0.0a3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openstef-4.0.0a3.dist-info/METADATA +177 -0
- openstef-4.0.0a3.dist-info/RECORD +4 -0
- {openstef-3.4.56.dist-info → openstef-4.0.0a3.dist-info}/WHEEL +1 -2
- openstef/__init__.py +0 -14
- openstef/__main__.py +0 -3
- openstef/app_settings.py +0 -19
- openstef/data/NL_terrestrial_radiation.csv +0 -25585
- openstef/data/NL_terrestrial_radiation.csv.license +0 -3
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z +0 -0
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_baseline_model.z.license +0 -3
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md +0 -18
- openstef/data/dazls_model_3.4.24/dazls_stored_3.4.24_model_card.md.license +0 -3
- openstef/data/dutch_holidays.csv +0 -1759
- openstef/data/dutch_holidays.csv.license +0 -3
- openstef/data/pv_single_coefs.csv +0 -601
- openstef/data/pv_single_coefs.csv.license +0 -3
- openstef/data_classes/__init__.py +0 -3
- openstef/data_classes/data_prep.py +0 -99
- openstef/data_classes/model_specifications.py +0 -30
- openstef/data_classes/prediction_job.py +0 -135
- openstef/data_classes/split_function.py +0 -97
- openstef/enums.py +0 -140
- openstef/exceptions.py +0 -74
- openstef/feature_engineering/__init__.py +0 -3
- openstef/feature_engineering/apply_features.py +0 -138
- openstef/feature_engineering/bidding_zone_to_country_mapping.py +0 -106
- openstef/feature_engineering/cyclic_features.py +0 -161
- openstef/feature_engineering/data_preparation.py +0 -152
- openstef/feature_engineering/feature_adder.py +0 -206
- openstef/feature_engineering/feature_applicator.py +0 -202
- openstef/feature_engineering/general.py +0 -141
- openstef/feature_engineering/holiday_features.py +0 -231
- openstef/feature_engineering/lag_features.py +0 -165
- openstef/feature_engineering/missing_values_transformer.py +0 -141
- openstef/feature_engineering/rolling_features.py +0 -58
- openstef/feature_engineering/weather_features.py +0 -492
- openstef/metrics/__init__.py +0 -3
- openstef/metrics/figure.py +0 -303
- openstef/metrics/metrics.py +0 -486
- openstef/metrics/reporter.py +0 -222
- openstef/model/__init__.py +0 -3
- openstef/model/basecase.py +0 -82
- openstef/model/confidence_interval_applicator.py +0 -242
- openstef/model/fallback.py +0 -77
- openstef/model/metamodels/__init__.py +0 -3
- openstef/model/metamodels/feature_clipper.py +0 -90
- openstef/model/metamodels/grouped_regressor.py +0 -222
- openstef/model/metamodels/missing_values_handler.py +0 -138
- openstef/model/model_creator.py +0 -214
- openstef/model/objective.py +0 -426
- openstef/model/objective_creator.py +0 -65
- openstef/model/regressors/__init__.py +0 -3
- openstef/model/regressors/arima.py +0 -197
- openstef/model/regressors/custom_regressor.py +0 -64
- openstef/model/regressors/dazls.py +0 -116
- openstef/model/regressors/flatliner.py +0 -95
- openstef/model/regressors/gblinear_quantile.py +0 -334
- openstef/model/regressors/lgbm.py +0 -29
- openstef/model/regressors/linear.py +0 -90
- openstef/model/regressors/linear_quantile.py +0 -305
- openstef/model/regressors/regressor.py +0 -114
- openstef/model/regressors/xgb.py +0 -52
- openstef/model/regressors/xgb_multioutput_quantile.py +0 -261
- openstef/model/regressors/xgb_quantile.py +0 -228
- openstef/model/serializer.py +0 -431
- openstef/model/standard_deviation_generator.py +0 -81
- openstef/model_selection/__init__.py +0 -3
- openstef/model_selection/model_selection.py +0 -311
- openstef/monitoring/__init__.py +0 -3
- openstef/monitoring/performance_meter.py +0 -92
- openstef/monitoring/teams.py +0 -203
- openstef/pipeline/__init__.py +0 -3
- openstef/pipeline/create_basecase_forecast.py +0 -133
- openstef/pipeline/create_component_forecast.py +0 -168
- openstef/pipeline/create_forecast.py +0 -171
- openstef/pipeline/optimize_hyperparameters.py +0 -317
- openstef/pipeline/train_create_forecast_backtest.py +0 -163
- openstef/pipeline/train_model.py +0 -561
- openstef/pipeline/utils.py +0 -52
- openstef/postprocessing/__init__.py +0 -3
- openstef/postprocessing/postprocessing.py +0 -275
- openstef/preprocessing/__init__.py +0 -3
- openstef/preprocessing/preprocessing.py +0 -42
- openstef/settings.py +0 -15
- openstef/tasks/__init__.py +0 -3
- openstef/tasks/calculate_kpi.py +0 -324
- openstef/tasks/create_basecase_forecast.py +0 -118
- openstef/tasks/create_components_forecast.py +0 -162
- openstef/tasks/create_forecast.py +0 -145
- openstef/tasks/create_solar_forecast.py +0 -420
- openstef/tasks/create_wind_forecast.py +0 -80
- openstef/tasks/optimize_hyperparameters.py +0 -135
- openstef/tasks/split_forecast.py +0 -273
- openstef/tasks/train_model.py +0 -224
- openstef/tasks/utils/__init__.py +0 -3
- openstef/tasks/utils/dependencies.py +0 -107
- openstef/tasks/utils/predictionjobloop.py +0 -243
- openstef/tasks/utils/taskcontext.py +0 -160
- openstef/validation/__init__.py +0 -3
- openstef/validation/validation.py +0 -322
- openstef-3.4.56.dist-info/METADATA +0 -154
- openstef-3.4.56.dist-info/RECORD +0 -102
- openstef-3.4.56.dist-info/top_level.txt +0 -1
- /openstef-3.4.56.dist-info/LICENSE → /openstef-4.0.0a3.dist-info/licenses/LICENSE.md +0 -0
openstef/model/serializer.py
DELETED
|
@@ -1,431 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501>
|
|
2
|
-
#
|
|
3
|
-
# SPDX-License-Identifier: MPL-2.0
|
|
4
|
-
import json
|
|
5
|
-
import logging
|
|
6
|
-
import os
|
|
7
|
-
import shutil
|
|
8
|
-
from datetime import datetime
|
|
9
|
-
from json import JSONDecodeError
|
|
10
|
-
from typing import Optional, Union
|
|
11
|
-
from urllib.parse import unquote, urlparse
|
|
12
|
-
|
|
13
|
-
import mlflow
|
|
14
|
-
import numpy as np
|
|
15
|
-
import pandas as pd
|
|
16
|
-
import structlog
|
|
17
|
-
from mlflow.exceptions import MlflowException
|
|
18
|
-
from mlflow.store.artifact.artifact_repository_registry import get_artifact_repository
|
|
19
|
-
from xgboost import XGBModel # Temporary for backward compatibility
|
|
20
|
-
|
|
21
|
-
from openstef.data_classes.model_specifications import ModelSpecificationDataClass
|
|
22
|
-
from openstef.metrics.reporter import Report
|
|
23
|
-
from openstef.model.regressors.regressor import OpenstfRegressor
|
|
24
|
-
from openstef.settings import Settings
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
class MLflowSerializer:
    """Save, load and prune sklearn-compatible models through MLflow.

    Experiment names handed to the public methods are prefixed with the value
    of the ``DATABRICKS_WORKSPACE_PATH`` environment variable (empty string
    when unset) before they are used to select or search an MLflow experiment.
    """

    def __init__(self, mlflow_tracking_uri: str):
        """Configure logging and point MLflow at the given tracking server.

        Args:
            mlflow_tracking_uri: URI passed to ``mlflow.set_tracking_uri``.

        """
        structlog.configure(
            wrapper_class=structlog.make_filtering_bound_logger(
                logging.getLevelName(Settings.log_level)
            )
        )
        self.logger = structlog.get_logger(self.__class__.__name__)
        mlflow.set_tracking_uri(mlflow_tracking_uri)
        self.logger.debug(f"MLflow tracking uri at init= {mlflow_tracking_uri}")
        self.experiment_name_prefix = os.environ.get("DATABRICKS_WORKSPACE_PATH", "")

    def save_model(
        self,
        model: OpenstfRegressor,
        experiment_name: str,
        model_type: str,
        model_specs: ModelSpecificationDataClass,
        report: Report,
        phase: str = "training",
        **kwargs,
    ) -> None:
        """Save sklearn compatible model to MLFlow.

        Args:
            model: Trained model to store.
            experiment_name: Name of the experiment (without prefix); also used
                as the run name.
            model_type: Stored as the ``model_type`` tag.
            model_specs: Feature names, feature modules and hyper parameters to log.
            report: Provides metrics, model signature and figures.
            phase: Stored as the ``phase`` tag.
            **kwargs: Extra information to log, see ``_log_model_with_mlflow``.

        """
        mlflow.set_experiment(
            experiment_name=self.experiment_name_prefix + experiment_name
        )
        with mlflow.start_run(run_name=experiment_name):
            self._log_model_with_mlflow(
                model=model,
                experiment_name=experiment_name,
                model_type=model_type,
                model_specs=model_specs,
                report=report,
                phase=phase,
                **kwargs,
            )
            self._log_figures_with_mlflow(report)

    def _log_model_with_mlflow(
        self,
        model: OpenstfRegressor,
        experiment_name: str,
        model_type: str,
        model_specs: ModelSpecificationDataClass,
        report: Report,
        phase: str,
        **kwargs,
    ) -> None:
        """Log model, tags, metrics and parameters to the active MLflow run.

        Must be called inside an active run (it reads ``mlflow.active_run()``).

        Note: **kwargs has extra information to be logged with mlflow. Dict
        values are logged as ``<key>.json`` artifacts, str/int values as tags;
        anything else is skipped with a warning.

        """
        # Look up the previous run so the new run can point back to it.
        # NOTE(review): "previous" relies on the default ordering of
        # mlflow.search_runs, no explicit order_by is passed.
        models_df = self._find_models(
            self.experiment_name_prefix + experiment_name, max_results=1
        )
        if models_df.empty:
            self.logger.info(
                "No previous model found in MLflow", experiment_name=experiment_name
            )
            previous_run_id = None
        else:
            # Positional access: only the first returned row is of interest.
            previous_run_id = models_df["run_id"].iloc[0]

        # Set tags to the run, can be used to filter on the UI
        mlflow.set_tag("run_id", mlflow.active_run().info.run_id)
        mlflow.set_tag("phase", phase)  # phase can be Training or Hyperparameter_opt
        mlflow.set_tag("Previous_version_id", previous_run_id)
        mlflow.set_tag("model_type", model_type)
        mlflow.set_tag("prediction_job", experiment_name)

        # Add feature names, target, feature modules, metrics and params to the run
        mlflow.set_tag(
            "feature_names", model_specs.feature_names[1:]
        )  # feature names are 1+ columns
        mlflow.set_tag("target", model_specs.feature_names[0])  # target is first column
        mlflow.set_tag("feature_modules", model_specs.feature_modules)
        mlflow.log_metrics(report.metrics)
        model_specs.hyper_params.update(model.get_params())
        # TODO: Remove this hardcoded hyper params fix with loop after fix by mlflow
        # https://github.com/mlflow/mlflow/issues/6384
        # Empty strings are stored as a single space; _get_model_specs reverses this.
        for key, value in model_specs.hyper_params.items():
            if value == "":
                model_specs.hyper_params[key] = " "
        mlflow.log_params(model_specs.hyper_params)

        # Process args
        for key, value in kwargs.items():
            if isinstance(value, dict):
                mlflow.log_dict(value, f"{key}.json")
            elif isinstance(value, (str, int)):
                mlflow.set_tag(key, value)
            else:
                # Report the type of the rejected value (the key is always a str).
                self.logger.warning(
                    f"Couldn't log {key}, {type(value)} not supported",
                    experiment_name=experiment_name,
                )

        # Log the model to the run. Signature describes model input and output scheme
        mlflow.sklearn.log_model(
            sk_model=model, artifact_path="model", signature=report.signature
        )
        self.logger.info("Model saved with MLflow", experiment_name=experiment_name)

    def _log_figures_with_mlflow(self, report) -> None:
        """Log figures with MLflow in the artifact folder.

        The feature importance figure (when present) is stored as
        ``figures/weight_plot.html``; every entry of
        ``report.data_series_figures`` as ``figures/<key>.html``.

        """
        if report.feature_importance_figure is not None:
            mlflow.log_figure(
                report.feature_importance_figure, "figures/weight_plot.html"
            )
        for key, figure in report.data_series_figures.items():
            mlflow.log_figure(figure, f"figures/{key}.html")
        self.logger.info("Logged figures to MLflow.")

    def load_model(
        self,
        experiment_name: str,
    ) -> tuple[OpenstfRegressor, ModelSpecificationDataClass]:
        """Load sklearn compatible model from MLFlow.

        The loaded model gets two extra attributes: ``age`` (days since the run
        ended) and ``path`` (path component of the model URI).

        Args:
            experiment_name: Name of the experiment, often the id of the prediction job.

        Returns:
            The loaded model and the model specifications rebuilt from the run.

        Raises:
            LookupError: If model is not found in MLflow.

        """
        try:
            models_df = self._find_models(
                self.experiment_name_prefix + experiment_name, max_results=1
            )  # return the latest finished run of the model
            if models_df.empty:
                raise LookupError("Model not found. First train a model!")
            latest_run = models_df.iloc[0]  # Use .iloc[0] to only get latest run
            model_uri = self._get_model_uri(latest_run.artifact_uri)
            loaded_model = mlflow.sklearn.load_model(model_uri)
            loaded_model.age = self._determine_model_age_from_mlflow_run(latest_run)
            model_specs = self._get_model_specs(
                experiment_name, loaded_model, latest_run
            )
            loaded_model.path = unquote(
                urlparse(model_uri).path
            )  # Path without file:///
            self.logger.info("Model successfully loaded with MLflow")
            return loaded_model, model_specs
        except (AttributeError, MlflowException, OSError) as exception:
            raise LookupError("Model not found. First train a model!") from exception

    def get_model_age(
        self, experiment_name: str, hyperparameter_optimization_only: bool = False
    ) -> Union[int, float]:
        """Get model age of most recent model.

        Args:
            experiment_name: Name of the experiment, often the id of the prediction job.
            hyperparameter_optimization_only: Set to true if only hyperparameter
                optimization events should be considered.

        Returns:
            Age in whole days, or ``np.inf`` when no matching run is found.

        """
        filter_string = "attribute.status = 'FINISHED'"
        if hyperparameter_optimization_only:
            filter_string += " AND tags.phase = 'Hyperparameter_opt'"
        models_df = self._find_models(
            self.experiment_name_prefix + experiment_name,
            max_results=1,
            filter_string=filter_string,
        )
        if models_df.empty:
            self.logger.info("No model found returning infinite model age!")
            return np.inf
        run = models_df.iloc[0]  # Use .iloc[0] to only get latest run
        return self._determine_model_age_from_mlflow_run(run)

    def _find_models(
        self,
        experiment_name: str,
        max_results: Optional[int] = 100,
        filter_string: str = "attribute.status = 'FINISHED'",
    ) -> pd.DataFrame:
        """Find runs of a specific experiment.

        No ``order_by`` is passed, so the row order is whatever
        ``mlflow.search_runs`` returns by default.

        Args:
            experiment_name: Full experiment name (prefix already applied).
            max_results: Maximum number of runs to return.
            filter_string: MLflow search filter; defaults to finished runs only.

        Returns:
            The dataframe returned by ``mlflow.search_runs``.

        """
        models_df = mlflow.search_runs(
            experiment_names=[experiment_name],
            max_results=max_results,
            filter_string=filter_string,
        )
        return models_df

    def _get_model_specs(
        self,
        experiment_name: str,
        loaded_model: OpenstfRegressor,
        latest_run: pd.Series,
    ) -> ModelSpecificationDataClass:
        """Get model specifications from existing model.

        Besides building the specifications this also patches ``loaded_model``
        in place (see the temporary xgboost fix below).

        """
        model_specs = ModelSpecificationDataClass(id=experiment_name)

        # Temporary fix for update of xgboost
        # new version requires some attributes that the old (stored) models don't have yet
        # see: https://stackoverflow.com/questions/71912084/attributeerror-xgbmodel-object-has-no-attribute-callbacks
        new_attrs = [
            "grow_policy",
            "max_bin",
            "eval_metric",
            "callbacks",
            "early_stopping_rounds",
            "max_cat_to_onehot",
            "max_leaves",
            "sampling_method",
        ]

        manual_additional_attrs = [
            "enable_categorical",
            "predictor",
        ]  # these ones are not mentioned in the stackoverflow post
        automatic_additional_attrs = [
            x
            for x in XGBModel._get_param_names()
            if x
            not in new_attrs + manual_additional_attrs + loaded_model._get_param_names()
        ]

        # NOTE(review): these attributes are overwritten with None unconditionally,
        # also when the stored model already carries a value for them.
        for attr in new_attrs + manual_additional_attrs + automatic_additional_attrs:
            setattr(loaded_model, attr, None)

        # This one is new is should be set to a specific value (https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.training)
        setattr(loaded_model, "missing", np.nan)
        setattr(loaded_model, "n_estimators", 100)

        # End temporary fix

        # get the parameters from old model, we insert these later into new model
        model_specs.hyper_params = loaded_model.get_params()
        # TODO: Remove this hardcoded hyper params fix with loop after fix by mlflow
        # https://github.com/mlflow/mlflow/issues/6384
        # Reverse of the "" -> " " substitution done when the params were logged.
        for key, value in model_specs.hyper_params.items():
            if value == " ":
                model_specs.hyper_params[key] = ""
        # get used feature names else use all feature names
        model_specs.feature_names = self._get_feature_names(
            experiment_name, latest_run, model_specs, loaded_model
        )
        # get feature_modules
        model_specs.feature_modules = self._get_feature_modules(
            experiment_name, latest_run, model_specs, loaded_model
        )
        return model_specs

    def _determine_model_age_from_mlflow_run(self, run: pd.Series) -> Union[int, float]:
        """Determine how many days ago a model was trained from the mlflow run.

        Args:
            run: Row of the runs dataframe; only ``end_time`` is read.

        Returns:
            Whole days between ``run.end_time`` and now, or ``np.inf`` when the
            age cannot be determined.

        """
        # Local import: the module only imports ``datetime`` from the datetime package.
        from datetime import timezone

        try:
            end_time = run.end_time
            if pd.isna(end_time):
                # A missing end time cannot yield an age; use the fallback below.
                raise ValueError("run has no end_time")
            model_datetime = end_time.to_pydatetime()
            if model_datetime.tzinfo is None:
                # Naive timestamps are interpreted as UTC, as before.
                model_datetime = model_datetime.replace(tzinfo=timezone.utc)
            # Aware arithmetic converts any UTC offset correctly instead of
            # dropping it, and avoids the deprecated datetime.utcnow().
            model_age_days = (datetime.now(timezone.utc) - model_datetime).days
        except Exception as e:
            # Deliberate best effort: an unknown age is reported as infinite.
            self.logger.warning(
                "Could not get model age. Returning infinite age!", exception=str(e)
            )
            return np.inf  # Return fallback age
        return model_age_days

    def remove_old_models(
        self,
        experiment_name: str,
        max_n_models: int = 10,
    ):
        """Remove old models per experiment.

        Keeps the ``max_n_models`` runs with the most recent ``end_time`` among
        the runs returned by ``_find_models`` and deletes the rest, including
        their artifacts.

        Args:
            experiment_name: Name of the experiment (without prefix).
            max_n_models: Number of runs to keep, must be at least 1.

        Raises:
            ValueError: If ``max_n_models`` is smaller than 1.

        """
        if max_n_models < 1:
            raise ValueError(
                f"Max models to keep should be at least 1! Received: {max_n_models}"
            )
        previous_runs = self._find_models(
            experiment_name=self.experiment_name_prefix + experiment_name
        )
        if len(previous_runs) <= max_n_models:
            return

        self.logger.debug(
            f"Going to delete old models. {len(previous_runs)} > {max_n_models}"
        )
        # Find run_ids of oldest runs. The slice must be positional: after
        # sorting, the index labels no longer correspond to row positions.
        runs_to_remove = previous_runs.sort_values(
            by="end_time", ascending=False
        ).iloc[max_n_models:]
        for _, run in runs_to_remove.iterrows():
            self.logger.debug(
                f"Going to remove run {run.run_id}, from {run.end_time}."
            )
            mlflow.delete_run(run.run_id)
            self.logger.debug("Removed run")

            # mlflow.delete_run marks it as deleted but does not delete it by itself
            # Remove artifacts to save disk space
            try:
                repository = get_artifact_repository(
                    mlflow.get_run(run.run_id).info.artifact_uri
                )
                repository.delete_artifacts()
                self.logger.debug("Removed artifacts")
            except Exception as e:
                # Best effort: a failed artifact cleanup must not stop the loop.
                self.logger.info(f"Failed removing artifacts: {e}")

    def _get_feature_names(
        self,
        experiment_name: str,
        latest_run: pd.Series,
        model_specs: ModelSpecificationDataClass,
        loaded_model: OpenstfRegressor,
    ) -> list:
        """Get the feature_names from MLflow or the old model.

        Reads the ``tags.feature_names`` entry of the run; when that leaves
        ``model_specs.feature_names`` at None, falls back to the
        ``feature_names`` attribute of the loaded model. ``model_specs`` is
        updated in place and its ``feature_names`` is returned.

        """
        error_message = "feature_names not loaded and using None, because it"
        try:
            # The tag holds the str() of a list; swap quotes to get valid JSON.
            model_specs.feature_names = json.loads(
                latest_run["tags.feature_names"].replace("'", '"')
            )
        except KeyError:
            self.logger.warning(
                f"{error_message} did not exist in run",
                experiment_name=experiment_name,
            )
        except AttributeError:
            self.logger.warning(
                f"{error_message} needs to be a string",
                experiment_name=experiment_name,
            )
        except JSONDecodeError:
            self.logger.warning(
                f"{error_message} needs to be a string of a list",
                experiment_name=experiment_name,
            )

        # if feature names is none, see if we can retrieve them from the old model
        if model_specs.feature_names is None:
            try:
                if loaded_model.feature_names is not None:
                    model_specs.feature_names = loaded_model.feature_names
                    self.logger.info(
                        "feature_names retrieved from old model with an attribute",
                        experiment_name=experiment_name,
                    )
            except AttributeError:
                self.logger.warning(
                    "feature_names not an attribute of the old model, using None ",
                    experiment_name=experiment_name,
                )
        return model_specs.feature_names

    def _get_feature_modules(
        self,
        experiment_name: str,
        latest_run: pd.Series,
        model_specs: ModelSpecificationDataClass,
        loaded_model: OpenstfRegressor,
    ) -> list:
        """Get the feature_modules from MLflow or the old model.

        Reads the ``tags.feature_modules`` entry of the run; when that leaves
        ``model_specs.feature_modules`` empty or None, falls back to the
        ``feature_modules`` attribute of the loaded model. ``model_specs`` is
        updated in place and its ``feature_modules`` is returned.

        """
        error_message = "feature_modules not loaded and using None, because it"
        try:
            # The tag holds the str() of a list; swap quotes to get valid JSON.
            model_specs.feature_modules = json.loads(
                latest_run["tags.feature_modules"].replace("'", '"')
            )
        except KeyError:
            self.logger.warning(
                f"{error_message} did not exist in run",
                experiment_name=experiment_name,
            )
        except AttributeError:
            self.logger.warning(
                f"{error_message} needs to be a string",
                experiment_name=experiment_name,
            )
        except JSONDecodeError:
            self.logger.warning(
                f"{error_message} needs to be a string of a list",
                experiment_name=experiment_name,
            )

        # if feature modules is none, see if we can retrieve them from the old model
        if not model_specs.feature_modules:
            try:
                if loaded_model.feature_modules:
                    model_specs.feature_modules = loaded_model.feature_modules
                    self.logger.info(
                        "feature_modules retrieved from old model with an attribute",
                        experiment_name=experiment_name,
                    )
            except AttributeError:
                self.logger.warning(
                    "feature_modules not an attribute of the old model, using None ",
                    experiment_name=experiment_name,
                )
        return model_specs.feature_modules

    def _get_model_uri(self, artifact_uri: str) -> str:
        """Set model uri based on latest run.

        Note: this function helps to mock during unit tests

        """
        return os.path.join(artifact_uri, "model/")
|
|
@@ -1,81 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: 2017-2023 Contributors to the OpenSTEF project <korte.termijn.prognoses@alliander.com> # noqa E501>
|
|
2
|
-
#
|
|
3
|
-
# SPDX-License-Identifier: MPL-2.0
|
|
4
|
-
import numpy as np
|
|
5
|
-
import pandas as pd
|
|
6
|
-
from sklearn.base import RegressorMixin
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
class StandardDeviationGenerator:
    """Attach per-horizon, per-hour forecast error standard deviations to a model."""

    def __init__(self, validation_data: pd.DataFrame) -> None:
        """Store the validation set used to estimate the forecast error.

        Args:
            validation_data: Frame with a ``horizon`` column. The first column
                is used as realised values and the columns between the first
                and the last one as model input.

        """
        self.validation_data = validation_data

    def generate_standard_deviation_data(self, model: RegressorMixin) -> RegressorMixin:
        """Generate the standard data.

        Calculates the difference between realised and predicted on validation set.
        For each hour of the day the std of the difference is calculated.

        A horizon for which the model cannot produce a prediction is reported
        on stdout and left out of the result.

        Args:
            model: The trained model

        Returns:
            The model with the std data added.

        """
        per_horizon = []

        # Loop over horizons and ask prediction for each specific horizon
        for horizon in self.validation_data.horizon.unique():
            # Make subset for this specific horizon
            sub_val = self.validation_data[self.validation_data.horizon == horizon]
            try:
                # Features are all columns except the first and the last one.
                predicted = model.predict(sub_val.iloc[:, 1:-1])
            except Exception as e:
                print("Could not get prediction from new model!", e)
                # Skip this horizon instead of reusing the prediction of a
                # previous horizon (or None), which would crash or give wrong stats.
                continue

            # Calculate confidence interval for this horizon
            confidence_interval_horizon = self._calculate_standard_deviation(
                sub_val.iloc[:, 0], predicted
            )
            confidence_interval_horizon["horizon"] = horizon  # Label with respective horizon
            per_horizon.append(confidence_interval_horizon)

        # Concatenate once; an empty frame is kept when nothing could be computed.
        self.standard_deviation = (
            pd.concat(per_horizon) if per_horizon else pd.DataFrame()
        )
        model.standard_deviation = self.standard_deviation

        return model

    @staticmethod
    def _calculate_standard_deviation(
        realised: pd.Series, predicted: pd.Series
    ) -> pd.DataFrame:
        """Protected static method to calculate the corrections for a model.

        Args:
            realised: pd.series with realised load; its index must expose ``.hour``
            predicted: pd.series with load predicted by new model

        Returns:
            DataFrame indexed 0..23 with float columns ``stdev`` (population
            standard deviation of the error for that hour, NaN when the hour
            has no data) and ``hour``.

        """
        # Calculate the error for each predicted point
        error = realised - predicted
        # Group by hour of day; ddof=0 matches what np.std computes.
        stdev_per_hour = (
            error.groupby(error.index.hour).std(ddof=0).reindex(range(24))
        )
        return pd.DataFrame(
            {
                "stdev": stdev_per_hour.to_numpy(dtype=float),
                "hour": np.arange(24, dtype=float),
            },
            index=range(24),
        )
|