views_stepshifter 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 views platform
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,19 @@
1
+ Metadata-Version: 2.1
2
+ Name: views_stepshifter
3
+ Version: 0.1.1
4
+ Summary:
5
+ Author: Xiaolong Sun
6
+ Author-email: xiaolong.sun@pcr.uu.se
7
+ Requires-Python: >=3.11,<3.15
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.11
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Classifier: Programming Language :: Python :: 3.13
12
+ Requires-Dist: darts (>=0.30.0,<0.31.0)
13
+ Requires-Dist: numpy (>=1.25.2,<2.0.0)
14
+ Requires-Dist: pandas (>=1.5.3,<2.0.0)
15
+ Requires-Dist: scikit-learn (>=1.2.2,<2.0.0)
16
+ Requires-Dist: views_pipeline_core (>=0.1.0,<0.2.0)
17
+ Description-Content-Type: text/markdown
18
+
19
+
File without changes
@@ -0,0 +1,19 @@
1
+ [tool.poetry]
2
+ name = "views_stepshifter"
3
+ version = "0.1.1"
4
+ description = ""
5
+ authors = ["Xiaolong Sun <xiaolong.sun@pcr.uu.se>"]
6
+ readme = "README.md"
7
+
8
+ [tool.poetry.dependencies]
9
+ python = ">=3.11,<3.15"
10
+ views_pipeline_core = "^0.1.0"
11
+ scikit-learn = "^1.2.2"
12
+ pandas = "^1.5.3"
13
+ numpy = "^1.25.2"
14
+ darts = "^0.30.0"
15
+
16
+
17
+ [build-system]
18
+ requires = ["poetry-core"]
19
+ build-backend = "poetry.core.masonry.api"
File without changes
@@ -0,0 +1,287 @@
1
+ from views_pipeline_core.managers.model_manager import ModelManager
2
+ from views_pipeline_core.managers.path_manager import ModelPath
3
+ from views_pipeline_core.models.outputs import generate_output_dict
4
+ from views_pipeline_core.files.utils import read_log_file, create_log_file
5
+ from views_pipeline_core.wandb.utils import add_wandb_monthly_metrics, generate_wandb_log_dict, log_wandb_log_dict
6
+ from views_pipeline_core.evaluation.metrics import generate_metric_dict
7
+ from views_stepshifter.models.stepshifter import StepshifterModel
8
+ from views_stepshifter.models.hurdle_model import HurdleModel
9
+ from views_forecasts.extensions import *
10
+ import logging
11
+ import pandas as pd
12
+ import numpy as np
13
+ import time
14
+ import wandb
15
+ from datetime import datetime
16
+ from sklearn.metrics import mean_squared_error
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ class StepshifterManager(ModelManager):
22
+
23
def __init__(self, model_path: ModelPath) -> None:
    """
    Initialise the manager for the model located at ``model_path``.

    Args:
        model_path: Forwarded unchanged to the parent ``ModelManager`` initialiser.
    """
    super().__init__(model_path)
    # Several methods branch on whether the configured algorithm is the
    # two-stage hurdle model, so the comparison is cached once here.
    algorithm = self._config_meta["algorithm"]
    self._is_hurdle = algorithm == "HurdleModel"
26
+
27
def _update_sweep_config(self, args):
    """
    Inject run metadata into the sweep configuration.

    Each entry is written under ``config["parameters"]`` in the
    ``{"value": ...}`` form. The sweep configuration is modified in place.

    Args:
        args: Command line arguments; only ``args.run_type`` is read.

    Returns:
        The updated sweep configuration object.
    """
    sweep_config = self._config_sweep
    params = sweep_config["parameters"]

    params["run_type"] = {"value": args.run_type}
    params["sweep"] = {"value": True}

    # Metadata copied verbatim from the model's meta configuration.
    for key in ("name", "depvar", "algorithm"):
        params[key] = {"value": self._config_meta[key]}

    # The hurdle model additionally needs both stage estimators.
    if self._is_hurdle:
        for key in ("model_clf", "model_reg"):
            params[key] = {"value": self._config_meta[key]}

    return sweep_config
50
+
51
def _split_hurdle_parameters(self, config=None):
    """
    Split flat hyperparameters into classification and regression groups.

    Keys starting with ``cls_`` are collected for the classification stage and
    keys starting with ``reg_`` for the regression stage, with the leading
    prefix removed. The groups are stored in ``config["parameters"]["clf"]``
    and ``config["parameters"]["reg"]``.

    Args:
        config: Optional configuration mapping to split. Defaults to
            ``self.config``. Accepted so that callers which pass the
            configuration explicitly keep working.

    Returns:
        The same configuration object, updated in place.
    """
    if config is None:
        config = self.config

    clf_params = {}
    reg_params = {}
    for key, value in config.items():
        # removeprefix strips only the leading marker. str.replace would also
        # delete later occurrences (e.g. "reg_reg_alpha" -> "alpha").
        if key.startswith("cls_"):
            clf_params[key.removeprefix("cls_")] = value
        elif key.startswith("reg_"):
            reg_params[key.removeprefix("reg_")] = value

    # NOTE(review): in sweep mode HurdleModel reads 'clf'/'reg' from the top
    # level of the filtered config - confirm this nested location is intended.
    config["parameters"]["clf"] = clf_params
    config["parameters"]["reg"] = reg_params

    return config
76
+
77
def _execute_model_tasks(self, config=None, train=None, eval=None, forecast=None, artifact_name=None):
    """
    Run the requested model tasks (sweep, training, evaluation, forecasting) inside a WandB run.

    The WandB run is opened with ``self._project`` and ``self._entity``;
    ``self.config`` is replaced by ``wandb.config`` before any task runs.
    Errors raised by the tasks are logged (with traceback) and not re-raised.
    The total runtime is logged at the end.

    Args:
        config: Configuration object passed to ``wandb.init``.
        train: Flag to indicate if the model should be trained.
        eval: Flag to indicate if the model should be evaluated.
        forecast: Flag to indicate if forecasting should be performed.
        artifact_name (optional): Specific name of the model artifact to load for evaluation or forecasting.
    """
    start_t = time.time()

    try:
        # project and config are ignored when running a sweep
        with wandb.init(project=self._project, entity=self._entity, config=config):

            # add the monthly metrics to WandB
            add_wandb_monthly_metrics()

            # Update config from WandB initialization above
            self.config = wandb.config
            is_sweep = self.config["sweep"]

            # W&B does not directly support nested dictionaries for hyperparameters.
            # The helper takes no required argument and updates self.config in place,
            # returning that same object. Storing the return value under
            # self.config["parameters"] would overwrite the nested dict it just filled.
            if is_sweep and self._is_hurdle:
                self._split_hurdle_parameters()

            if is_sweep:
                logger.info(f"Sweeping model {self.config['name']}...")
                model = self._train_model_artifact()
                logger.info(f"Evaluating model {self.config['name']}...")
                self._evaluate_sweep(model)

            if train:
                logger.info(f"Training model {self.config['name']}...")
                self._train_model_artifact()

            if eval:
                logger.info(f"Evaluating model {self.config['name']}...")
                self._evaluate_model_artifact(artifact_name)

            if forecast:
                logger.info(f"Forecasting model {self.config['name']}...")
                self._forecast_model_artifact(artifact_name)
        wandb.finish()
    except Exception as e:
        # Kept best-effort (no re-raise), but record the traceback so the
        # failing task can actually be diagnosed from the log.
        logger.exception(f"Error during model tasks execution: {e}")
    end_t = time.time()
    minutes = (end_t - start_t) / 60
    logger.info(f"Done. Runtime: {minutes:.3f} minutes.\n")
132
+
133
def _get_model(self, partitioner_dict: dict):
    """
    Build the model object selected by the configured algorithm.

    For non-hurdle algorithms, ``self.config["model_reg"]`` is set to the
    algorithm name before the model is constructed.

    Args:
        partitioner_dict: The dictionary of partitioners.

    Returns:
        A ``HurdleModel`` when the algorithm is the hurdle model, otherwise a
        ``StepshifterModel``.
    """
    if not self._is_hurdle:
        # A plain stepshifter uses the algorithm itself as its regressor.
        self.config["model_reg"] = self.config["algorithm"]
        return StepshifterModel(self.config, partitioner_dict)

    return HurdleModel(self.config, partitioner_dict)
151
+
152
def _get_standardized_df(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Standardize a prediction DataFrame based on the run type.

    Infinite values are replaced with 0, only the relevant columns are kept,
    and negative values are replaced with 0. NaN values are left untouched.

    Args:
        df: The DataFrame to standardize.

    Returns:
        The standardized DataFrame.

    Raises:
        ValueError: If ``self.config["run_type"]`` is not one of
            "calibration", "testing" or "forecasting".
    """
    run_type = self.config["run_type"]
    depvar = self.config["depvar"]

    # choose the columns to keep based on the run type
    if run_type in ["calibration", "testing"]:
        cols = [depvar] + df.forecasts.prediction_columns
    elif run_type == "forecasting":
        cols = ["step_pred_combined", depvar]
    else:
        # Without this branch `cols` would be unbound and the failure would
        # surface as an UnboundLocalError that hides the real cause.
        raise ValueError(
            f"Unsupported run type: {run_type!r}. "
            "Expected 'calibration', 'testing' or 'forecasting'."
        )

    df = df.replace([np.inf, -np.inf], 0)[cols]
    # replace negative values with 0
    return df.mask(df < 0, 0)
174
+
175
def _train_model_artifact(self):
    """
    Fit a model on the stored viewser data for the configured run type.

    Outside of sweeps the fitted model is saved to the artifacts directory
    and a log file is created in the generated-data directory.

    Returns:
        The fitted model object.
    """
    raw_dir = self._model_path.data_raw
    generated_dir = self._model_path.data_generated
    artifacts_dir = self._model_path.artifacts
    run_type = self.config["run_type"]

    training_data = pd.read_pickle(raw_dir / f"{run_type}_viewser_df.pkl")

    model = self._get_model(self._data_loader.partition_dict)
    model.fit(training_data)

    # Sweep runs only need the fitted model in memory.
    if self.config["sweep"]:
        return model

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    artifact_file = ModelManager._generate_model_file_name(run_type, timestamp)
    model.save(artifacts_dir / artifact_file)

    fetch_log = read_log_file(raw_dir / f"{run_type}_data_fetch_log.txt")
    data_fetch_timestamp = fetch_log.get("Data Fetch Timestamp", None)
    create_log_file(generated_dir, self.config, timestamp, None, data_fetch_timestamp)
    return model
194
+
195
def _evaluate_model_artifact(self, artifact_name):
    """
    Evaluate a stored model artifact and persist predictions, outputs and metrics.

    Args:
        artifact_name: Optional artifact file name; ".pkl" is appended when it
            is missing. When falsy, the latest artifact for the configured run
            type is used.

    Raises:
        FileNotFoundError: If the resolved artifact file does not exist.
    """
    path_raw = self._model_path.data_raw
    path_generated = self._model_path.data_generated
    path_artifacts = self._model_path.artifacts
    run_type = self.config["run_type"]

    # if an artifact name is provided through the CLI, use it.
    # Otherwise, get the latest model artifact based on the run type
    if artifact_name:
        logger.info(f"Using (non-default) artifact: {artifact_name}")

        if not artifact_name.endswith(".pkl"):
            artifact_name += ".pkl"
        path_artifact = path_artifacts / artifact_name
    else:
        # use the latest model artifact based on the run type
        logger.info(f"Using latest (default) run type ({run_type}) specific artifact")
        path_artifact = self._get_latest_model_artifact(path_artifacts, run_type)

    # presumably the artifact stem ends with the 15-character training
    # timestamp (YYYYmmdd_HHMMSS) - confirm against the file-name generator
    self.config["timestamp"] = path_artifact.stem[-15:]
    df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl")

    try:
        # NOTE: unpickling executes arbitrary code; only load trusted artifacts.
        stepshift_model = pd.read_pickle(path_artifact)
    except FileNotFoundError:
        logger.exception(f"Model artifact not found at {path_artifact}")
        # Re-raise: continuing would fail on the unbound model variable and
        # hide the real cause behind an UnboundLocalError.
        raise

    df = stepshift_model.predict(run_type, df_viewser)
    df = self._get_standardized_df(df)
    data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    data_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None)

    _, df_output = generate_output_dict(df, self.config)
    evaluation, df_evaluation = generate_metric_dict(df, self.config)
    log_wandb_log_dict(self.config, evaluation)

    self._save_model_outputs(df_evaluation, df_output, path_generated)
    self._save_predictions(df, path_generated)
    create_log_file(path_generated, self.config, self.config["timestamp"], data_generation_timestamp, data_fetch_timestamp)
234
+
235
def _forecast_model_artifact(self, artifact_name):
    """
    Produce forecasts from a stored model artifact and persist them.

    Args:
        artifact_name: Optional artifact file name; ".pkl" is appended when it
            is missing. When falsy, the latest artifact for the configured run
            type is used.

    Raises:
        FileNotFoundError: If the resolved artifact file does not exist.
    """
    path_raw = self._model_path.data_raw
    path_generated = self._model_path.data_generated
    path_artifacts = self._model_path.artifacts
    run_type = self.config["run_type"]

    # if an artifact name is provided through the CLI, use it.
    # Otherwise, get the latest model artifact based on the run type
    if artifact_name:
        logger.info(f"Using (non-default) artifact: {artifact_name}")

        if not artifact_name.endswith(".pkl"):
            artifact_name += ".pkl"
        path_artifact = path_artifacts / artifact_name
    else:
        # use the latest model artifact based on the run type
        logger.info(f"Using latest (default) run type ({run_type}) specific artifact")
        path_artifact = self._get_latest_model_artifact(path_artifacts, run_type)

    # presumably the artifact stem ends with the 15-character training
    # timestamp (YYYYmmdd_HHMMSS) - confirm against the file-name generator
    self.config["timestamp"] = path_artifact.stem[-15:]
    df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl")

    try:
        # NOTE: unpickling executes arbitrary code; only load trusted artifacts.
        stepshift_model = pd.read_pickle(path_artifact)
    except FileNotFoundError:
        logger.exception(f"Model artifact not found at {path_artifact}")
        # Re-raise: continuing would fail on the unbound model variable and
        # hide the real cause behind an UnboundLocalError.
        raise

    df_predictions = stepshift_model.predict(run_type, df_viewser)
    df_predictions = self._get_standardized_df(df_predictions)
    data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    data_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None)

    self._save_predictions(df_predictions, path_generated)
    create_log_file(path_generated, self.config, self.config["timestamp"], data_generation_timestamp, data_fetch_timestamp)
269
+
270
def _evaluate_sweep(self, model):
    """
    Score a freshly trained sweep model and log the results to WandB.

    Predictions are generated on the stored viewser data for the configured
    run type, standardized, and scored with a per-row MSE across all step
    columns. The mean MSE and the metric dictionary are logged.

    Args:
        model: Fitted model exposing ``predict(run_type, df)``.
    """
    raw_dir = self._model_path.data_raw
    run_type = self.config["run_type"]
    steps = self.config["steps"]

    viewser_data = pd.read_pickle(raw_dir / f"{run_type}_viewser_df.pkl")
    predictions = self._get_standardized_df(model.predict(run_type, viewser_data))

    # Temporarily keep this because the metric to minimize is MSE
    pred_cols = ["step_pred_" + str(i) for i in steps]

    def row_mse(row):
        # Compare the single observed value against every step prediction.
        observed = [row[self.config["depvar"]]] * len(steps)
        predicted = [row[col] for col in pred_cols]
        return mean_squared_error(observed, predicted)

    predictions["mse"] = predictions.apply(row_mse, axis=1)

    wandb.log({"MSE": predictions["mse"].mean()})

    evaluation, _ = generate_metric_dict(predictions, self.config)
    log_wandb_log_dict(self.config, evaluation)
@@ -0,0 +1,2 @@
1
+ from .stepshifter import StepshifterModel
2
+ from darts.models import LightGBMModel, XGBModel, RandomForest
@@ -0,0 +1,94 @@
1
+ import numpy as np
2
+ from .stepshifter import StepshifterModel
3
+ from .validation import views_validate
4
+ from views_forecasts.extensions import *
5
+ from sklearn.utils.validation import check_is_fitted
6
+
7
+
8
+ class HurdleModel(StepshifterModel):
9
+ """
10
+ Hurdle model for time series forecasting. The model consists of two stages:
11
+ 1. Binary stage: Predicts whether the target variable is 0 or > 0.
12
+ 2. Positive stage: Predicts the value of the target variable when it is > 0.
13
+
14
+ Note:
15
+ This algorithm uses a two-step approach.
16
+
17
+ **Step 1: Classification Stage**
18
+ In the first step, a regression model is used with a binary target (0 or 1),
19
+ indicating the absence or presence of violence. This stage functions similarly
20
+ to a linear probability model, estimating the likelihood of a positive outcome.
21
+ Since the model is a regression rather than a classification model,
22
+ these estimates are not strictly bounded between 0 and 1,
23
+ but this is acceptable for the purpose of this step.
24
+
25
+ To determine whether an observation is classified as "positive," we apply a threshold.
26
+ The default threshold is 1, meaning that predictions above this value
27
+ are considered positive outcomes. This threshold can be adjusted as
28
+ a tunable hyperparameter to better suit specific requirements.
29
+
30
+ **Step 2: Regression Stage**
31
+ In the second step, we use a regression model to predict a continuous or count value
32
+ (e.g., the expected number of conflict fatalities) for the selected time series.
33
+ We include the entire time series for countries or PRIO grids where the
34
+ classification stage yielded at least one "positive" prediction,
35
+ rather than limiting the regression to just the predicted positive values.
36
+ """
37
+
38
def __init__(self, config: Dict, partitioner_dict: Dict[str, List[int]], threshold: float = 1.0):
    """
    Set up the two-stage hurdle model.

    Args:
        config: Configuration mapping; ``model_clf`` and ``model_reg`` name the
            estimators, and the resolved parameters must contain ``clf`` and
            ``reg`` entries.
        partitioner_dict: Train/predict partitions, forwarded to the parent class.
        threshold: Target values above this are treated as positive in ``fit``.
    """
    super().__init__(config, partitioner_dict)

    self._clf = self._resolve_estimator(config['model_clf'])
    self._reg = self._resolve_estimator(config['model_reg'])

    # Resolve the parameter mapping once and pick out both stages.
    stage_parameters = self._get_parameters(config)
    self._clf_params = stage_parameters['clf']
    self._reg_params = stage_parameters['reg']

    self._threshold = threshold
45
+
46
@views_validate
def fit(self, df: pd.DataFrame):
    """
    Fit one (binary, positive) model pair per step.

    The binary stage is trained on every series with the target mapped to
    0/1 by ``target > threshold``. The positive stage is trained only on
    series that contain at least one value above the threshold.

    Args:
        df: Input dataframe containing the dependent variable and covariates.

    Raises:
        ValueError: If no series contains a target value above the threshold,
            so the positive stage has nothing to train on.
    """
    df = self._process_data(df)
    self._prepare_time_series(df)

    # Binary outcome (event/no-event)
    # According to the DARTS doc, if timeseries uses a numeric type different from np.float32 or np.float64, not all functionalities may work properly.
    # So use astype(float) instead of astype(int) (we should have binary outputs 0,1 though)
    target_binary = [s.map(lambda x: (x > self._threshold).astype(float)) for s in self._target_train]

    # Positive outcome (for cases where target > threshold)
    positive_pairs = [(t, p) for t, p in zip(self._target_train, self._past_cov_train)
                      if (t.values() > self._threshold).any()]
    if not positive_pairs:
        # zip(*[]) would otherwise fail with an opaque unpacking error.
        raise ValueError(
            f"No series has a target value above the threshold ({self._threshold}); "
            "the positive stage of the hurdle model cannot be fitted."
        )
    target_pos, past_cov_pos = zip(*positive_pairs)

    for step in self._steps:
        # Fit binary-like stage using a regression model, but the target is binary (0 or 1)
        binary_model = self._clf(lags_past_covariates=[-step], **self._clf_params)
        binary_model.fit(target_binary, past_covariates=self._past_cov_train)

        # Fit positive stage using the regression model
        positive_model = self._reg(lags_past_covariates=[-step], **self._reg_params)
        positive_model.fit(target_pos, past_covariates=past_cov_pos)
        self._models[step] = (binary_model, positive_model)
    self.is_fitted_ = True
70
+
71
@views_validate
def predict(self, run_type: str, df: pd.DataFrame) -> pd.DataFrame:
    """
    Predict with the fitted hurdle model.

    For every step, the binary-stage and positive-stage predictions are
    computed and multiplied element-wise.

    Args:
        run_type: "forecasting" yields a single combined prediction column;
            any other value yields one column per step.
        df: Input dataframe; its dependent variable is merged into
            non-forecasting output.

    Returns:
        DataFrame of predictions plus the dependent variable column.
    """
    df = self._process_data(df)
    check_is_fitted(self, 'is_fitted_')

    forecasting = run_type == 'forecasting'
    # Forecasts are stacked row-wise; per-step predictions go side by side.
    predict_step = self._predict_by_step_combined if forecasting else self._predict_by_step
    concat_axis = 0 if forecasting else 1

    binary_parts = [predict_step(self._models[step][0], step, self._target_train)
                    for step in self._steps]
    positive_parts = [predict_step(self._models[step][1], step, self._target_train)
                      for step in self._steps]
    final_pred = pd.concat(binary_parts, axis=concat_axis) * pd.concat(positive_parts, axis=concat_axis)

    if forecasting:
        # Add the target variable to the predictions to make sure it is a VIEWS prediction
        # If it is a forecasting run, the target variable is not available in the input data so we fill it with NaN
        final_pred[self._depvar] = np.nan
    else:
        final_pred = pd.merge(final_pred, df[self._depvar], left_index=True, right_index=True)

    return final_pred
@@ -0,0 +1,199 @@
1
+ import pickle
2
+ import numpy as np
3
+ import logging
4
+ from darts import TimeSeries
5
+ from sklearn.utils.validation import check_is_fitted
6
+ from typing import List, Dict
7
+ from views_forecasts.extensions import *
8
+ from .validation import views_validate
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
+ class StepshifterModel:
14
def __init__(self, config: Dict, partitioner_dict: Dict[str, List[int]]):
    """
    Store configuration, resolve the estimator and record the partitions.

    Args:
        config: Mapping providing ``steps``, ``depvar``, ``model_reg`` and the
            entries read by ``_get_parameters``.
        partitioner_dict: Mapping whose ``train`` and ``predict`` entries each
            unpack into a (start, end) pair.
    """
    self._steps = config['steps']
    self._depvar = config['depvar']
    self._reg = self._resolve_estimator(config['model_reg'])
    self._params = self._get_parameters(config)

    train_bounds = partitioner_dict['train']
    self._train_start, self._train_end = train_bounds
    test_bounds = partitioner_dict['predict']
    self._test_start, self._test_end = test_bounds

    # Fitted estimators, keyed by step.
    self._models = dict()
22
+
23
+ @staticmethod
24
+ def _resolve_estimator(func_name: str):
25
+ """ Lookup table for supported estimators.
26
+ This is necessary because sklearn estimator default arguments
27
+ must pass equality test, and instantiated sub-estimators are not equal. """
28
+
29
+ match func_name:
30
+ case 'LinearRegressionModel':
31
+ from darts.models import LinearRegressionModel
32
+ return LinearRegressionModel
33
+ case 'RandomForestModel':
34
+ from darts.models import RandomForest
35
+ return RandomForest
36
+ case 'LightGBMModel':
37
+ from darts.models import LightGBMModel
38
+ return LightGBMModel
39
+ case 'XGBModel':
40
+ from darts.models import XGBModel
41
+ return XGBModel
42
+ case _:
43
+ raise ValueError(f"Model {func_name} is not a valid Darts forecasting model or is not supported now. "
44
+ f"Change the model in the config file.")
45
+
46
+ @staticmethod
47
+ def _get_parameters(config: Dict):
48
+ '''
49
+ Get the parameters from the config file.
50
+ If not sweep, then get directly from the config file, otherwise have to remove some parameters.
51
+ '''
52
+
53
+ if config["sweep"]:
54
+ keys_to_remove = ["algorithm", "depvar", "steps", "sweep", "run_type", "model_cls", "model_reg", "name"]
55
+ parameters = {k: v for k, v in config.items() if k not in keys_to_remove}
56
+ else:
57
+ parameters = config["parameters"]
58
+
59
+ return parameters
60
+
61
def _process_data(self, df: pd.DataFrame):
    """
    Restrict the data to units present in the last month and fill gaps with 0.

    Countries appear and disappear, so only units that exist in the last
    month of the data are kept. Any (month, unit) combination missing for
    those units is added as a row of zeros. The index level names and the
    list of independent variables are recorded on the instance.

    Args:
        df: DataFrame indexed by (time, unit).

    Returns:
        The filtered, gap-filled DataFrame, sorted by index.
    """
    # set up
    self._time = df.index.names[0]
    self._level = df.index.names[1]
    self._independent_variables = [col for col in df.columns if col != self._depvar]

    final_month = df.index.get_level_values(self._time).max()
    active_units = df.loc[final_month].index.unique()
    is_active = df.index.get_level_values(self._level).isin(active_units)
    df = df[is_active]

    # Every month crossed with every active unit is the index we want.
    months = df.index.get_level_values(self._time).unique()
    full_grid = pd.MultiIndex.from_product([months, active_units], names=[self._time, self._level])
    absent = full_grid.difference(df.index)

    filler = pd.DataFrame(0, index=absent, columns=df.columns)
    return pd.concat([df, filler]).sort_index()
84
+
85
def _prepare_time_series(self, df: pd.DataFrame):
    """
    Build the Darts time series used for training and prediction.

    One series per unit is created from the dataframe. The training target
    and past covariates are sliced to the training window; the full-length
    past covariates are kept for prediction.

    Args:
        df: DataFrame indexed by (time, unit), as returned by ``_process_data``.
    """
    features = self._independent_variables

    # Move the unit level into a column so Darts can group by it.
    flat = df.reset_index(level=[1])
    self._series = TimeSeries.from_group_dataframe(
        flat,
        group_cols=self._level,
        value_cols=features + [self._depvar],
    )

    # ts.slice is different from df.slice
    # presumably the +1 keeps the last training month inside the window - confirm
    train_windows = [ts.slice(self._train_start, self._train_end + 1) for ts in self._series]
    self._target_train = [window[self._depvar] for window in train_windows]
    self._past_cov_train = [window[features] for window in train_windows]
    self._past_cov = [ts[features] for ts in self._series]
99
+
100
def _predict_by_step(self, model, step, target):
    """
    Predict the whole test period with one step-specific model.

    Args:
        model: Fitted model for this step.
        step: Step number; used to name the output column.
        target: Target series passed to ``model.predict``.

    Returns:
        DataFrame indexed by (time, unit) with a single ``step_pred_<step>``
        column, sorted by index.
    """
    horizon = self._test_end - self._test_start + 1
    forecasts = model.predict(n=horizon,
                              series=target,
                              # darts automatically locates the time period of past_covariates
                              past_covariates=self._past_cov,
                              show_warnings=False)

    # process the predictions
    window = slice(self._test_start, self._test_end)
    rows, blocks = [], []
    for forecast in forecasts:
        frame = forecast.pd_dataframe().loc[window]
        # The first static covariate holds the unit identifier of the series.
        unit = forecast.static_covariates.iat[0, 0]
        rows += [(month, unit) for month in frame.index]
        blocks.append(frame.values)

    result = pd.DataFrame(
        data=np.concatenate(blocks),
        index=pd.MultiIndex.from_tuples(rows, names=[self._time, self._level]),
        columns=[f"step_pred_{step}"],
    )
    return result.sort_index()
124
+
125
def _predict_by_step_combined(self, model, step, target):
    """
    Predict only the month that lies ``step`` months into the test period.

    For forecasting only the prediction made from the last month with data
    is needed, i.e. the diagonal prediction.

    Args:
        model: Fitted model for this step.
        step: Step number; also the number of periods predicted.
        target: Target series passed to ``model.predict``.

    Returns:
        DataFrame indexed by (time, unit) with a single ``step_pred_combined``
        column, sorted by index.
    """
    forecasts = model.predict(n=step,
                              series=target,
                              # darts automatically locates the time period of past_covariates
                              past_covariates=self._past_cov,
                              show_warnings=False)

    # process the predictions
    wanted_month = [self._test_start + step - 1]
    rows, blocks = [], []
    for forecast in forecasts:
        frame = forecast.pd_dataframe().loc[wanted_month]
        # The first static covariate holds the unit identifier of the series.
        unit = forecast.static_covariates.iat[0, 0]
        rows += [(month, unit) for month in frame.index]
        blocks.append(frame.values)

    result = pd.DataFrame(
        data=np.concatenate(blocks),
        index=pd.MultiIndex.from_tuples(rows, names=[self._time, self._level]),
        columns=["step_pred_combined"],
    )
    return result.sort_index()
151
+
152
@views_validate
def fit(self, df: pd.DataFrame):
    """
    Fit one estimator per configured step.

    Each estimator uses a single past-covariate lag of ``-step`` and is
    trained on the training-window target and covariates.

    Args:
        df: Input dataframe containing the dependent variable and covariates.
    """
    self._prepare_time_series(self._process_data(df))

    for horizon in self._steps:
        estimator = self._reg(lags_past_covariates=[-horizon], **self._params)
        estimator.fit(self._target_train, past_covariates=self._past_cov_train)
        self._models[horizon] = estimator

    # Marker attribute checked by sklearn's check_is_fitted in predict().
    self.is_fitted_ = True
161
+
162
@views_validate
def predict(self, run_type: str, df: pd.DataFrame) -> pd.DataFrame:
    """
    Predict with the fitted per-step models.

    Args:
        run_type: "forecasting" yields a single combined prediction column;
            any other value yields one column per step.
        df: Input dataframe; its dependent variable is merged into
            non-forecasting output.

    Returns:
        DataFrame of predictions plus the dependent variable column.
    """
    df = self._process_data(df)
    check_is_fitted(self, 'is_fitted_')

    if run_type != 'forecasting':
        per_step = [self._predict_by_step(self._models[step], step, self._target_train) for step in self._steps]
        wide = pd.concat(per_step, axis=1)
        return pd.merge(wide, df[self._depvar], left_index=True, right_index=True)

    per_step = [self._predict_by_step_combined(self._models[step], step, self._target_train) for step in self._steps]
    stacked = pd.concat(per_step, axis=0)
    # Add the target variable to the predictions to make sure it is a VIEWS prediction
    # If it is a forecasting run, the target variable is not available in the input data so we fill it with NaN
    stacked[self._depvar] = np.nan
    return stacked
180
+
181
def save(self, path: str):
    """
    Pickle this model object to ``path``.

    Failures are logged with their traceback and not re-raised.

    Args:
        path: Destination file path.
    """
    try:
        with open(path, "wb") as handle:
            pickle.dump(self, handle)
        logger.info(f"Model successfully saved to {path}")
    except Exception as error:
        logger.exception(f"Failed to save model: {error}")
188
+
189
@property
def models(self):
    """Return the fitted per-step models as a list, in insertion order."""
    return [*self._models.values()]
192
+
193
@property
def steps(self):
    """Return the steps taken from the configuration at construction time."""
    configured_steps = self._steps
    return configured_steps
196
+
197
@property
def depvar(self):
    """Return the name of the dependent variable taken from the configuration."""
    target_name = self._depvar
    return target_name
@@ -0,0 +1,45 @@
1
+ import functools
2
+ import numpy as np
3
+ import pandas as pd
4
+ import logging
5
+ from views_forecasts.extensions import *
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
def dataframe_is_right_format(dataframe: pd.DataFrame):
    """
    Validate that a dataframe has the expected index structure and dtypes.

    The checks run in order and stop at the first failure:
    a two-level index, first level named 'month_id', second level named
    'country_id' or 'priogrid_gid', and only np.float64 columns.

    Explicit checks are used instead of ``assert`` statements so that the
    validation still runs when Python is started with ``-O``. A single-level
    index is reported with the intended message rather than an AttributeError.

    Args:
        dataframe: The dataframe to validate.

    Raises:
        AssertionError: If any check fails (same exception type as before,
            so existing callers keep working).
    """
    def fail(message: str):
        # Log first so the failure is recorded even if the caller swallows it.
        logger.error(message)
        raise AssertionError(message)

    index = dataframe.index

    # nlevels exists on every index type, unlike `.levels`.
    if index.nlevels != 2:
        fail("Dataframe must have a two-level index")

    if index.names[0] != "month_id":
        fail("The first level of the index must be 'month_id'")

    if index.names[1] not in ["country_id", "priogrid_gid"]:
        fail("The second level of the index must be 'country_id' or 'priogrid_gid'")

    if set(dataframe.dtypes) != {np.dtype(float)}:
        fail("The dataframe must contain only np.float64 floats")
38
+
39
+
40
def views_validate(fn):
    """
    Decorator that validates the dataframe argument before calling ``fn``.

    The dataframe is taken from the ``df`` keyword argument when present
    (the name used by the decorated ``fit``/``predict`` methods); otherwise
    the last positional argument is used.

    Args:
        fn: Function or method whose dataframe argument should be validated.

    Returns:
        The wrapped function.

    Raises:
        TypeError: If the wrapped function is called without any argument
            that could be the dataframe.
    """
    @functools.wraps(fn)
    def inner(*args, **kwargs):
        if "df" in kwargs:
            # Passed by keyword: args[-1] would be `self` or another argument.
            dataframe = kwargs["df"]
        elif args:
            dataframe = args[-1]
        else:
            raise TypeError(f"{fn.__name__}() was called without a dataframe argument")
        dataframe_is_right_format(dataframe)
        return fn(*args, **kwargs)
    return inner