views_stepshifter 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- views_stepshifter-0.1.1/LICENSE +21 -0
- views_stepshifter-0.1.1/PKG-INFO +19 -0
- views_stepshifter-0.1.1/README.md +0 -0
- views_stepshifter-0.1.1/pyproject.toml +19 -0
- views_stepshifter-0.1.1/views_stepshifter/__init__.py +0 -0
- views_stepshifter-0.1.1/views_stepshifter/manager/__pycache__/stepshifter_manager.cpython-311.pyc +0 -0
- views_stepshifter-0.1.1/views_stepshifter/manager/stepshifter_manager.py +287 -0
- views_stepshifter-0.1.1/views_stepshifter/models/__init__.py +2 -0
- views_stepshifter-0.1.1/views_stepshifter/models/__pycache__/__init__.cpython-311.pyc +0 -0
- views_stepshifter-0.1.1/views_stepshifter/models/__pycache__/hurdle_model.cpython-311.pyc +0 -0
- views_stepshifter-0.1.1/views_stepshifter/models/__pycache__/stepshifter.cpython-311.pyc +0 -0
- views_stepshifter-0.1.1/views_stepshifter/models/__pycache__/validation.cpython-311.pyc +0 -0
- views_stepshifter-0.1.1/views_stepshifter/models/hurdle_model.py +94 -0
- views_stepshifter-0.1.1/views_stepshifter/models/stepshifter.py +199 -0
- views_stepshifter-0.1.1/views_stepshifter/models/validation.py +45 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 views platform
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: views_stepshifter
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary:
|
|
5
|
+
Author: Xiaolong Sun
|
|
6
|
+
Author-email: xiaolong.sun@pcr.uu.se
|
|
7
|
+
Requires-Python: >=3.11,<3.15
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
12
|
+
Requires-Dist: darts (>=0.30.0,<0.31.0)
|
|
13
|
+
Requires-Dist: numpy (>=1.25.2,<2.0.0)
|
|
14
|
+
Requires-Dist: pandas (>=1.5.3,<2.0.0)
|
|
15
|
+
Requires-Dist: scikit-learn (>=1.2.2,<2.0.0)
|
|
16
|
+
Requires-Dist: views_pipeline_core (>=0.1.0,<0.2.0)
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+
|
|
File without changes
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "views_stepshifter"
|
|
3
|
+
version = "0.1.1"
|
|
4
|
+
description = ""
|
|
5
|
+
authors = ["Xiaolong Sun <xiaolong.sun@pcr.uu.se>"]
|
|
6
|
+
readme = "README.md"
|
|
7
|
+
|
|
8
|
+
[tool.poetry.dependencies]
|
|
9
|
+
python = ">=3.11,<3.15"
|
|
10
|
+
views_pipeline_core = "^0.1.0"
|
|
11
|
+
scikit-learn = "^1.2.2"
|
|
12
|
+
pandas = "^1.5.3"
|
|
13
|
+
numpy = "^1.25.2"
|
|
14
|
+
darts = "^0.30.0"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
[build-system]
|
|
18
|
+
requires = ["poetry-core"]
|
|
19
|
+
build-backend = "poetry.core.masonry.api"
|
|
File without changes
|
views_stepshifter-0.1.1/views_stepshifter/manager/__pycache__/stepshifter_manager.cpython-311.pyc
ADDED
|
Binary file
|
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
from views_pipeline_core.managers.model_manager import ModelManager
|
|
2
|
+
from views_pipeline_core.managers.path_manager import ModelPath
|
|
3
|
+
from views_pipeline_core.models.outputs import generate_output_dict
|
|
4
|
+
from views_pipeline_core.files.utils import read_log_file, create_log_file
|
|
5
|
+
from views_pipeline_core.wandb.utils import add_wandb_monthly_metrics, generate_wandb_log_dict, log_wandb_log_dict
|
|
6
|
+
from views_pipeline_core.evaluation.metrics import generate_metric_dict
|
|
7
|
+
from views_stepshifter.models.stepshifter import StepshifterModel
|
|
8
|
+
from views_stepshifter.models.hurdle_model import HurdleModel
|
|
9
|
+
from views_forecasts.extensions import *
|
|
10
|
+
import logging
|
|
11
|
+
import pandas as pd
|
|
12
|
+
import numpy as np
|
|
13
|
+
import time
|
|
14
|
+
import wandb
|
|
15
|
+
from datetime import datetime
|
|
16
|
+
from sklearn.metrics import mean_squared_error
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class StepshifterManager(ModelManager):
    """
    Manage training, evaluation, forecasting and sweeps of stepshifter models.

    The "algorithm" entry of the meta config decides which model is built:
    "HurdleModel" selects HurdleModel, any other value selects StepshifterModel.
    """

    def __init__(self, model_path: ModelPath) -> None:
        """
        Args:
            model_path: Model path object, forwarded to ModelManager.
        """
        super().__init__(model_path)
        self._is_hurdle = self._config_meta["algorithm"] == "HurdleModel"

    def _update_sweep_config(self, args):
        """
        Add the run type and the meta information to the sweep configuration.

        The sweep config is updated in place with the run type from the command line,
        a sweep flag, and the name, depvar and algorithm from the meta config.
        For hurdle models, the classifier and regressor names are added as well.

        Args:
            args: Command line arguments; only args.run_type is read.

        Returns:
            The updated sweep configuration object.
        """

        config = self._config_sweep
        config["parameters"]["run_type"] = {"value": args.run_type}
        config["parameters"]["sweep"] = {"value": True}
        config["parameters"]["name"] = {"value": self._config_meta["name"]}
        config["parameters"]["depvar"] = {"value": self._config_meta["depvar"]}
        config["parameters"]["algorithm"] = {"value": self._config_meta["algorithm"]}

        if self._is_hurdle:
            config["parameters"]["model_clf"] = {"value": self._config_meta["model_clf"]}
            config["parameters"]["model_reg"] = {"value": self._config_meta["model_reg"]}

        return config

    def _split_hurdle_parameters(self, config=None):
        """
        Split the flat "cls_*" / "reg_*" entries of a config into two separate dictionaries,
        one for the classification model and one for the regression model.

        Args:
            config (optional): Mapping holding the flat entries. Defaults to self.config.

        Returns:
            The same config object, with config["parameters"]["clf"] and
            config["parameters"]["reg"] set to the split dictionaries.
        """

        if config is None:
            config = self.config

        cls_dict = {}
        reg_dict = {}

        for key, value in config.items():
            # Strip only the leading prefix; str.replace would also drop later occurrences
            if key.startswith("cls_"):
                cls_dict[key.removeprefix("cls_")] = value
            elif key.startswith("reg_"):
                reg_dict[key.removeprefix("reg_")] = value

        # The config may not have a "parameters" entry yet, so create it when missing
        try:
            parameters = dict(config["parameters"])
        except KeyError:
            parameters = {}
        parameters["clf"] = cls_dict
        parameters["reg"] = reg_dict
        config["parameters"] = parameters

        return config

    def _execute_model_tasks(self, config=None, train=None, eval=None, forecast=None, artifact_name=None):
        """
        Executes various model-related tasks including training, evaluation, and forecasting.

        This function manages the execution of different tasks such as training the model,
        evaluating an existing model, or performing forecasting.
        It also initializes the WandB run. Any exception raised by a task is logged
        (with traceback) and not re-raised; the runtime is logged in every case.

        Args:
            config: Configuration object containing parameters and settings.
            train: Flag to indicate if the model should be trained.
            eval: Flag to indicate if the model should be evaluated.
            forecast: Flag to indicate if forecasting should be performed.
            artifact_name (optional): Specific name of the model artifact to load for evaluation or forecasting.
        """
        start_t = time.time()

        # Initialize WandB
        try:
            with wandb.init(project=self._project, entity=self._entity, config=config):  # project and config ignored when running a sweep

                # add the monthly metrics to WandB
                add_wandb_monthly_metrics()

                # Update config from WandB initialization above
                self.config = wandb.config

                # W&B does not directly support nested dictionaries for hyperparameters
                if self.config["sweep"] and self._is_hurdle:
                    # The helper writes the split dictionaries into self.config["parameters"] itself
                    self._split_hurdle_parameters(self.config)

                if self.config["sweep"]:
                    logger.info(f"Sweeping model {self.config['name']}...")
                    model = self._train_model_artifact()
                    logger.info(f"Evaluating model {self.config['name']}...")
                    self._evaluate_sweep(model)

                if train:
                    logger.info(f"Training model {self.config['name']}...")
                    self._train_model_artifact()

                if eval:
                    logger.info(f"Evaluating model {self.config['name']}...")
                    self._evaluate_model_artifact(artifact_name)

                if forecast:
                    logger.info(f"Forecasting model {self.config['name']}...")
                    self._forecast_model_artifact(artifact_name)
            wandb.finish()
        except Exception as e:
            # logger.exception keeps the ERROR level but also records the traceback
            logger.exception(f"Error during model tasks execution: {e}")
        end_t = time.time()
        minutes = (end_t - start_t) / 60
        logger.info(f"Done. Runtime: {minutes:.3f} minutes.\n")

    def _get_model(self, partitioner_dict: dict):
        """
        Get the model based on the algorithm specified in the config

        Args:
            partitioner_dict: The dictionary of partitioners.

        Returns:
            A HurdleModel for hurdle algorithms, otherwise a StepshifterModel
            whose regressor is the configured algorithm.
        """

        if self._is_hurdle:
            model = HurdleModel(self.config, partitioner_dict)
        else:
            # StepshifterModel reads its estimator name from "model_reg"
            self.config["model_reg"] = self.config["algorithm"]
            model = StepshifterModel(self.config, partitioner_dict)

        return model

    def _get_standardized_df(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Standardize the DataFrame based on the run type

        Infinite values are replaced with 0, only the columns relevant for the
        run type are kept, and negative values are replaced with 0.

        Args:
            df: The DataFrame to standardize

        Returns:
            The standardized DataFrame

        Raises:
            ValueError: If the configured run type is not calibration, testing or forecasting.
        """

        run_type = self.config["run_type"]
        depvar = self.config["depvar"]

        # choose the columns to keep based on the run type
        if run_type in ["calibration", "testing"]:
            cols = [depvar] + df.forecasts.prediction_columns
        elif run_type == "forecasting":
            cols = ["step_pred_combined", depvar]
        else:
            raise ValueError(f"Unsupported run type '{run_type}'; "
                             f"expected 'calibration', 'testing' or 'forecasting'.")

        df = df.replace([np.inf, -np.inf], 0)[cols]
        # replace negative values with 0
        df = df.mask(df < 0, 0)
        return df

    def _train_model_artifact(self):
        """
        Fit a model on the raw viewser data of the current run type.

        Outside of sweeps the fitted model is saved as an artifact and a log file is written.

        Returns:
            The fitted model.
        """
        path_raw = self._model_path.data_raw
        path_generated = self._model_path.data_generated
        path_artifacts = self._model_path.artifacts
        run_type = self.config["run_type"]
        df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl")

        partitioner_dict = self._data_loader.partition_dict
        stepshift_model = self._get_model(partitioner_dict)
        stepshift_model.fit(df_viewser)

        if not self.config["sweep"]:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            model_filename = ModelManager._generate_model_file_name(run_type, timestamp)
            stepshift_model.save(path_artifacts / model_filename)
            data_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None)
            create_log_file(path_generated, self.config, timestamp, None, data_fetch_timestamp)
        return stepshift_model

    # The two helpers below use name mangling (leading double underscore) so they
    # cannot clash with helpers defined on ModelManager, which is not visible here.
    def __resolve_artifact_path(self, artifact_name, run_type):
        """
        Return the path of the model artifact to load.

        If an artifact name is provided through the CLI, use it (appending ".pkl" when missing).
        Otherwise, get the latest model artifact based on the run type.
        """
        path_artifacts = self._model_path.artifacts

        if artifact_name:
            logger.info(f"Using (non-default) artifact: {artifact_name}")

            if not artifact_name.endswith(".pkl"):
                artifact_name += ".pkl"
            return path_artifacts / artifact_name

        # use the latest model artifact based on the run type
        logger.info(f"Using latest (default) run type ({run_type}) specific artifact")
        return self._get_latest_model_artifact(path_artifacts, run_type)

    @staticmethod
    def __load_model_artifact(path_artifact):
        """
        Load a pickled model artifact.

        Raises:
            FileNotFoundError: If the artifact does not exist (logged, then re-raised).
        """
        try:
            # NOTE(review): unpickling can execute arbitrary code; only load trusted artifacts
            return pd.read_pickle(path_artifact)
        except FileNotFoundError:
            logger.exception(f"Model artifact not found at {path_artifact}")
            # Without a model nothing below can run, so do not continue silently
            raise

    def _evaluate_model_artifact(self, artifact_name):
        """
        Predict with a stored model artifact, compute the metrics and save all outputs.

        Args:
            artifact_name: Name of the artifact to load; falsy to use the latest one for the run type.

        Raises:
            FileNotFoundError: If the model artifact does not exist.
        """
        path_raw = self._model_path.data_raw
        path_generated = self._model_path.data_generated
        run_type = self.config["run_type"]

        path_artifact = self.__resolve_artifact_path(artifact_name, run_type)

        # The last 15 characters of the file stem are used as the model timestamp
        self.config["timestamp"] = path_artifact.stem[-15:]
        df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl")

        stepshift_model = self.__load_model_artifact(path_artifact)

        df = stepshift_model.predict(run_type, df_viewser)
        df = self._get_standardized_df(df)
        data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        data_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None)

        _, df_output = generate_output_dict(df, self.config)
        evaluation, df_evaluation = generate_metric_dict(df, self.config)
        log_wandb_log_dict(self.config, evaluation)

        self._save_model_outputs(df_evaluation, df_output, path_generated)
        self._save_predictions(df, path_generated)
        create_log_file(path_generated, self.config, self.config["timestamp"], data_generation_timestamp, data_fetch_timestamp)

    def _forecast_model_artifact(self, artifact_name):
        """
        Predict with a stored model artifact and save the predictions.

        Args:
            artifact_name: Name of the artifact to load; falsy to use the latest one for the run type.

        Raises:
            FileNotFoundError: If the model artifact does not exist.
        """
        path_raw = self._model_path.data_raw
        path_generated = self._model_path.data_generated
        run_type = self.config["run_type"]

        path_artifact = self.__resolve_artifact_path(artifact_name, run_type)

        # The last 15 characters of the file stem are used as the model timestamp
        self.config["timestamp"] = path_artifact.stem[-15:]
        df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl")

        stepshift_model = self.__load_model_artifact(path_artifact)

        df_predictions = stepshift_model.predict(run_type, df_viewser)
        df_predictions = self._get_standardized_df(df_predictions)
        data_generation_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        data_fetch_timestamp = read_log_file(path_raw / f"{run_type}_data_fetch_log.txt").get("Data Fetch Timestamp", None)

        self._save_predictions(df_predictions, path_generated)
        create_log_file(path_generated, self.config, self.config["timestamp"], data_generation_timestamp, data_fetch_timestamp)

    def _evaluate_sweep(self, model):
        """
        Evaluate a freshly trained sweep model and log the results to WandB.

        Args:
            model: The fitted model returned by _train_model_artifact.
        """
        path_raw = self._model_path.data_raw
        run_type = self.config["run_type"]
        steps = self.config["steps"]

        df_viewser = pd.read_pickle(path_raw / f"{run_type}_viewser_df.pkl")
        df = model.predict(run_type, df_viewser)
        df = self._get_standardized_df(df)

        # Temporarily keep this because the metric to minimize is MSE
        pred_cols = [f"step_pred_{str(i)}" for i in steps]
        # Per row: MSE between the observed value (repeated per step) and the step predictions
        df["mse"] = df.apply(lambda row: mean_squared_error([row[self.config["depvar"]]] * len(steps),
                                                            [row[col] for col in pred_cols]), axis=1)

        wandb.log({"MSE": df["mse"].mean()})

        evaluation, _ = generate_metric_dict(df, self.config)
        log_wandb_log_dict(self.config, evaluation)
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from .stepshifter import StepshifterModel
|
|
3
|
+
from .validation import views_validate
|
|
4
|
+
from views_forecasts.extensions import *
|
|
5
|
+
from sklearn.utils.validation import check_is_fitted
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class HurdleModel(StepshifterModel):
    """
    Hurdle model for time series forecasting. The model consists of two stages:
    1. Binary stage: Predicts whether the target variable is 0 or > 0.
    2. Positive stage: Predicts the value of the target variable when it is > 0.

    Note:
    This algorithm uses a two-step approach.

    **Step 1: Classification Stage**
    In the first step, a regression model is used with a binary target (0 or 1),
    indicating the absence or presence of violence. This stage functions similarly
    to a linear probability model, estimating the likelihood of a positive outcome.
    Since the model is a regression rather than a classification model,
    these estimates are not strictly bounded between 0 and 1,
    but this is acceptable for the purpose of this step.

    To build the binary target, a threshold is applied to the observed target values:
    values above the threshold become 1, all others 0.
    The default threshold is 1. This threshold can be adjusted as
    a tunable hyperparameter to better suit specific requirements.

    **Step 2: Regression Stage**
    In the second step, we use a regression model to predict a continuous or count value
    (e.g., the expected number of conflict fatalities) for the selected time series.
    We include the entire time series for countries or PRIO grids whose training target
    exceeds the threshold at least once, rather than limiting the regression
    to just the positive values.

    The final prediction is the product of the predictions of the two stages.
    """

    def __init__(self, config: Dict, partitioner_dict: Dict[str, List[int]], threshold: float = 1.0):
        """
        Args:
            config: Configuration with at least 'model_clf', 'model_reg' and the
                entries required by StepshifterModel.
            partitioner_dict: Dictionary with the 'train' and 'predict' partitions.
            threshold: Target values above this value count as positive.
        """
        super().__init__(config, partitioner_dict)
        self._clf = self._resolve_estimator(config['model_clf'])
        self._reg = self._resolve_estimator(config['model_reg'])

        parameters = self._get_parameters(config)
        # In a sweep the manager stores the split clf/reg dictionaries under
        # config["parameters"], which _get_parameters leaves nested
        if 'clf' not in parameters and 'parameters' in parameters:
            parameters = parameters['parameters']
        self._clf_params = parameters['clf']
        self._reg_params = parameters['reg']
        self._threshold = threshold

    @views_validate
    def fit(self, df: pd.DataFrame):
        """
        Fit one (binary, positive) model pair per step.

        Args:
            df: Training data in VIEWS format.

        Raises:
            ValueError: If no time series has a target value above the threshold.
        """
        df = self._process_data(df)
        self._prepare_time_series(df)

        # Binary outcome (event/no-event)
        # According to the DARTS doc, if timeseries uses a numeric type different from np.float32 or np.float64, not all functionalities may work properly.
        # So use astype(float) instead of astype(int) (we should have binary outputs 0,1 though)
        target_binary = [s.map(lambda x: (x > self._threshold).astype(float)) for s in self._target_train]

        # Positive outcome (for cases where target > threshold)
        positive_pairs = [(t, p) for t, p in zip(self._target_train, self._past_cov_train)
                          if (t.values() > self._threshold).any()]
        if not positive_pairs:
            # zip(*[]) would otherwise fail with an opaque unpacking error
            raise ValueError(f"No time series has a target value above the threshold ({self._threshold}); "
                             f"cannot fit the positive stage.")
        target_pos, past_cov_pos = zip(*positive_pairs)

        for step in self._steps:
            # Fit binary-like stage using a regression model, but the target is binary (0 or 1)
            binary_model = self._clf(lags_past_covariates=[-step], **self._clf_params)
            binary_model.fit(target_binary, past_covariates=self._past_cov_train)

            # Fit positive stage using the regression model
            positive_model = self._reg(lags_past_covariates=[-step], **self._reg_params)
            positive_model.fit(target_pos, past_covariates=past_cov_pos)
            self._models[step] = (binary_model, positive_model)
        self.is_fitted_ = True

    @views_validate
    def predict(self, run_type: str, df: pd.DataFrame) -> pd.DataFrame:
        """
        Predict with both stages and multiply their predictions.

        Args:
            run_type: 'forecasting' for combined (one row per step) predictions;
                any other value yields one prediction column per step.
            df: Data in VIEWS format.

        Returns:
            The predictions together with a column for the target variable.
        """
        df = self._process_data(df)
        check_is_fitted(self, 'is_fitted_')

        if run_type == 'forecasting':
            pred_by_step_binary = [self._predict_by_step_combined(self._models[step][0], step, self._target_train)
                                   for step in self._steps]
            pred_by_step_positive = [self._predict_by_step_combined(self._models[step][1], step, self._target_train)
                                     for step in self._steps]
            final_pred = pd.concat(pred_by_step_binary, axis=0) * pd.concat(pred_by_step_positive, axis=0)
            # Add the target variable to the predictions to make sure it is a VIEWS prediction
            # If it is a forecasting run, the target variable is not available in the input data so we fill it with NaN
            final_pred[self._depvar] = np.nan

        else:
            pred_by_step_binary = [self._predict_by_step(self._models[step][0], step, self._target_train)
                                   for step in self._steps]
            pred_by_step_positive = [self._predict_by_step(self._models[step][1], step, self._target_train)
                                     for step in self._steps]
            final_pred = pd.concat(pred_by_step_binary, axis=1) * pd.concat(pred_by_step_positive, axis=1)
            final_pred = pd.merge(final_pred, df[self._depvar], left_index=True, right_index=True)

        return final_pred
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
import pickle
|
|
2
|
+
import numpy as np
|
|
3
|
+
import logging
|
|
4
|
+
from darts import TimeSeries
|
|
5
|
+
from sklearn.utils.validation import check_is_fitted
|
|
6
|
+
from typing import List, Dict
|
|
7
|
+
from views_forecasts.extensions import *
|
|
8
|
+
from .validation import views_validate
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class StepshifterModel:
|
|
14
|
+
def __init__(self, config: Dict, partitioner_dict: Dict[str, List[int]]):
|
|
15
|
+
self._steps = config['steps']
|
|
16
|
+
self._depvar = config['depvar']
|
|
17
|
+
self._reg = self._resolve_estimator(config['model_reg'])
|
|
18
|
+
self._params = self._get_parameters(config)
|
|
19
|
+
self._train_start, self._train_end = partitioner_dict['train']
|
|
20
|
+
self._test_start, self._test_end = partitioner_dict['predict']
|
|
21
|
+
self._models = {}
|
|
22
|
+
|
|
23
|
+
@staticmethod
|
|
24
|
+
def _resolve_estimator(func_name: str):
|
|
25
|
+
""" Lookup table for supported estimators.
|
|
26
|
+
This is necessary because sklearn estimator default arguments
|
|
27
|
+
must pass equality test, and instantiated sub-estimators are not equal. """
|
|
28
|
+
|
|
29
|
+
match func_name:
|
|
30
|
+
case 'LinearRegressionModel':
|
|
31
|
+
from darts.models import LinearRegressionModel
|
|
32
|
+
return LinearRegressionModel
|
|
33
|
+
case 'RandomForestModel':
|
|
34
|
+
from darts.models import RandomForest
|
|
35
|
+
return RandomForest
|
|
36
|
+
case 'LightGBMModel':
|
|
37
|
+
from darts.models import LightGBMModel
|
|
38
|
+
return LightGBMModel
|
|
39
|
+
case 'XGBModel':
|
|
40
|
+
from darts.models import XGBModel
|
|
41
|
+
return XGBModel
|
|
42
|
+
case _:
|
|
43
|
+
raise ValueError(f"Model {func_name} is not a valid Darts forecasting model or is not supported now. "
|
|
44
|
+
f"Change the model in the config file.")
|
|
45
|
+
|
|
46
|
+
@staticmethod
|
|
47
|
+
def _get_parameters(config: Dict):
|
|
48
|
+
'''
|
|
49
|
+
Get the parameters from the config file.
|
|
50
|
+
If not sweep, then get directly from the config file, otherwise have to remove some parameters.
|
|
51
|
+
'''
|
|
52
|
+
|
|
53
|
+
if config["sweep"]:
|
|
54
|
+
keys_to_remove = ["algorithm", "depvar", "steps", "sweep", "run_type", "model_cls", "model_reg", "name"]
|
|
55
|
+
parameters = {k: v for k, v in config.items() if k not in keys_to_remove}
|
|
56
|
+
else:
|
|
57
|
+
parameters = config["parameters"]
|
|
58
|
+
|
|
59
|
+
return parameters
|
|
60
|
+
|
|
61
|
+
def _process_data(self, df: pd.DataFrame):
|
|
62
|
+
'''
|
|
63
|
+
Countries appear and disappear, so we are predicting countries that exist in the last month of the training data.
|
|
64
|
+
If the country appeared earlier but don't have data previously, we will fill the missing data with 0.
|
|
65
|
+
'''
|
|
66
|
+
|
|
67
|
+
# set up
|
|
68
|
+
self._time = df.index.names[0]
|
|
69
|
+
self._level = df.index.names[1]
|
|
70
|
+
self._independent_variables = [c for c in df.columns if c != self._depvar]
|
|
71
|
+
|
|
72
|
+
last_month_id = df.index.get_level_values(self._time).max()
|
|
73
|
+
existing_country_ids = df.loc[last_month_id].index.unique()
|
|
74
|
+
df = df[df.index.get_level_values(self._level).isin(existing_country_ids)]
|
|
75
|
+
|
|
76
|
+
all_months = df.index.get_level_values(self._time).unique()
|
|
77
|
+
all_combinations = pd.MultiIndex.from_product([all_months, existing_country_ids], names=[self._time, self._level])
|
|
78
|
+
missing_combinations = all_combinations.difference(df.index)
|
|
79
|
+
|
|
80
|
+
missing_df = pd.DataFrame(0, index=missing_combinations, columns=df.columns)
|
|
81
|
+
df = pd.concat([df, missing_df]).sort_index()
|
|
82
|
+
|
|
83
|
+
return df
|
|
84
|
+
|
|
85
|
+
def _prepare_time_series(self, df: pd.DataFrame):
|
|
86
|
+
'''
|
|
87
|
+
Prepare time series for training and prediction
|
|
88
|
+
'''
|
|
89
|
+
|
|
90
|
+
df_reset = df.reset_index(level=[1])
|
|
91
|
+
self._series = TimeSeries.from_group_dataframe(df_reset, group_cols=self._level,
|
|
92
|
+
value_cols=self._independent_variables + [self._depvar])
|
|
93
|
+
|
|
94
|
+
self._target_train = [series.slice(self._train_start, self._train_end + 1)[self._depvar]
|
|
95
|
+
for series in self._series] # ts.slice is different from df.slice
|
|
96
|
+
self._past_cov_train = [series.slice(self._train_start, self._train_end + 1)[self._independent_variables]
|
|
97
|
+
for series in self._series]
|
|
98
|
+
self._past_cov = [series[self._independent_variables] for series in self._series]
|
|
99
|
+
|
|
100
|
+
def _predict_by_step(self, model, step, target):
|
|
101
|
+
ts_pred = model.predict(n=self._test_end - self._test_start + 1,
|
|
102
|
+
series=target,
|
|
103
|
+
# darts automatically locates the time period of past_covariates
|
|
104
|
+
past_covariates=self._past_cov,
|
|
105
|
+
show_warnings=False)
|
|
106
|
+
|
|
107
|
+
# process the predictions
|
|
108
|
+
index_tuples, df_list = [], []
|
|
109
|
+
test_period = slice(self._test_start, self._test_end)
|
|
110
|
+
|
|
111
|
+
for pred in ts_pred:
|
|
112
|
+
df_pred = pred.pd_dataframe().loc[test_period]
|
|
113
|
+
level = pred.static_covariates.iat[0, 0]
|
|
114
|
+
index_tuples.extend([(month, level) for month in df_pred.index])
|
|
115
|
+
df_list.append(df_pred.values)
|
|
116
|
+
|
|
117
|
+
df_preds = pd.DataFrame(
|
|
118
|
+
data=np.concatenate(df_list),
|
|
119
|
+
index=pd.MultiIndex.from_tuples(index_tuples, names=[self._time, self._level]),
|
|
120
|
+
columns=[f"step_pred_{step}"]
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
return df_preds.sort_index()
|
|
124
|
+
|
|
125
|
+
def _predict_by_step_combined(self, model, step, target):
|
|
126
|
+
'''
|
|
127
|
+
For forecasting only need to keep predictions with last-month-with-data, i.e., diagonal prediction
|
|
128
|
+
'''
|
|
129
|
+
|
|
130
|
+
ts_pred = model.predict(n=step,
|
|
131
|
+
series=target,
|
|
132
|
+
# darts automatically locates the time period of past_covariates
|
|
133
|
+
past_covariates=self._past_cov,
|
|
134
|
+
show_warnings=False)
|
|
135
|
+
|
|
136
|
+
# process the predictions
|
|
137
|
+
index_tuples, df_list = [], []
|
|
138
|
+
for pred in ts_pred:
|
|
139
|
+
df_pred = pred.pd_dataframe().loc[[self._test_start + step - 1]]
|
|
140
|
+
level = pred.static_covariates.iat[0, 0]
|
|
141
|
+
index_tuples.extend([(month, level) for month in df_pred.index])
|
|
142
|
+
df_list.append(df_pred.values)
|
|
143
|
+
|
|
144
|
+
df_preds = pd.DataFrame(
|
|
145
|
+
data=np.concatenate(df_list),
|
|
146
|
+
index=pd.MultiIndex.from_tuples(index_tuples, names=[self._time, self._level]),
|
|
147
|
+
columns=["step_pred_combined"]
|
|
148
|
+
)
|
|
149
|
+
|
|
150
|
+
return df_preds.sort_index()
|
|
151
|
+
|
|
152
|
+
@views_validate
|
|
153
|
+
def fit(self, df: pd.DataFrame):
|
|
154
|
+
df = self._process_data(df)
|
|
155
|
+
self._prepare_time_series(df)
|
|
156
|
+
for step in self._steps:
|
|
157
|
+
model = self._reg(lags_past_covariates=[-step], **self._params)
|
|
158
|
+
model.fit(self._target_train, past_covariates=self._past_cov_train)
|
|
159
|
+
self._models[step] = model
|
|
160
|
+
self.is_fitted_ = True
|
|
161
|
+
|
|
162
|
+
@views_validate
|
|
163
|
+
def predict(self, run_type: str, df: pd.DataFrame) -> pd.DataFrame:
|
|
164
|
+
df = self._process_data(df)
|
|
165
|
+
check_is_fitted(self, 'is_fitted_')
|
|
166
|
+
|
|
167
|
+
if run_type == 'forecasting':
|
|
168
|
+
pred_by_step = [self._predict_by_step_combined(self._models[step], step, self._target_train) for step in self._steps]
|
|
169
|
+
pred = pd.concat(pred_by_step, axis=0)
|
|
170
|
+
# Add the target variable to the predictions to make sure it is a VIEWS prediction
|
|
171
|
+
# If it is a forecasting run, the target variable is not available in the input data so we fill it with NaN
|
|
172
|
+
pred[self._depvar] = np.nan
|
|
173
|
+
|
|
174
|
+
else:
|
|
175
|
+
pred_by_step = [self._predict_by_step(self._models[step], step, self._target_train) for step in self._steps]
|
|
176
|
+
pred = pd.concat(pred_by_step, axis=1)
|
|
177
|
+
pred = pd.merge(pred, df[self._depvar], left_index=True, right_index=True)
|
|
178
|
+
|
|
179
|
+
return pred
|
|
180
|
+
|
|
181
|
+
def save(self, path: str):
|
|
182
|
+
try:
|
|
183
|
+
with open(path, "wb") as file:
|
|
184
|
+
pickle.dump(self, file)
|
|
185
|
+
logger.info(f"Model successfully saved to {path}")
|
|
186
|
+
except Exception as e:
|
|
187
|
+
logger.exception(f"Failed to save model: {e}")
|
|
188
|
+
|
|
189
|
+
@property
|
|
190
|
+
def models(self):
|
|
191
|
+
return list(self._models.values())
|
|
192
|
+
|
|
193
|
+
@property
|
|
194
|
+
def steps(self):
|
|
195
|
+
return self._steps
|
|
196
|
+
|
|
197
|
+
@property
|
|
198
|
+
def depvar(self):
|
|
199
|
+
return self._depvar
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import functools
|
|
2
|
+
import numpy as np
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import logging
|
|
5
|
+
from views_forecasts.extensions import *
|
|
6
|
+
|
|
7
|
+
logger = logging.getLogger(__name__)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def dataframe_is_right_format(dataframe: pd.DataFrame) -> None:
    """
    Check that a dataframe has the format expected by the models.

    The dataframe must have a two-level index whose first level is named
    'month_id' and whose second level is named 'country_id' or 'priogrid_gid',
    and all of its columns must be np.float64.

    Args:
        dataframe: The dataframe to check.

    Raises:
        AssertionError: If any of the checks fails. The message is also logged.
    """
    # Same logger as the module-level one (both are looked up by __name__)
    log = logging.getLogger(__name__)

    def fail(message):
        # Explicit raise instead of `assert`, which is stripped under `python -O`
        log.error(message)
        raise AssertionError(message)

    index = dataframe.index

    # nlevels also exists on a flat Index, which has no `levels` attribute
    if index.nlevels != 2:
        fail("Dataframe must have a two-level index")

    if index.names[0] != "month_id":
        fail("The first level of the index must be 'month_id'")

    if index.names[1] not in ["country_id", "priogrid_gid"]:
        fail("The second level of the index must be 'country_id' or 'priogrid_gid'")

    if set(dataframe.dtypes) != {np.dtype(float)}:
        fail("The dataframe must contain only np.float64 floats")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def views_validate(fn):
    """
    Decorator that validates the dataframe argument before calling fn.

    The dataframe is taken from the keyword argument 'df' when present,
    otherwise from the last positional argument.
    """
    @functools.wraps(fn)
    def inner(*args, **kwargs):
        # A dataframe passed as df=... is not in args, where args[-1] would be another argument
        dataframe = kwargs["df"] if "df" in kwargs else args[-1]
        dataframe_is_right_format(dataframe)
        return fn(*args, **kwargs)
    return inner
|