autofuzzts 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autofuzzts/config.py +17 -17
- autofuzzts/data/data_loader.py +7 -7
- autofuzzts/data_validation/validate.py +41 -41
- autofuzzts/models/fuzzy_classifier.py +82 -82
- autofuzzts/models/mlp_nas.py +90 -90
- autofuzzts/partition/{fuzzy_clust_fun.py → fuzzy_part_fun.py} +107 -107
- autofuzzts/partition/partition.py +109 -109
- autofuzzts/partition/visualize_partition.py +32 -32
- autofuzzts/pipeline.py +469 -469
- autofuzzts/preprocess/prep_for_model.py +70 -70
- autofuzzts/preprocess/preprocess.py +62 -62
- {autofuzzts-0.1.1.dist-info → autofuzzts-0.1.3.dist-info}/METADATA +161 -146
- autofuzzts-0.1.3.dist-info/RECORD +23 -0
- {autofuzzts-0.1.1.dist-info → autofuzzts-0.1.3.dist-info}/WHEEL +1 -1
- {autofuzzts-0.1.1.dist-info → autofuzzts-0.1.3.dist-info}/licenses/LICENSE +21 -21
- autofuzzts/partition/fuzzy_clust_fun_orig.py +0 -129
- autofuzzts/utils.py +0 -1
- autofuzzts-0.1.1.dist-info/RECORD +0 -25
- {autofuzzts-0.1.1.dist-info → autofuzzts-0.1.3.dist-info}/top_level.txt +0 -0
autofuzzts/pipeline.py
CHANGED
|
@@ -1,469 +1,469 @@
|
|
|
1
|
-
# pipeline.py
|
|
2
|
-
import pandas as pd
|
|
3
|
-
from typing import Dict, Literal
|
|
4
|
-
import optuna
|
|
5
|
-
import numpy as np
|
|
6
|
-
|
|
7
|
-
from autofuzzts.config import get_config
|
|
8
|
-
from autofuzzts.data import data_loader
|
|
9
|
-
from autofuzzts.data_validation.validate import validate_and_clean_input
|
|
10
|
-
from autofuzzts.partition.partition import FuzzyPartition
|
|
11
|
-
from autofuzzts.preprocess.preprocess import preprocess_data, preprocess_data_val
|
|
12
|
-
from autofuzzts.preprocess.prep_for_model import (
|
|
13
|
-
prepare_for_model,
|
|
14
|
-
prepare_for_model_val_set,
|
|
15
|
-
)
|
|
16
|
-
from autofuzzts.models.fuzzy_classifier import FuzzyPipelineModel
|
|
17
|
-
|
|
18
|
-
from sklearn.model_selection import ParameterGrid
|
|
19
|
-
from sklearn.calibration import CalibratedClassifierCV
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
## Import RMSE and MAE
|
|
23
|
-
from sklearn.metrics import (
|
|
24
|
-
root_mean_squared_error,
|
|
25
|
-
mean_absolute_error,
|
|
26
|
-
mean_squared_error,
|
|
27
|
-
)
|
|
28
|
-
|
|
29
|
-
# Example custom configuration
|
|
30
|
-
custom_config = {
|
|
31
|
-
"
|
|
32
|
-
"verbosity": True,
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
# Retrieve the final configuration
|
|
36
|
-
selected_config = get_config(custom_config)
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
def run_pipeline(datasetet_name: str, config: dict = selected_config):
|
|
40
|
-
# Load data
|
|
41
|
-
|
|
42
|
-
data = data_loader.load_sample_data(datasetet_name)
|
|
43
|
-
print(data.head(5))
|
|
44
|
-
print("Evaluated configuration is")
|
|
45
|
-
print(config)
|
|
46
|
-
|
|
47
|
-
pass
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
def train_val_pipeline(
|
|
51
|
-
train_set: pd.DataFrame,
|
|
52
|
-
val_set: pd.DataFrame,
|
|
53
|
-
config: Dict = selected_config,
|
|
54
|
-
metric: Literal["rmse", "mse", "mae"] = "rmse",
|
|
55
|
-
diff_type: Literal["perc", "abs"] = "perc",
|
|
56
|
-
covariates: list[str] = None,
|
|
57
|
-
) -> float:
|
|
58
|
-
train_set = validate_and_clean_input(train_set, covariates)
|
|
59
|
-
val_set = validate_and_clean_input(val_set, covariates)
|
|
60
|
-
|
|
61
|
-
print("train set length:", len(train_set))
|
|
62
|
-
|
|
63
|
-
if covariates:
|
|
64
|
-
train_covariates = train_set[covariates].copy()
|
|
65
|
-
val_covariates = val_set[covariates].copy()
|
|
66
|
-
|
|
67
|
-
train_set_preprocessed, scaler_train = preprocess_data(train_set, diff_type)
|
|
68
|
-
val_set_preprocessed = preprocess_data_val(
|
|
69
|
-
df=val_set, df_train=train_set, diff_type=diff_type, scaler=scaler_train
|
|
70
|
-
)
|
|
71
|
-
|
|
72
|
-
fp = FuzzyPartition(
|
|
73
|
-
fuzzy_function=config["fuzzy_part_func"],
|
|
74
|
-
|
|
75
|
-
sigma=config["sigma"],
|
|
76
|
-
scaler=scaler_train,
|
|
77
|
-
verbosity=config["verbosity"],
|
|
78
|
-
)
|
|
79
|
-
|
|
80
|
-
# Prepare train and validation fuzzy partitions
|
|
81
|
-
X_training = train_set_preprocessed["diff_scaled"].values
|
|
82
|
-
X_validation = val_set_preprocessed["diff_scaled"].values
|
|
83
|
-
|
|
84
|
-
train_fuzzy_partition, _, _ = fp.fuzzy_partition(X_training)
|
|
85
|
-
val_fuzzy_partition, _, center_points_unscaled_test_val = fp.fuzzy_partition(
|
|
86
|
-
X_validation
|
|
87
|
-
)
|
|
88
|
-
|
|
89
|
-
X_train, y_train = prepare_for_model(
|
|
90
|
-
train_fuzzy_partition.copy(), config["number_of_lags"]
|
|
91
|
-
)
|
|
92
|
-
X_val, y_val = prepare_for_model_val_set(
|
|
93
|
-
df_val_fp=val_fuzzy_partition.copy(),
|
|
94
|
-
df_train_fp=train_fuzzy_partition.copy(),
|
|
95
|
-
n_lags=config["number_of_lags"],
|
|
96
|
-
)
|
|
97
|
-
|
|
98
|
-
if covariates:
|
|
99
|
-
X_train = pd.concat([X_train, train_covariates], axis=1)
|
|
100
|
-
X_val = pd.concat([X_val, val_covariates], axis=1)
|
|
101
|
-
|
|
102
|
-
model = FuzzyPipelineModel(
|
|
103
|
-
|
|
104
|
-
number_of_lags=config["number_of_lags"],
|
|
105
|
-
verbosity=config["verbosity"],
|
|
106
|
-
)
|
|
107
|
-
|
|
108
|
-
model.fit(X_train, y_train, model_type="xgb")
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
## Convert prediction to crips number using center points of
|
|
113
|
-
y_val_pred_center_point = [center_points_unscaled_test_val[i] for i in
|
|
114
|
-
|
|
115
|
-
## Recalculate percentage difference to actual values
|
|
116
|
-
y_val_pred = [None] * len(val_set)
|
|
117
|
-
|
|
118
|
-
# Set the first prediction using the last known value from the train set
|
|
119
|
-
last_train_value = train_set["Y"].iloc[
|
|
120
|
-
-1
|
|
121
|
-
] # Assuming `df_train` holds the training data
|
|
122
|
-
y_val_pred[0] = last_train_value * (1 + y_val_pred_center_point[0])
|
|
123
|
-
|
|
124
|
-
# Loop to calculate each subsequent prediction based on the actual previous value in `df_test['Y']`
|
|
125
|
-
|
|
126
|
-
if diff_type == "perc":
|
|
127
|
-
for i in range(1, len(val_set)):
|
|
128
|
-
prev_Y = val_set["Y"].iloc[
|
|
129
|
-
i - 1
|
|
130
|
-
] # Use the previous actual value from `df_test`
|
|
131
|
-
perc_change = y_val_pred_center_point[i]
|
|
132
|
-
y_val_pred[i] = prev_Y * (1 + perc_change)
|
|
133
|
-
|
|
134
|
-
elif diff_type == "abs":
|
|
135
|
-
for i in range(1, len(val_set)):
|
|
136
|
-
prev_Y = val_set["Y"].iloc[i - 1]
|
|
137
|
-
y_val_pred[i] = prev_Y + y_val_pred_center_point[i]
|
|
138
|
-
|
|
139
|
-
if metric == "rmse":
|
|
140
|
-
metric_value = root_mean_squared_error(val_set["Y"], y_val_pred)
|
|
141
|
-
elif metric == "mse":
|
|
142
|
-
metric_value = root_mean_squared_error(val_set["Y"], y_val_pred) ** 2
|
|
143
|
-
elif metric == "mae":
|
|
144
|
-
metric_value = mean_absolute_error(val_set["Y"], y_val_pred)
|
|
145
|
-
else:
|
|
146
|
-
raise ValueError(
|
|
147
|
-
f"Invalid metric {metric}. Please choose one of 'rmse', 'mse', 'mae'"
|
|
148
|
-
)
|
|
149
|
-
|
|
150
|
-
return metric_value
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
def train_model(
|
|
154
|
-
dataset: pd.DataFrame,
|
|
155
|
-
config: Dict,
|
|
156
|
-
model_type: Literal["xgb", "mlp", "tpot"] = "xgb",
|
|
157
|
-
):
|
|
158
|
-
"""
|
|
159
|
-
Function to train a model on the dataset provided.
|
|
160
|
-
|
|
161
|
-
Parameters:
|
|
162
|
-
dataset: pd.DataFrame
|
|
163
|
-
The dataset to train the model on.
|
|
164
|
-
config: dict
|
|
165
|
-
The configuration dictionary for the model.
|
|
166
|
-
model_type: str
|
|
167
|
-
The type of model to train. Default is 'xgb'.
|
|
168
|
-
|
|
169
|
-
"""
|
|
170
|
-
config = get_config(config)
|
|
171
|
-
|
|
172
|
-
df = validate_and_clean_input(dataset)
|
|
173
|
-
|
|
174
|
-
df_preprocessed, scaler_train = preprocess_data(df, diff_type="perc")
|
|
175
|
-
|
|
176
|
-
fp = FuzzyPartition(
|
|
177
|
-
fuzzy_function=config["fuzzy_part_func"],
|
|
178
|
-
|
|
179
|
-
sigma=config["sigma"],
|
|
180
|
-
scaler=scaler_train,
|
|
181
|
-
verbosity=config["verbosity"],
|
|
182
|
-
)
|
|
183
|
-
|
|
184
|
-
X_training = df_preprocessed["diff_scaled"].values
|
|
185
|
-
|
|
186
|
-
train_fuzzy_partition, _, _ = fp.fuzzy_partition(X_training)
|
|
187
|
-
|
|
188
|
-
X_train, y_train = prepare_for_model(
|
|
189
|
-
train_fuzzy_partition.copy(), config["number_of_lags"]
|
|
190
|
-
)
|
|
191
|
-
|
|
192
|
-
model_train = FuzzyPipelineModel(
|
|
193
|
-
|
|
194
|
-
number_of_lags=config["number_of_lags"],
|
|
195
|
-
verbosity=config["verbosity"],
|
|
196
|
-
)
|
|
197
|
-
|
|
198
|
-
model_train.fit(X_train, y_train, model_type=model_type)
|
|
199
|
-
|
|
200
|
-
return model_train, scaler_train
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
def tune_hyperparameters_bayes(
|
|
204
|
-
train_set: pd.DataFrame,
|
|
205
|
-
val_set: pd.DataFrame,
|
|
206
|
-
n_trials: int = 315,
|
|
207
|
-
metric: Literal["rmse", "mse", "mae"] = "rmse",
|
|
208
|
-
diff_type: Literal["perc", "abs"] = "perc",
|
|
209
|
-
covariates: list[str] = None,
|
|
210
|
-
):
|
|
211
|
-
def objective(trial):
|
|
212
|
-
# Define search space based on your specifications
|
|
213
|
-
config = {
|
|
214
|
-
"
|
|
215
|
-
"
|
|
216
|
-
), # Number of fuzzy sets
|
|
217
|
-
"number_of_lags": trial.suggest_int(
|
|
218
|
-
"number_of_lags", 1, 10
|
|
219
|
-
), # Number of lags
|
|
220
|
-
"fuzzy_part_func": trial.suggest_categorical(
|
|
221
|
-
"fuzzy_part_func", ["Triangle", "Cosine", "Gaussian"]
|
|
222
|
-
), # Partition function type
|
|
223
|
-
}
|
|
224
|
-
|
|
225
|
-
if config["fuzzy_part_func"] == "Gaussian":
|
|
226
|
-
config["sigma"] = trial.suggest_float("sigma", 0.1, 4, log=True)
|
|
227
|
-
else:
|
|
228
|
-
config["sigma"] = None
|
|
229
|
-
|
|
230
|
-
selected_config = get_config(config)
|
|
231
|
-
|
|
232
|
-
# Use train_val_pipeline to evaluate this configuration
|
|
233
|
-
metric_value = train_val_pipeline(
|
|
234
|
-
train_set,
|
|
235
|
-
val_set,
|
|
236
|
-
selected_config,
|
|
237
|
-
metric,
|
|
238
|
-
diff_type,
|
|
239
|
-
covariates=covariates,
|
|
240
|
-
)
|
|
241
|
-
return metric_value
|
|
242
|
-
|
|
243
|
-
# Create and optimize the Optuna study
|
|
244
|
-
study = optuna.create_study(direction="minimize")
|
|
245
|
-
study.optimize(objective, n_trials=n_trials)
|
|
246
|
-
|
|
247
|
-
# Extract the best configuration and score
|
|
248
|
-
best_config = study.best_params
|
|
249
|
-
best_metric_value = study.best_value
|
|
250
|
-
|
|
251
|
-
print(f"Best Config: {best_config}")
|
|
252
|
-
print(f"Best {metric.upper()}: {best_metric_value}")
|
|
253
|
-
return best_config, best_metric_value
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
def tune_hyperparameters_bayes_Henon(
|
|
257
|
-
train_set: pd.DataFrame,
|
|
258
|
-
val_set: pd.DataFrame,
|
|
259
|
-
n_trials: int = 315,
|
|
260
|
-
metric: Literal["rmse", "mse", "mae"] = "rmse",
|
|
261
|
-
diff_type: Literal["perc", "abs"] = "perc",
|
|
262
|
-
):
|
|
263
|
-
def objective(trial):
|
|
264
|
-
config = {
|
|
265
|
-
"
|
|
266
|
-
"
|
|
267
|
-
), # Number of fuzzy sets
|
|
268
|
-
"number_of_lags": trial.suggest_int("n_lags", 2, 5), # Number of lags
|
|
269
|
-
"fuzzy_part_func": trial.suggest_categorical(
|
|
270
|
-
"fuzzy_part_func", ["Triangle", "Cosine", "Gaussian"]
|
|
271
|
-
),
|
|
272
|
-
}
|
|
273
|
-
|
|
274
|
-
if config["fuzzy_part_func"] == "Gaussian":
|
|
275
|
-
config["sigma"] = trial.suggest_float("sigma", 0.1, 4, log=True)
|
|
276
|
-
else:
|
|
277
|
-
config["sigma"] = None
|
|
278
|
-
|
|
279
|
-
selected_config = get_config(config)
|
|
280
|
-
|
|
281
|
-
# Use train_val_pipeline to evaluate this configuration
|
|
282
|
-
metric_value = train_val_pipeline(
|
|
283
|
-
train_set, val_set, selected_config, metric, diff_type
|
|
284
|
-
)
|
|
285
|
-
return metric_value
|
|
286
|
-
|
|
287
|
-
# Create and optimize the Optuna study
|
|
288
|
-
study = optuna.create_study(direction="minimize")
|
|
289
|
-
study.optimize(objective, n_trials=n_trials)
|
|
290
|
-
|
|
291
|
-
# Extract the best configuration and score
|
|
292
|
-
best_config = study.best_params
|
|
293
|
-
best_metric_value = study.best_value
|
|
294
|
-
|
|
295
|
-
print(f"Best Config: {best_config}")
|
|
296
|
-
print(f"Best {metric.upper()}: {best_metric_value}")
|
|
297
|
-
return best_config, best_metric_value
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
def tune_hyperparameters_grid(
|
|
301
|
-
train_set: pd.DataFrame,
|
|
302
|
-
val_set: pd.DataFrame,
|
|
303
|
-
n_trials: int = 315,
|
|
304
|
-
metric: Literal["rmse", "mse", "mae"] = "rmse",
|
|
305
|
-
diff_type: Literal["perc", "abs"] = "perc",
|
|
306
|
-
):
|
|
307
|
-
# Define grid for Gaussian fuzzy function (includes 'sigma')
|
|
308
|
-
grid_gauss = {
|
|
309
|
-
"n_lags": [1, 3, 5, 7, 9],
|
|
310
|
-
"
|
|
311
|
-
"sigma": [0.1, 0.5, 1, 5, 9],
|
|
312
|
-
"fuzzy_part_func": ["matrix_F_transform_gauss"],
|
|
313
|
-
}
|
|
314
|
-
|
|
315
|
-
# Define grid for non-Gaussian fuzzy functions (excludes 'sigma')
|
|
316
|
-
grid_non_gauss = {
|
|
317
|
-
"n_lags": [1, 3, 5, 7, 9],
|
|
318
|
-
"
|
|
319
|
-
"sigma": [None], # Set sigma to None for non-Gaussian functions
|
|
320
|
-
"fuzzy_part_func": ["matrix_F_transform_cosine", "matrix_F_transform_triangle"],
|
|
321
|
-
}
|
|
322
|
-
|
|
323
|
-
# Combine the grids
|
|
324
|
-
grid_gauss = list(ParameterGrid(grid_gauss))
|
|
325
|
-
grid_non_gauss = list(ParameterGrid(grid_non_gauss))
|
|
326
|
-
combined_grid = grid_gauss + grid_non_gauss
|
|
327
|
-
|
|
328
|
-
## Run the grid search------------------------------------------------------------------------------------------------------
|
|
329
|
-
best_metric_value = float("inf")
|
|
330
|
-
best_config = None
|
|
331
|
-
num_evaluations = 0
|
|
332
|
-
|
|
333
|
-
for config in combined_grid:
|
|
334
|
-
selected_config = get_config(config)
|
|
335
|
-
# Count the configuration being evaluated
|
|
336
|
-
num_evaluations += 1
|
|
337
|
-
|
|
338
|
-
if num_evaluations >= n_trials:
|
|
339
|
-
break
|
|
340
|
-
|
|
341
|
-
## If number of evaluation is divisible by 20 print the number of evaluations
|
|
342
|
-
if num_evaluations % 20 == 0:
|
|
343
|
-
print(f"Number of evaluations done: {num_evaluations}")
|
|
344
|
-
|
|
345
|
-
# Evaluate the config on the validation set using train_val_pipeline
|
|
346
|
-
metric_value = train_val_pipeline(
|
|
347
|
-
train_set, val_set, selected_config, metric, diff_type
|
|
348
|
-
)
|
|
349
|
-
|
|
350
|
-
# Update best config if this one is better according to the selected metric
|
|
351
|
-
if metric_value < best_metric_value:
|
|
352
|
-
best_metric_value = metric_value
|
|
353
|
-
best_config = config
|
|
354
|
-
|
|
355
|
-
return best_config, best_metric_value, num_evaluations
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
def fit_calibrate_predict(
|
|
359
|
-
train_set: pd.DataFrame,
|
|
360
|
-
test_set: pd.DataFrame,
|
|
361
|
-
config: Dict,
|
|
362
|
-
model_type: Literal["xgb", "mlp", "tpot"] = "xgb",
|
|
363
|
-
number_cv_calib=5,
|
|
364
|
-
diff_type: Literal["perc", "abs"] = "perc",
|
|
365
|
-
covariates: list[str] = None,
|
|
366
|
-
exclude_bool: bool = False,
|
|
367
|
-
) -> float:
|
|
368
|
-
"""
|
|
369
|
-
Aim of this question is to train a model on the train set, calibrate it using the calibration method provided, and predict it on the test set using the metric provided.
|
|
370
|
-
"""
|
|
371
|
-
|
|
372
|
-
config = get_config(config)
|
|
373
|
-
|
|
374
|
-
# Step 1: Validate and preprocess the input data
|
|
375
|
-
train_set = validate_and_clean_input(train_set, covariates=covariates)
|
|
376
|
-
test_set = validate_and_clean_input(test_set, covariates=covariates)
|
|
377
|
-
|
|
378
|
-
train_set_preprocessed, scaler_train = preprocess_data(
|
|
379
|
-
train_set, diff_type=diff_type
|
|
380
|
-
)
|
|
381
|
-
test_set_preprocessed = preprocess_data_val(
|
|
382
|
-
df=test_set, df_train=train_set, diff_type=diff_type, scaler=scaler_train
|
|
383
|
-
)
|
|
384
|
-
|
|
385
|
-
# Step 2: Fuzzy Partition for train, validation, and test sets
|
|
386
|
-
fp = FuzzyPartition(
|
|
387
|
-
fuzzy_function=config["fuzzy_part_func"],
|
|
388
|
-
|
|
389
|
-
sigma=config["sigma"],
|
|
390
|
-
scaler=scaler_train,
|
|
391
|
-
verbosity=config["verbosity"],
|
|
392
|
-
)
|
|
393
|
-
|
|
394
|
-
# Prepare train, validation, and test fuzzy partitions
|
|
395
|
-
X_training = train_set_preprocessed["diff_scaled"].values
|
|
396
|
-
X_test = test_set_preprocessed["diff_scaled"].values
|
|
397
|
-
|
|
398
|
-
train_fuzzy_partition, _, _ = fp.fuzzy_partition(X_training)
|
|
399
|
-
test_fuzzy_partition, _, center_points_unscaled_test = fp.fuzzy_partition(X_test)
|
|
400
|
-
|
|
401
|
-
if exclude_bool:
|
|
402
|
-
## Remove column left from train_fuzzy_partition
|
|
403
|
-
train_fuzzy_partition = train_fuzzy_partition.drop(columns=["left"])
|
|
404
|
-
test_fuzzy_partition = test_fuzzy_partition.drop(columns=["left"])
|
|
405
|
-
|
|
406
|
-
# Prepare data for model training, validation, and testing
|
|
407
|
-
X_train, y_train = prepare_for_model(
|
|
408
|
-
train_fuzzy_partition.copy(), config["number_of_lags"]
|
|
409
|
-
)
|
|
410
|
-
X_test_final, _ = prepare_for_model_val_set(
|
|
411
|
-
df_val_fp=test_fuzzy_partition.copy(),
|
|
412
|
-
df_train_fp=train_fuzzy_partition.copy(),
|
|
413
|
-
n_lags=config["number_of_lags"],
|
|
414
|
-
)
|
|
415
|
-
|
|
416
|
-
if covariates:
|
|
417
|
-
train_covariates = train_set[covariates].copy()
|
|
418
|
-
test_covariates = test_set[covariates].copy()
|
|
419
|
-
|
|
420
|
-
X_train = pd.concat([X_train, train_covariates], axis=1)
|
|
421
|
-
X_test_final = pd.concat([X_test_final, test_covariates], axis=1)
|
|
422
|
-
|
|
423
|
-
# Step 3: Train the model on the combined train and validation set
|
|
424
|
-
model = FuzzyPipelineModel(
|
|
425
|
-
|
|
426
|
-
number_of_lags=config["number_of_lags"],
|
|
427
|
-
verbosity=config["verbosity"],
|
|
428
|
-
)
|
|
429
|
-
|
|
430
|
-
model.fit(X_train, y_train, model_type=model_type)
|
|
431
|
-
|
|
432
|
-
try:
|
|
433
|
-
# Step 4: Calibrate the model using CalibratedClassifierCV
|
|
434
|
-
model.calibrate(X_train, y_train, method="sigmoid", cv=number_cv_calib)
|
|
435
|
-
except:
|
|
436
|
-
pass
|
|
437
|
-
|
|
438
|
-
# Step 5: Make predictions and evaluate on the test set
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
## Convert prediction to crips number using center points of
|
|
442
|
-
y_test_pred_center_point = [
|
|
443
|
-
center_points_unscaled_test[i] for i in
|
|
444
|
-
]
|
|
445
|
-
|
|
446
|
-
## Recalculate percentage difference to actual values
|
|
447
|
-
y_test_pred = [None] * len(test_set)
|
|
448
|
-
|
|
449
|
-
# Set the first prediction using the last known value from the train set
|
|
450
|
-
last_train_value = train_set["Y"].iloc[
|
|
451
|
-
-1
|
|
452
|
-
] # Assuming `df_train` holds the training data
|
|
453
|
-
y_test_pred[0] = last_train_value * (1 + y_test_pred_center_point[0])
|
|
454
|
-
|
|
455
|
-
if diff_type == "perc":
|
|
456
|
-
# Loop to calculate each subsequent prediction based on the actual previous value in `df_test['Y']`
|
|
457
|
-
for i in range(1, len(test_set)):
|
|
458
|
-
prev_Y = test_set["Y"].iloc[
|
|
459
|
-
i - 1
|
|
460
|
-
] # Use the previous actual value from `df_test`
|
|
461
|
-
perc_change = y_test_pred_center_point[i]
|
|
462
|
-
y_test_pred[i] = prev_Y * (1 + perc_change)
|
|
463
|
-
|
|
464
|
-
elif diff_type == "abs":
|
|
465
|
-
for i in range(1, len(test_set)):
|
|
466
|
-
prev_Y = test_set["Y"].iloc[i - 1]
|
|
467
|
-
y_test_pred[i] = prev_Y + y_test_pred_center_point[i]
|
|
468
|
-
|
|
469
|
-
return
|
|
1
|
+
# pipeline.py
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from typing import Dict, Literal
|
|
4
|
+
import optuna
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
from autofuzzts.config import get_config
|
|
8
|
+
from autofuzzts.data import data_loader
|
|
9
|
+
from autofuzzts.data_validation.validate import validate_and_clean_input
|
|
10
|
+
from autofuzzts.partition.partition import FuzzyPartition
|
|
11
|
+
from autofuzzts.preprocess.preprocess import preprocess_data, preprocess_data_val
|
|
12
|
+
from autofuzzts.preprocess.prep_for_model import (
|
|
13
|
+
prepare_for_model,
|
|
14
|
+
prepare_for_model_val_set,
|
|
15
|
+
)
|
|
16
|
+
from autofuzzts.models.fuzzy_classifier import FuzzyPipelineModel
|
|
17
|
+
|
|
18
|
+
from sklearn.model_selection import ParameterGrid
|
|
19
|
+
from sklearn.calibration import CalibratedClassifierCV
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
## Import RMSE and MAE
|
|
23
|
+
from sklearn.metrics import (
|
|
24
|
+
root_mean_squared_error,
|
|
25
|
+
mean_absolute_error,
|
|
26
|
+
mean_squared_error,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
# Example custom configuration
|
|
30
|
+
custom_config = {
|
|
31
|
+
"n_fuzzy_sets": 5,
|
|
32
|
+
"verbosity": True,
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
# Retrieve the final configuration
|
|
36
|
+
selected_config = get_config(custom_config)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def run_pipeline(datasetet_name: str, config: dict = selected_config):
|
|
40
|
+
# Load data
|
|
41
|
+
|
|
42
|
+
data = data_loader.load_sample_data(datasetet_name)
|
|
43
|
+
print(data.head(5))
|
|
44
|
+
print("Evaluated configuration is")
|
|
45
|
+
print(config)
|
|
46
|
+
|
|
47
|
+
pass
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def train_val_pipeline(
|
|
51
|
+
train_set: pd.DataFrame,
|
|
52
|
+
val_set: pd.DataFrame,
|
|
53
|
+
config: Dict = selected_config,
|
|
54
|
+
metric: Literal["rmse", "mse", "mae"] = "rmse",
|
|
55
|
+
diff_type: Literal["perc", "abs"] = "perc",
|
|
56
|
+
covariates: list[str] = None,
|
|
57
|
+
) -> float:
|
|
58
|
+
train_set = validate_and_clean_input(train_set, covariates)
|
|
59
|
+
val_set = validate_and_clean_input(val_set, covariates)
|
|
60
|
+
|
|
61
|
+
print("train set length:", len(train_set))
|
|
62
|
+
|
|
63
|
+
if covariates:
|
|
64
|
+
train_covariates = train_set[covariates].copy()
|
|
65
|
+
val_covariates = val_set[covariates].copy()
|
|
66
|
+
|
|
67
|
+
train_set_preprocessed, scaler_train = preprocess_data(train_set, diff_type)
|
|
68
|
+
val_set_preprocessed = preprocess_data_val(
|
|
69
|
+
df=val_set, df_train=train_set, diff_type=diff_type, scaler=scaler_train
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
fp = FuzzyPartition(
|
|
73
|
+
fuzzy_function=config["fuzzy_part_func"],
|
|
74
|
+
n_fuzzy_sets=config["n_fuzzy_sets"],
|
|
75
|
+
sigma=config["sigma"],
|
|
76
|
+
scaler=scaler_train,
|
|
77
|
+
verbosity=config["verbosity"],
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
# Prepare train and validation fuzzy partitions
|
|
81
|
+
X_training = train_set_preprocessed["diff_scaled"].values
|
|
82
|
+
X_validation = val_set_preprocessed["diff_scaled"].values
|
|
83
|
+
|
|
84
|
+
train_fuzzy_partition, _, _ = fp.fuzzy_partition(X_training)
|
|
85
|
+
val_fuzzy_partition, _, center_points_unscaled_test_val = fp.fuzzy_partition(
|
|
86
|
+
X_validation
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
X_train, y_train = prepare_for_model(
|
|
90
|
+
train_fuzzy_partition.copy(), config["number_of_lags"]
|
|
91
|
+
)
|
|
92
|
+
X_val, y_val = prepare_for_model_val_set(
|
|
93
|
+
df_val_fp=val_fuzzy_partition.copy(),
|
|
94
|
+
df_train_fp=train_fuzzy_partition.copy(),
|
|
95
|
+
n_lags=config["number_of_lags"],
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
if covariates:
|
|
99
|
+
X_train = pd.concat([X_train, train_covariates], axis=1)
|
|
100
|
+
X_val = pd.concat([X_val, val_covariates], axis=1)
|
|
101
|
+
|
|
102
|
+
model = FuzzyPipelineModel(
|
|
103
|
+
n_fuzzy_sets=config["n_fuzzy_sets"],
|
|
104
|
+
number_of_lags=config["number_of_lags"],
|
|
105
|
+
verbosity=config["verbosity"],
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
model.fit(X_train, y_train, model_type="xgb")
|
|
109
|
+
|
|
110
|
+
pred_fuzzy_set = model.predict(X_val)
|
|
111
|
+
|
|
112
|
+
## Convert prediction to crips number using center points of fuzzy sets
|
|
113
|
+
y_val_pred_center_point = [center_points_unscaled_test_val[i] for i in pred_fuzzy_set]
|
|
114
|
+
|
|
115
|
+
## Recalculate percentage difference to actual values
|
|
116
|
+
y_val_pred = [None] * len(val_set)
|
|
117
|
+
|
|
118
|
+
# Set the first prediction using the last known value from the train set
|
|
119
|
+
last_train_value = train_set["Y"].iloc[
|
|
120
|
+
-1
|
|
121
|
+
] # Assuming `df_train` holds the training data
|
|
122
|
+
y_val_pred[0] = last_train_value * (1 + y_val_pred_center_point[0])
|
|
123
|
+
|
|
124
|
+
# Loop to calculate each subsequent prediction based on the actual previous value in `df_test['Y']`
|
|
125
|
+
|
|
126
|
+
if diff_type == "perc":
|
|
127
|
+
for i in range(1, len(val_set)):
|
|
128
|
+
prev_Y = val_set["Y"].iloc[
|
|
129
|
+
i - 1
|
|
130
|
+
] # Use the previous actual value from `df_test`
|
|
131
|
+
perc_change = y_val_pred_center_point[i]
|
|
132
|
+
y_val_pred[i] = prev_Y * (1 + perc_change)
|
|
133
|
+
|
|
134
|
+
elif diff_type == "abs":
|
|
135
|
+
for i in range(1, len(val_set)):
|
|
136
|
+
prev_Y = val_set["Y"].iloc[i - 1]
|
|
137
|
+
y_val_pred[i] = prev_Y + y_val_pred_center_point[i]
|
|
138
|
+
|
|
139
|
+
if metric == "rmse":
|
|
140
|
+
metric_value = root_mean_squared_error(val_set["Y"], y_val_pred)
|
|
141
|
+
elif metric == "mse":
|
|
142
|
+
metric_value = root_mean_squared_error(val_set["Y"], y_val_pred) ** 2
|
|
143
|
+
elif metric == "mae":
|
|
144
|
+
metric_value = mean_absolute_error(val_set["Y"], y_val_pred)
|
|
145
|
+
else:
|
|
146
|
+
raise ValueError(
|
|
147
|
+
f"Invalid metric {metric}. Please choose one of 'rmse', 'mse', 'mae'"
|
|
148
|
+
)
|
|
149
|
+
|
|
150
|
+
return metric_value
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def train_model(
|
|
154
|
+
dataset: pd.DataFrame,
|
|
155
|
+
config: Dict,
|
|
156
|
+
model_type: Literal["xgb", "mlp", "tpot"] = "xgb",
|
|
157
|
+
):
|
|
158
|
+
"""
|
|
159
|
+
Function to train a model on the dataset provided.
|
|
160
|
+
|
|
161
|
+
Parameters:
|
|
162
|
+
dataset: pd.DataFrame
|
|
163
|
+
The dataset to train the model on.
|
|
164
|
+
config: dict
|
|
165
|
+
The configuration dictionary for the model.
|
|
166
|
+
model_type: str
|
|
167
|
+
The type of model to train. Default is 'xgb'.
|
|
168
|
+
|
|
169
|
+
"""
|
|
170
|
+
config = get_config(config)
|
|
171
|
+
|
|
172
|
+
df = validate_and_clean_input(dataset)
|
|
173
|
+
|
|
174
|
+
df_preprocessed, scaler_train = preprocess_data(df, diff_type="perc")
|
|
175
|
+
|
|
176
|
+
fp = FuzzyPartition(
|
|
177
|
+
fuzzy_function=config["fuzzy_part_func"],
|
|
178
|
+
n_fuzzy_sets=config["n_fuzzy_sets"],
|
|
179
|
+
sigma=config["sigma"],
|
|
180
|
+
scaler=scaler_train,
|
|
181
|
+
verbosity=config["verbosity"],
|
|
182
|
+
)
|
|
183
|
+
|
|
184
|
+
X_training = df_preprocessed["diff_scaled"].values
|
|
185
|
+
|
|
186
|
+
train_fuzzy_partition, _, _ = fp.fuzzy_partition(X_training)
|
|
187
|
+
|
|
188
|
+
X_train, y_train = prepare_for_model(
|
|
189
|
+
train_fuzzy_partition.copy(), config["number_of_lags"]
|
|
190
|
+
)
|
|
191
|
+
|
|
192
|
+
model_train = FuzzyPipelineModel(
|
|
193
|
+
n_fuzzy_sets=config["n_fuzzy_sets"],
|
|
194
|
+
number_of_lags=config["number_of_lags"],
|
|
195
|
+
verbosity=config["verbosity"],
|
|
196
|
+
)
|
|
197
|
+
|
|
198
|
+
model_train.fit(X_train, y_train, model_type=model_type)
|
|
199
|
+
|
|
200
|
+
return model_train, scaler_train
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def tune_hyperparameters_bayes(
|
|
204
|
+
train_set: pd.DataFrame,
|
|
205
|
+
val_set: pd.DataFrame,
|
|
206
|
+
n_trials: int = 315,
|
|
207
|
+
metric: Literal["rmse", "mse", "mae"] = "rmse",
|
|
208
|
+
diff_type: Literal["perc", "abs"] = "perc",
|
|
209
|
+
covariates: list[str] = None,
|
|
210
|
+
):
|
|
211
|
+
def objective(trial):
|
|
212
|
+
# Define search space based on your specifications
|
|
213
|
+
config = {
|
|
214
|
+
"n_fuzzy_sets": trial.suggest_int(
|
|
215
|
+
"n_fuzzy_sets", 4, 40
|
|
216
|
+
), # Number of fuzzy sets
|
|
217
|
+
"number_of_lags": trial.suggest_int(
|
|
218
|
+
"number_of_lags", 1, 10
|
|
219
|
+
), # Number of lags
|
|
220
|
+
"fuzzy_part_func": trial.suggest_categorical(
|
|
221
|
+
"fuzzy_part_func", ["Triangle", "Cosine", "Gaussian"]
|
|
222
|
+
), # Partition function type
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
if config["fuzzy_part_func"] == "Gaussian":
|
|
226
|
+
config["sigma"] = trial.suggest_float("sigma", 0.1, 4, log=True)
|
|
227
|
+
else:
|
|
228
|
+
config["sigma"] = None
|
|
229
|
+
|
|
230
|
+
selected_config = get_config(config)
|
|
231
|
+
|
|
232
|
+
# Use train_val_pipeline to evaluate this configuration
|
|
233
|
+
metric_value = train_val_pipeline(
|
|
234
|
+
train_set,
|
|
235
|
+
val_set,
|
|
236
|
+
selected_config,
|
|
237
|
+
metric,
|
|
238
|
+
diff_type,
|
|
239
|
+
covariates=covariates,
|
|
240
|
+
)
|
|
241
|
+
return metric_value
|
|
242
|
+
|
|
243
|
+
# Create and optimize the Optuna study
|
|
244
|
+
study = optuna.create_study(direction="minimize")
|
|
245
|
+
study.optimize(objective, n_trials=n_trials)
|
|
246
|
+
|
|
247
|
+
# Extract the best configuration and score
|
|
248
|
+
best_config = study.best_params
|
|
249
|
+
best_metric_value = study.best_value
|
|
250
|
+
|
|
251
|
+
print(f"Best Config: {best_config}")
|
|
252
|
+
print(f"Best {metric.upper()}: {best_metric_value}")
|
|
253
|
+
return best_config, best_metric_value
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def tune_hyperparameters_bayes_Henon(
|
|
257
|
+
train_set: pd.DataFrame,
|
|
258
|
+
val_set: pd.DataFrame,
|
|
259
|
+
n_trials: int = 315,
|
|
260
|
+
metric: Literal["rmse", "mse", "mae"] = "rmse",
|
|
261
|
+
diff_type: Literal["perc", "abs"] = "perc",
|
|
262
|
+
):
|
|
263
|
+
def objective(trial):
|
|
264
|
+
config = {
|
|
265
|
+
"n_fuzzy_sets": trial.suggest_int(
|
|
266
|
+
"n_fuzzy_sets", 2, 29
|
|
267
|
+
), # Number of fuzzy sets
|
|
268
|
+
"number_of_lags": trial.suggest_int("n_lags", 2, 5), # Number of lags
|
|
269
|
+
"fuzzy_part_func": trial.suggest_categorical(
|
|
270
|
+
"fuzzy_part_func", ["Triangle", "Cosine", "Gaussian"]
|
|
271
|
+
),
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
if config["fuzzy_part_func"] == "Gaussian":
|
|
275
|
+
config["sigma"] = trial.suggest_float("sigma", 0.1, 4, log=True)
|
|
276
|
+
else:
|
|
277
|
+
config["sigma"] = None
|
|
278
|
+
|
|
279
|
+
selected_config = get_config(config)
|
|
280
|
+
|
|
281
|
+
# Use train_val_pipeline to evaluate this configuration
|
|
282
|
+
metric_value = train_val_pipeline(
|
|
283
|
+
train_set, val_set, selected_config, metric, diff_type
|
|
284
|
+
)
|
|
285
|
+
return metric_value
|
|
286
|
+
|
|
287
|
+
# Create and optimize the Optuna study
|
|
288
|
+
study = optuna.create_study(direction="minimize")
|
|
289
|
+
study.optimize(objective, n_trials=n_trials)
|
|
290
|
+
|
|
291
|
+
# Extract the best configuration and score
|
|
292
|
+
best_config = study.best_params
|
|
293
|
+
best_metric_value = study.best_value
|
|
294
|
+
|
|
295
|
+
print(f"Best Config: {best_config}")
|
|
296
|
+
print(f"Best {metric.upper()}: {best_metric_value}")
|
|
297
|
+
return best_config, best_metric_value
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def tune_hyperparameters_grid(
    train_set: pd.DataFrame,
    val_set: pd.DataFrame,
    n_trials: int = 315,
    metric: Literal["rmse", "mse", "mae"] = "rmse",
    diff_type: Literal["perc", "abs"] = "perc",
):
    """
    Exhaustive grid search over fuzzy-partition hyperparameters.

    Builds one grid for the Gaussian membership function (which needs a
    ``sigma``) and one for the non-Gaussian functions, evaluates at most
    ``n_trials`` configurations with ``train_val_pipeline``, and keeps the
    configuration with the lowest metric value.

    Parameters
    ----------
    train_set : pd.DataFrame
        Training data passed through to ``train_val_pipeline``.
    val_set : pd.DataFrame
        Validation data used to score each configuration.
    n_trials : int
        Maximum number of configurations to evaluate.
    metric : {"rmse", "mse", "mae"}
        Error metric to minimize.
    diff_type : {"perc", "abs"}
        Differencing mode forwarded to the evaluation pipeline.

    Returns
    -------
    tuple
        ``(best_config, best_metric_value, num_evaluations)`` where
        ``num_evaluations`` is the number of configurations actually scored.
    """
    # Grid for the Gaussian fuzzy function (includes 'sigma')
    grid_gauss = {
        "n_lags": [1, 3, 5, 7, 9],
        "n_fuzzy_sets": [4, 6, 8, 10, 12, 14, 16, 18, 20],
        "sigma": [0.1, 0.5, 1, 5, 9],
        "fuzzy_part_func": ["matrix_F_transform_gauss"],
    }

    # Grid for non-Gaussian fuzzy functions ('sigma' is not applicable)
    grid_non_gauss = {
        "n_lags": [1, 3, 5, 7, 9],
        "n_fuzzy_sets": [4, 6, 8, 10, 12, 14, 16, 18, 20],
        "sigma": [None],  # Set sigma to None for non-Gaussian functions
        "fuzzy_part_func": ["matrix_F_transform_cosine", "matrix_F_transform_triangle"],
    }

    # Expand and combine both grids into one list of candidate configs
    combined_grid = list(ParameterGrid(grid_gauss)) + list(
        ParameterGrid(grid_non_gauss)
    )

    ## Run the grid search------------------------------------------------------------------------------------------------------
    best_metric_value = float("inf")
    best_config = None
    num_evaluations = 0

    for config in combined_grid:
        # BUGFIX: the budget check previously ran *after* incrementing the
        # counter, so the loop broke before scoring the n_trials-th config —
        # only n_trials - 1 evaluations happened while the returned count
        # claimed n_trials. Check the budget first, then count.
        if num_evaluations >= n_trials:
            break
        num_evaluations += 1

        ## Periodic progress report
        if num_evaluations % 20 == 0:
            print(f"Number of evaluations done: {num_evaluations}")

        # Evaluate the config on the validation set using train_val_pipeline
        selected_config = get_config(config)
        metric_value = train_val_pipeline(
            train_set, val_set, selected_config, metric, diff_type
        )

        # Update best config if this one is better according to the selected metric
        if metric_value < best_metric_value:
            best_metric_value = metric_value
            best_config = config

    return best_config, best_metric_value, num_evaluations
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
def fit_calibrate_predict(
    train_set: pd.DataFrame,
    test_set: pd.DataFrame,
    config: Dict,
    model_type: Literal["xgb", "mlp", "tpot"] = "xgb",
    number_cv_calib=5,
    diff_type: Literal["perc", "abs"] = "perc",
    covariates: list[str] = None,
    exclude_bool: bool = False,
) -> float:
    """
    Train a model on the train set, calibrate it, and predict on the test set.

    Pipeline: validate/clean both sets, difference and scale them, build the
    fuzzy partitions, construct lagged feature matrices, fit (and best-effort
    calibrate) a ``FuzzyPipelineModel``, then map predicted fuzzy-set labels
    back to crisp values on the original scale.

    Parameters
    ----------
    train_set : pd.DataFrame
        Training data; must contain a 'Y' column.
    test_set : pd.DataFrame
        Test data; must contain a 'Y' column.
    config : Dict
        Hyperparameter configuration, normalized via ``get_config``.
    model_type : {"xgb", "mlp", "tpot"}
        Underlying classifier fitted by ``FuzzyPipelineModel``.
    number_cv_calib : int
        CV folds used by the sigmoid probability calibration.
    diff_type : {"perc", "abs"}
        Whether the series is modeled via percentage or absolute differences.
    covariates : list[str], optional
        Extra feature columns appended to the lagged fuzzy features.
    exclude_bool : bool
        If True, drop the "left" boundary column from both fuzzy partitions.

    Returns
    -------
    tuple
        ``(y_test_pred_fuzzy_set, y_test_pred_center_point, y_test_pred)``.
        NOTE(review): the annotated ``-> float`` does not match the actual
        three-element return; kept unchanged for interface compatibility.
    """
    config = get_config(config)

    # Step 1: Validate and preprocess the input data
    train_set = validate_and_clean_input(train_set, covariates=covariates)
    test_set = validate_and_clean_input(test_set, covariates=covariates)

    train_set_preprocessed, scaler_train = preprocess_data(
        train_set, diff_type=diff_type
    )
    test_set_preprocessed = preprocess_data_val(
        df=test_set, df_train=train_set, diff_type=diff_type, scaler=scaler_train
    )

    # Step 2: Fuzzy partition of the scaled, differenced series
    fp = FuzzyPartition(
        fuzzy_function=config["fuzzy_part_func"],
        n_fuzzy_sets=config["n_fuzzy_sets"],
        sigma=config["sigma"],
        scaler=scaler_train,
        verbosity=config["verbosity"],
    )

    X_training = train_set_preprocessed["diff_scaled"].values
    X_test = test_set_preprocessed["diff_scaled"].values

    train_fuzzy_partition, _, _ = fp.fuzzy_partition(X_training)
    test_fuzzy_partition, _, center_points_unscaled_test = fp.fuzzy_partition(X_test)

    if exclude_bool:
        ## Remove the "left" boundary fuzzy set from both partitions
        train_fuzzy_partition = train_fuzzy_partition.drop(columns=["left"])
        test_fuzzy_partition = test_fuzzy_partition.drop(columns=["left"])

    # Build lagged design matrices for model training and testing
    X_train, y_train = prepare_for_model(
        train_fuzzy_partition.copy(), config["number_of_lags"]
    )
    X_test_final, _ = prepare_for_model_val_set(
        df_val_fp=test_fuzzy_partition.copy(),
        df_train_fp=train_fuzzy_partition.copy(),
        n_lags=config["number_of_lags"],
    )

    if covariates:
        # Append the raw covariate columns alongside the fuzzy lag features
        X_train = pd.concat([X_train, train_set[covariates].copy()], axis=1)
        X_test_final = pd.concat([X_test_final, test_set[covariates].copy()], axis=1)

    # Step 3: Train the model
    model = FuzzyPipelineModel(
        n_fuzzy_sets=config["n_fuzzy_sets"],
        number_of_lags=config["number_of_lags"],
        verbosity=config["verbosity"],
    )
    model.fit(X_train, y_train, model_type=model_type)

    # Step 4: Calibration is best-effort — some model types / splits cannot
    # be calibrated; fall back to the uncalibrated model in that case.
    try:
        model.calibrate(X_train, y_train, method="sigmoid", cv=number_cv_calib)
    except Exception:  # BUGFIX: was a bare `except:` that also caught SystemExit/KeyboardInterrupt
        pass

    # Step 5: Predict fuzzy-set labels on the test set
    y_test_pred_fuzzy_set = model.predict(X_test_final)

    ## Convert predicted labels to crisp numbers via the fuzzy-set centers
    y_test_pred_center_point = [
        center_points_unscaled_test[i] for i in y_test_pred_fuzzy_set
    ]

    ## Reconstruct predictions on the original scale from the differences
    y_test_pred = [None] * len(test_set)

    # BUGFIX: guard against an empty test set (previously raised IndexError),
    # and make the first prediction respect diff_type — it previously used
    # the percentage formula even when diff_type == "abs".
    if len(test_set) > 0:
        # Seed the first prediction from the last known training value
        last_train_value = train_set["Y"].iloc[-1]

        if diff_type == "perc":
            y_test_pred[0] = last_train_value * (1 + y_test_pred_center_point[0])
            # Each subsequent prediction uses the previous *actual* test value
            for i in range(1, len(test_set)):
                prev_Y = test_set["Y"].iloc[i - 1]
                y_test_pred[i] = prev_Y * (1 + y_test_pred_center_point[i])

        elif diff_type == "abs":
            y_test_pred[0] = last_train_value + y_test_pred_center_point[0]
            for i in range(1, len(test_set)):
                prev_Y = test_set["Y"].iloc[i - 1]
                y_test_pred[i] = prev_Y + y_test_pred_center_point[i]

    return y_test_pred_fuzzy_set, y_test_pred_center_point, y_test_pred
|