autofuzzts 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
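Within pipeline.py, the change between 0.1.1 and 0.1.3 shown below is a rename of the fuzzy-partition size parameter from n_clusters to n_fuzzy_sets (with matching renames of the predicted-cluster variables to predicted fuzzy sets). A minimal caller-side migration sketch, assuming get_config merges a partial dictionary with defaults the way the module itself uses it; the variable names here are illustrative:

# Hypothetical migration sketch; the configuration keys are taken from the diff below.
from autofuzzts.config import get_config

config_0_1_1 = get_config({"n_clusters": 5, "verbosity": True})    # 0.1.1 key
config_0_1_3 = get_config({"n_fuzzy_sets": 5, "verbosity": True})  # 0.1.3 key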
autofuzzts/pipeline.py CHANGED
@@ -1,469 +1,469 @@
1
- # pipeline.py
2
- import pandas as pd
3
- from typing import Dict, Literal
4
- import optuna
5
- import numpy as np
6
-
7
- from autofuzzts.config import get_config
8
- from autofuzzts.data import data_loader
9
- from autofuzzts.data_validation.validate import validate_and_clean_input
10
- from autofuzzts.partition.partition import FuzzyPartition
11
- from autofuzzts.preprocess.preprocess import preprocess_data, preprocess_data_val
12
- from autofuzzts.preprocess.prep_for_model import (
13
- prepare_for_model,
14
- prepare_for_model_val_set,
15
- )
16
- from autofuzzts.models.fuzzy_classifier import FuzzyPipelineModel
17
-
18
- from sklearn.model_selection import ParameterGrid
19
- from sklearn.calibration import CalibratedClassifierCV
20
-
21
-
22
- ## Import error metrics (RMSE, MAE, MSE)
23
- from sklearn.metrics import (
24
- root_mean_squared_error,
25
- mean_absolute_error,
26
- mean_squared_error,
27
- )
28
-
29
- # Example custom configuration
30
- custom_config = {
31
- "n_clusters": 5,
32
- "verbosity": True,
33
- }
34
-
35
- # Retrieve the final configuration
36
- selected_config = get_config(custom_config)
37
-
38
-
39
- def run_pipeline(dataset_name: str, config: dict = selected_config):
40
- # Load data
41
-
42
- data = data_loader.load_sample_data(dataset_name)
43
- print(data.head(5))
44
- print("Evaluated configuration is")
45
- print(config)
46
-
47
- pass
48
-
49
-
50
- def train_val_pipeline(
51
- train_set: pd.DataFrame,
52
- val_set: pd.DataFrame,
53
- config: Dict = selected_config,
54
- metric: Literal["rmse", "mse", "mae"] = "rmse",
55
- diff_type: Literal["perc", "abs"] = "perc",
56
- covariates: list[str] = None,
57
- ) -> float:
58
- train_set = validate_and_clean_input(train_set, covariates)
59
- val_set = validate_and_clean_input(val_set, covariates)
60
-
61
- print("train set length:", len(train_set))
62
-
63
- if covariates:
64
- train_covariates = train_set[covariates].copy()
65
- val_covariates = val_set[covariates].copy()
66
-
67
- train_set_preprocessed, scaler_train = preprocess_data(train_set, diff_type)
68
- val_set_preprocessed = preprocess_data_val(
69
- df=val_set, df_train=train_set, diff_type=diff_type, scaler=scaler_train
70
- )
71
-
72
- fp = FuzzyPartition(
73
- fuzzy_function=config["fuzzy_part_func"],
74
- n_clusters=config["n_clusters"],
75
- sigma=config["sigma"],
76
- scaler=scaler_train,
77
- verbosity=config["verbosity"],
78
- )
79
-
80
- # Prepare train and validation fuzzy partitions
81
- X_training = train_set_preprocessed["diff_scaled"].values
82
- X_validation = val_set_preprocessed["diff_scaled"].values
83
-
84
- train_fuzzy_partition, _, _ = fp.fuzzy_partition(X_training)
85
- val_fuzzy_partition, _, center_points_unscaled_test_val = fp.fuzzy_partition(
86
- X_validation
87
- )
88
-
89
- X_train, y_train = prepare_for_model(
90
- train_fuzzy_partition.copy(), config["number_of_lags"]
91
- )
92
- X_val, y_val = prepare_for_model_val_set(
93
- df_val_fp=val_fuzzy_partition.copy(),
94
- df_train_fp=train_fuzzy_partition.copy(),
95
- n_lags=config["number_of_lags"],
96
- )
97
-
98
- if covariates:
99
- X_train = pd.concat([X_train, train_covariates], axis=1)
100
- X_val = pd.concat([X_val, val_covariates], axis=1)
101
-
102
- model = FuzzyPipelineModel(
103
- n_clusters=config["n_clusters"],
104
- number_of_lags=config["number_of_lags"],
105
- verbosity=config["verbosity"],
106
- )
107
-
108
- model.fit(X_train, y_train, model_type="xgb")
109
-
110
- pred_cluster = model.predict(X_val)
111
-
112
- ## Convert prediction to a crisp number using the cluster center points
113
- y_val_pred_center_point = [center_points_unscaled_test_val[i] for i in pred_cluster]
114
-
115
- ## Recalculate percentage difference to actual values
116
- y_val_pred = [None] * len(val_set)
117
-
118
- # Set the first prediction using the last known value from the train set
119
- last_train_value = train_set["Y"].iloc[
120
- -1
121
- ] # last known value from the training set
122
- y_val_pred[0] = last_train_value * (1 + y_val_pred_center_point[0]) if diff_type == "perc" else last_train_value + y_val_pred_center_point[0]
123
-
124
- # Calculate each subsequent prediction from the previous actual value in val_set["Y"]
125
-
126
- if diff_type == "perc":
127
- for i in range(1, len(val_set)):
128
- prev_Y = val_set["Y"].iloc[
129
- i - 1
130
- ] # previous actual value from val_set
131
- perc_change = y_val_pred_center_point[i]
132
- y_val_pred[i] = prev_Y * (1 + perc_change)
133
-
134
- elif diff_type == "abs":
135
- for i in range(1, len(val_set)):
136
- prev_Y = val_set["Y"].iloc[i - 1]
137
- y_val_pred[i] = prev_Y + y_val_pred_center_point[i]
138
-
139
- if metric == "rmse":
140
- metric_value = root_mean_squared_error(val_set["Y"], y_val_pred)
141
- elif metric == "mse":
142
- metric_value = mean_squared_error(val_set["Y"], y_val_pred)
143
- elif metric == "mae":
144
- metric_value = mean_absolute_error(val_set["Y"], y_val_pred)
145
- else:
146
- raise ValueError(
147
- f"Invalid metric {metric}. Please choose one of 'rmse', 'mse', 'mae'"
148
- )
149
-
150
- return metric_value
151
-
152
-
153
- def train_model(
154
- dataset: pd.DataFrame,
155
- config: Dict,
156
- model_type: Literal["xgb", "mlp", "tpot"] = "xgb",
157
- ):
158
- """
159
- Function to train a model on the dataset provided.
160
-
161
- Parameters:
162
- dataset: pd.DataFrame
163
- The dataset to train the model on.
164
- config: dict
165
- The configuration dictionary for the model.
166
- model_type: str
167
- The type of model to train. Default is 'xgb'.
168
-
169
- """
170
- config = get_config(config)
171
-
172
- df = validate_and_clean_input(dataset)
173
-
174
- df_preprocessed, scaler_train = preprocess_data(df, diff_type="perc")
175
-
176
- fp = FuzzyPartition(
177
- fuzzy_function=config["fuzzy_part_func"],
178
- n_clusters=config["n_clusters"],
179
- sigma=config["sigma"],
180
- scaler=scaler_train,
181
- verbosity=config["verbosity"],
182
- )
183
-
184
- X_training = df_preprocessed["diff_scaled"].values
185
-
186
- train_fuzzy_partition, _, _ = fp.fuzzy_partition(X_training)
187
-
188
- X_train, y_train = prepare_for_model(
189
- train_fuzzy_partition.copy(), config["number_of_lags"]
190
- )
191
-
192
- model_train = FuzzyPipelineModel(
193
- n_clusters=config["n_clusters"],
194
- number_of_lags=config["number_of_lags"],
195
- verbosity=config["verbosity"],
196
- )
197
-
198
- model_train.fit(X_train, y_train, model_type=model_type)
199
-
200
- return model_train, scaler_train
201
-
202
-
203
- def tune_hyperparameters_bayes(
204
- train_set: pd.DataFrame,
205
- val_set: pd.DataFrame,
206
- n_trials: int = 315,
207
- metric: Literal["rmse", "mse", "mae"] = "rmse",
208
- diff_type: Literal["perc", "abs"] = "perc",
209
- covariates: list[str] = None,
210
- ):
211
- def objective(trial):
212
- # Define search space based on your specifications
213
- config = {
214
- "n_clusters": trial.suggest_int(
215
- "n_clusters", 4, 40
216
- ), # Number of fuzzy sets
217
- "number_of_lags": trial.suggest_int(
218
- "number_of_lags", 1, 10
219
- ), # Number of lags
220
- "fuzzy_part_func": trial.suggest_categorical(
221
- "fuzzy_part_func", ["Triangle", "Cosine", "Gaussian"]
222
- ), # Partition function type
223
- }
224
-
225
- if config["fuzzy_part_func"] == "Gaussian":
226
- config["sigma"] = trial.suggest_float("sigma", 0.1, 4, log=True)
227
- else:
228
- config["sigma"] = None
229
-
230
- selected_config = get_config(config)
231
-
232
- # Use train_val_pipeline to evaluate this configuration
233
- metric_value = train_val_pipeline(
234
- train_set,
235
- val_set,
236
- selected_config,
237
- metric,
238
- diff_type,
239
- covariates=covariates,
240
- )
241
- return metric_value
242
-
243
- # Create and optimize the Optuna study
244
- study = optuna.create_study(direction="minimize")
245
- study.optimize(objective, n_trials=n_trials)
246
-
247
- # Extract the best configuration and score
248
- best_config = study.best_params
249
- best_metric_value = study.best_value
250
-
251
- print(f"Best Config: {best_config}")
252
- print(f"Best {metric.upper()}: {best_metric_value}")
253
- return best_config, best_metric_value
254
-
255
-
256
- def tune_hyperparameters_bayes_Henon(
257
- train_set: pd.DataFrame,
258
- val_set: pd.DataFrame,
259
- n_trials: int = 315,
260
- metric: Literal["rmse", "mse", "mae"] = "rmse",
261
- diff_type: Literal["perc", "abs"] = "perc",
262
- ):
263
- def objective(trial):
264
- config = {
265
- "n_clusters": trial.suggest_int(
266
- "n_clusters", 2, 29
267
- ), # Number of fuzzy sets
268
- "number_of_lags": trial.suggest_int("n_lags", 2, 5), # Number of lags
269
- "fuzzy_part_func": trial.suggest_categorical(
270
- "fuzzy_part_func", ["Triangle", "Cosine", "Gaussian"]
271
- ),
272
- }
273
-
274
- if config["fuzzy_part_func"] == "Gaussian":
275
- config["sigma"] = trial.suggest_float("sigma", 0.1, 4, log=True)
276
- else:
277
- config["sigma"] = None
278
-
279
- selected_config = get_config(config)
280
-
281
- # Use train_val_pipeline to evaluate this configuration
282
- metric_value = train_val_pipeline(
283
- train_set, val_set, selected_config, metric, diff_type
284
- )
285
- return metric_value
286
-
287
- # Create and optimize the Optuna study
288
- study = optuna.create_study(direction="minimize")
289
- study.optimize(objective, n_trials=n_trials)
290
-
291
- # Extract the best configuration and score
292
- best_config = study.best_params
293
- best_metric_value = study.best_value
294
-
295
- print(f"Best Config: {best_config}")
296
- print(f"Best {metric.upper()}: {best_metric_value}")
297
- return best_config, best_metric_value
298
-
299
-
300
- def tune_hyperparameters_grid(
301
- train_set: pd.DataFrame,
302
- val_set: pd.DataFrame,
303
- n_trials: int = 315,
304
- metric: Literal["rmse", "mse", "mae"] = "rmse",
305
- diff_type: Literal["perc", "abs"] = "perc",
306
- ):
307
- # Define grid for Gaussian fuzzy function (includes 'sigma')
308
- grid_gauss = {
309
- "n_lags": [1, 3, 5, 7, 9],
310
- "n_clusters": [4, 6, 8, 10, 12, 14, 16, 18, 20],
311
- "sigma": [0.1, 0.5, 1, 5, 9],
312
- "fuzzy_part_func": ["matrix_F_transform_gauss"],
313
- }
314
-
315
- # Define grid for non-Gaussian fuzzy functions (excludes 'sigma')
316
- grid_non_gauss = {
317
- "n_lags": [1, 3, 5, 7, 9],
318
- "n_clusters": [4, 6, 8, 10, 12, 14, 16, 18, 20],
319
- "sigma": [None], # Set sigma to None for non-Gaussian functions
320
- "fuzzy_part_func": ["matrix_F_transform_cosine", "matrix_F_transform_triangle"],
321
- }
322
-
323
- # Combine the grids
324
- grid_gauss = list(ParameterGrid(grid_gauss))
325
- grid_non_gauss = list(ParameterGrid(grid_non_gauss))
326
- combined_grid = grid_gauss + grid_non_gauss
327
-
328
- ## Run the grid search------------------------------------------------------------------------------------------------------
329
- best_metric_value = float("inf")
330
- best_config = None
331
- num_evaluations = 0
332
-
333
- for config in combined_grid:
334
- selected_config = get_config(config)
335
- # Count the configuration being evaluated
336
- num_evaluations += 1
337
-
338
- if num_evaluations >= n_trials:
339
- break
340
-
341
- ## Print progress every 20 evaluations
342
- if num_evaluations % 20 == 0:
343
- print(f"Number of evaluations done: {num_evaluations}")
344
-
345
- # Evaluate the config on the validation set using train_val_pipeline
346
- metric_value = train_val_pipeline(
347
- train_set, val_set, selected_config, metric, diff_type
348
- )
349
-
350
- # Update best config if this one is better according to the selected metric
351
- if metric_value < best_metric_value:
352
- best_metric_value = metric_value
353
- best_config = config
354
-
355
- return best_config, best_metric_value, num_evaluations
356
-
357
-
358
- def fit_calibrate_predict(
359
- train_set: pd.DataFrame,
360
- test_set: pd.DataFrame,
361
- config: Dict,
362
- model_type: Literal["xgb", "mlp", "tpot"] = "xgb",
363
- number_cv_calib=5,
364
- diff_type: Literal["perc", "abs"] = "perc",
365
- covariates: list[str] = None,
366
- exclude_bool: bool = False,
367
- ) -> tuple:
368
- """
369
- Train a model on the train set, calibrate it with the cross-validated calibration method, and return its predictions on the test set.
370
- """
371
-
372
- config = get_config(config)
373
-
374
- # Step 1: Validate and preprocess the input data
375
- train_set = validate_and_clean_input(train_set, covariates=covariates)
376
- test_set = validate_and_clean_input(test_set, covariates=covariates)
377
-
378
- train_set_preprocessed, scaler_train = preprocess_data(
379
- train_set, diff_type=diff_type
380
- )
381
- test_set_preprocessed = preprocess_data_val(
382
- df=test_set, df_train=train_set, diff_type=diff_type, scaler=scaler_train
383
- )
384
-
385
- # Step 2: Fuzzy partition for the train and test sets
386
- fp = FuzzyPartition(
387
- fuzzy_function=config["fuzzy_part_func"],
388
- n_clusters=config["n_clusters"],
389
- sigma=config["sigma"],
390
- scaler=scaler_train,
391
- verbosity=config["verbosity"],
392
- )
393
-
394
- # Prepare train and test fuzzy partitions
395
- X_training = train_set_preprocessed["diff_scaled"].values
396
- X_test = test_set_preprocessed["diff_scaled"].values
397
-
398
- train_fuzzy_partition, _, _ = fp.fuzzy_partition(X_training)
399
- test_fuzzy_partition, _, center_points_unscaled_test = fp.fuzzy_partition(X_test)
400
-
401
- if exclude_bool:
402
- ## Remove column left from train_fuzzy_partition
403
- train_fuzzy_partition = train_fuzzy_partition.drop(columns=["left"])
404
- test_fuzzy_partition = test_fuzzy_partition.drop(columns=["left"])
405
-
406
- # Prepare data for model training and testing
407
- X_train, y_train = prepare_for_model(
408
- train_fuzzy_partition.copy(), config["number_of_lags"]
409
- )
410
- X_test_final, _ = prepare_for_model_val_set(
411
- df_val_fp=test_fuzzy_partition.copy(),
412
- df_train_fp=train_fuzzy_partition.copy(),
413
- n_lags=config["number_of_lags"],
414
- )
415
-
416
- if covariates:
417
- train_covariates = train_set[covariates].copy()
418
- test_covariates = test_set[covariates].copy()
419
-
420
- X_train = pd.concat([X_train, train_covariates], axis=1)
421
- X_test_final = pd.concat([X_test_final, test_covariates], axis=1)
422
-
423
- # Step 3: Train the model on the train set
424
- model = FuzzyPipelineModel(
425
- n_clusters=config["n_clusters"],
426
- number_of_lags=config["number_of_lags"],
427
- verbosity=config["verbosity"],
428
- )
429
-
430
- model.fit(X_train, y_train, model_type=model_type)
431
-
432
- try:
433
- # Step 4: Calibrate the model using CalibratedClassifierCV
434
- model.calibrate(X_train, y_train, method="sigmoid", cv=number_cv_calib)
435
- except Exception:
436
- pass
437
-
438
- # Step 5: Predict on the test set
439
- y_test_pred_cluster = model.predict(X_test_final)
440
-
441
- ## Convert prediction to a crisp number using the cluster center points
442
- y_test_pred_center_point = [
443
- center_points_unscaled_test[i] for i in y_test_pred_cluster
444
- ]
445
-
446
- ## Recalculate percentage difference to actual values
447
- y_test_pred = [None] * len(test_set)
448
-
449
- # Set the first prediction using the last known value from the train set
450
- last_train_value = train_set["Y"].iloc[
451
- -1
452
- ] # last known value from the training set
453
- y_test_pred[0] = last_train_value * (1 + y_test_pred_center_point[0]) if diff_type == "perc" else last_train_value + y_test_pred_center_point[0]
454
-
455
- if diff_type == "perc":
456
- # Calculate each subsequent prediction from the previous actual value in test_set["Y"]
457
- for i in range(1, len(test_set)):
458
- prev_Y = test_set["Y"].iloc[
459
- i - 1
460
- ] # previous actual value from test_set
461
- perc_change = y_test_pred_center_point[i]
462
- y_test_pred[i] = prev_Y * (1 + perc_change)
463
-
464
- elif diff_type == "abs":
465
- for i in range(1, len(test_set)):
466
- prev_Y = test_set["Y"].iloc[i - 1]
467
- y_test_pred[i] = prev_Y + y_test_pred_center_point[i]
468
-
469
- return y_test_pred_cluster, y_test_pred_center_point, y_test_pred
1
+ # pipeline.py
2
+ import pandas as pd
3
+ from typing import Dict, Literal
4
+ import optuna
5
+ import numpy as np
6
+
7
+ from autofuzzts.config import get_config
8
+ from autofuzzts.data import data_loader
9
+ from autofuzzts.data_validation.validate import validate_and_clean_input
10
+ from autofuzzts.partition.partition import FuzzyPartition
11
+ from autofuzzts.preprocess.preprocess import preprocess_data, preprocess_data_val
12
+ from autofuzzts.preprocess.prep_for_model import (
13
+ prepare_for_model,
14
+ prepare_for_model_val_set,
15
+ )
16
+ from autofuzzts.models.fuzzy_classifier import FuzzyPipelineModel
17
+
18
+ from sklearn.model_selection import ParameterGrid
19
+ from sklearn.calibration import CalibratedClassifierCV
20
+
21
+
22
+ ## Import error metrics (RMSE, MAE, MSE)
23
+ from sklearn.metrics import (
24
+ root_mean_squared_error,
25
+ mean_absolute_error,
26
+ mean_squared_error,
27
+ )
28
+
29
+ # Example custom configuration
30
+ custom_config = {
31
+ "n_fuzzy_sets": 5,
32
+ "verbosity": True,
33
+ }
34
+
35
+ # Retrieve the final configuration
36
+ selected_config = get_config(custom_config)
37
+
38
+
39
+ def run_pipeline(dataset_name: str, config: dict = selected_config):
40
+ # Load data
41
+
42
+ data = data_loader.load_sample_data(dataset_name)
43
+ print(data.head(5))
44
+ print("Evaluated configuration is")
45
+ print(config)
46
+
47
+ pass
48
+
49
+
50
+ def train_val_pipeline(
51
+ train_set: pd.DataFrame,
52
+ val_set: pd.DataFrame,
53
+ config: Dict = selected_config,
54
+ metric: Literal["rmse", "mse", "mae"] = "rmse",
55
+ diff_type: Literal["perc", "abs"] = "perc",
56
+ covariates: list[str] = None,
57
+ ) -> float:
58
+ train_set = validate_and_clean_input(train_set, covariates)
59
+ val_set = validate_and_clean_input(val_set, covariates)
60
+
61
+ print("train set length:", len(train_set))
62
+
63
+ if covariates:
64
+ train_covariates = train_set[covariates].copy()
65
+ val_covariates = val_set[covariates].copy()
66
+
67
+ train_set_preprocessed, scaler_train = preprocess_data(train_set, diff_type)
68
+ val_set_preprocessed = preprocess_data_val(
69
+ df=val_set, df_train=train_set, diff_type=diff_type, scaler=scaler_train
70
+ )
71
+
72
+ fp = FuzzyPartition(
73
+ fuzzy_function=config["fuzzy_part_func"],
74
+ n_fuzzy_sets=config["n_fuzzy_sets"],
75
+ sigma=config["sigma"],
76
+ scaler=scaler_train,
77
+ verbosity=config["verbosity"],
78
+ )
79
+
80
+ # Prepare train and validation fuzzy partitions
81
+ X_training = train_set_preprocessed["diff_scaled"].values
82
+ X_validation = val_set_preprocessed["diff_scaled"].values
83
+
84
+ train_fuzzy_partition, _, _ = fp.fuzzy_partition(X_training)
85
+ val_fuzzy_partition, _, center_points_unscaled_test_val = fp.fuzzy_partition(
86
+ X_validation
87
+ )
88
+
89
+ X_train, y_train = prepare_for_model(
90
+ train_fuzzy_partition.copy(), config["number_of_lags"]
91
+ )
92
+ X_val, y_val = prepare_for_model_val_set(
93
+ df_val_fp=val_fuzzy_partition.copy(),
94
+ df_train_fp=train_fuzzy_partition.copy(),
95
+ n_lags=config["number_of_lags"],
96
+ )
97
+
98
+ if covariates:
99
+ X_train = pd.concat([X_train, train_covariates], axis=1)
100
+ X_val = pd.concat([X_val, val_covariates], axis=1)
101
+
102
+ model = FuzzyPipelineModel(
103
+ n_fuzzy_sets=config["n_fuzzy_sets"],
104
+ number_of_lags=config["number_of_lags"],
105
+ verbosity=config["verbosity"],
106
+ )
107
+
108
+ model.fit(X_train, y_train, model_type="xgb")
109
+
110
+ pred_fuzzy_set = model.predict(X_val)
111
+
112
+ ## Convert prediction to a crisp number using the fuzzy-set center points
113
+ y_val_pred_center_point = [center_points_unscaled_test_val[i] for i in pred_fuzzy_set]
114
+
115
+ ## Recalculate percentage difference to actual values
116
+ y_val_pred = [None] * len(val_set)
117
+
118
+ # Set the first prediction using the last known value from the train set
119
+ last_train_value = train_set["Y"].iloc[
120
+ -1
121
+ ] # last known value from the training set
122
+ y_val_pred[0] = last_train_value * (1 + y_val_pred_center_point[0]) if diff_type == "perc" else last_train_value + y_val_pred_center_point[0]
123
+
124
+ # Calculate each subsequent prediction from the previous actual value in val_set["Y"]
125
+
126
+ if diff_type == "perc":
127
+ for i in range(1, len(val_set)):
128
+ prev_Y = val_set["Y"].iloc[
129
+ i - 1
130
+ ] # previous actual value from val_set
131
+ perc_change = y_val_pred_center_point[i]
132
+ y_val_pred[i] = prev_Y * (1 + perc_change)
133
+
134
+ elif diff_type == "abs":
135
+ for i in range(1, len(val_set)):
136
+ prev_Y = val_set["Y"].iloc[i - 1]
137
+ y_val_pred[i] = prev_Y + y_val_pred_center_point[i]
138
+
139
+ if metric == "rmse":
140
+ metric_value = root_mean_squared_error(val_set["Y"], y_val_pred)
141
+ elif metric == "mse":
142
+ metric_value = mean_squared_error(val_set["Y"], y_val_pred)
143
+ elif metric == "mae":
144
+ metric_value = mean_absolute_error(val_set["Y"], y_val_pred)
145
+ else:
146
+ raise ValueError(
147
+ f"Invalid metric {metric}. Please choose one of 'rmse', 'mse', 'mae'"
148
+ )
149
+
150
+ return metric_value
151
+
152
+
153
+ def train_model(
154
+ dataset: pd.DataFrame,
155
+ config: Dict,
156
+ model_type: Literal["xgb", "mlp", "tpot"] = "xgb",
157
+ ):
158
+ """
159
+ Function to train a model on the dataset provided.
160
+
161
+ Parameters:
162
+ dataset: pd.DataFrame
163
+ The dataset to train the model on.
164
+ config: dict
165
+ The configuration dictionary for the model.
166
+ model_type: str
167
+ The type of model to train. Default is 'xgb'.
168
+
169
+ """
170
+ config = get_config(config)
171
+
172
+ df = validate_and_clean_input(dataset)
173
+
174
+ df_preprocessed, scaler_train = preprocess_data(df, diff_type="perc")
175
+
176
+ fp = FuzzyPartition(
177
+ fuzzy_function=config["fuzzy_part_func"],
178
+ n_fuzzy_sets=config["n_fuzzy_sets"],
179
+ sigma=config["sigma"],
180
+ scaler=scaler_train,
181
+ verbosity=config["verbosity"],
182
+ )
183
+
184
+ X_training = df_preprocessed["diff_scaled"].values
185
+
186
+ train_fuzzy_partition, _, _ = fp.fuzzy_partition(X_training)
187
+
188
+ X_train, y_train = prepare_for_model(
189
+ train_fuzzy_partition.copy(), config["number_of_lags"]
190
+ )
191
+
192
+ model_train = FuzzyPipelineModel(
193
+ n_fuzzy_sets=config["n_fuzzy_sets"],
194
+ number_of_lags=config["number_of_lags"],
195
+ verbosity=config["verbosity"],
196
+ )
197
+
198
+ model_train.fit(X_train, y_train, model_type=model_type)
199
+
200
+ return model_train, scaler_train
201
+
202
+
203
+ def tune_hyperparameters_bayes(
204
+ train_set: pd.DataFrame,
205
+ val_set: pd.DataFrame,
206
+ n_trials: int = 315,
207
+ metric: Literal["rmse", "mse", "mae"] = "rmse",
208
+ diff_type: Literal["perc", "abs"] = "perc",
209
+ covariates: list[str] = None,
210
+ ):
211
+ def objective(trial):
212
+ # Define search space based on your specifications
213
+ config = {
214
+ "n_fuzzy_sets": trial.suggest_int(
215
+ "n_fuzzy_sets", 4, 40
216
+ ), # Number of fuzzy sets
217
+ "number_of_lags": trial.suggest_int(
218
+ "number_of_lags", 1, 10
219
+ ), # Number of lags
220
+ "fuzzy_part_func": trial.suggest_categorical(
221
+ "fuzzy_part_func", ["Triangle", "Cosine", "Gaussian"]
222
+ ), # Partition function type
223
+ }
224
+
225
+ if config["fuzzy_part_func"] == "Gaussian":
226
+ config["sigma"] = trial.suggest_float("sigma", 0.1, 4, log=True)
227
+ else:
228
+ config["sigma"] = None
229
+
230
+ selected_config = get_config(config)
231
+
232
+ # Use train_val_pipeline to evaluate this configuration
233
+ metric_value = train_val_pipeline(
234
+ train_set,
235
+ val_set,
236
+ selected_config,
237
+ metric,
238
+ diff_type,
239
+ covariates=covariates,
240
+ )
241
+ return metric_value
242
+
243
+ # Create and optimize the Optuna study
244
+ study = optuna.create_study(direction="minimize")
245
+ study.optimize(objective, n_trials=n_trials)
246
+
247
+ # Extract the best configuration and score
248
+ best_config = study.best_params
249
+ best_metric_value = study.best_value
250
+
251
+ print(f"Best Config: {best_config}")
252
+ print(f"Best {metric.upper()}: {best_metric_value}")
253
+ return best_config, best_metric_value
254
+
255
+
256
+ def tune_hyperparameters_bayes_Henon(
257
+ train_set: pd.DataFrame,
258
+ val_set: pd.DataFrame,
259
+ n_trials: int = 315,
260
+ metric: Literal["rmse", "mse", "mae"] = "rmse",
261
+ diff_type: Literal["perc", "abs"] = "perc",
262
+ ):
263
+ def objective(trial):
264
+ config = {
265
+ "n_fuzzy_sets": trial.suggest_int(
266
+ "n_fuzzy_sets", 2, 29
267
+ ), # Number of fuzzy sets
268
+ "number_of_lags": trial.suggest_int("n_lags", 2, 5), # Number of lags
269
+ "fuzzy_part_func": trial.suggest_categorical(
270
+ "fuzzy_part_func", ["Triangle", "Cosine", "Gaussian"]
271
+ ),
272
+ }
273
+
274
+ if config["fuzzy_part_func"] == "Gaussian":
275
+ config["sigma"] = trial.suggest_float("sigma", 0.1, 4, log=True)
276
+ else:
277
+ config["sigma"] = None
278
+
279
+ selected_config = get_config(config)
280
+
281
+ # Use train_val_pipeline to evaluate this configuration
282
+ metric_value = train_val_pipeline(
283
+ train_set, val_set, selected_config, metric, diff_type
284
+ )
285
+ return metric_value
286
+
287
+ # Create and optimize the Optuna study
288
+ study = optuna.create_study(direction="minimize")
289
+ study.optimize(objective, n_trials=n_trials)
290
+
291
+ # Extract the best configuration and score
292
+ best_config = study.best_params
293
+ best_metric_value = study.best_value
294
+
295
+ print(f"Best Config: {best_config}")
296
+ print(f"Best {metric.upper()}: {best_metric_value}")
297
+ return best_config, best_metric_value
298
+
299
+
300
+ def tune_hyperparameters_grid(
301
+ train_set: pd.DataFrame,
302
+ val_set: pd.DataFrame,
303
+ n_trials: int = 315,
304
+ metric: Literal["rmse", "mse", "mae"] = "rmse",
305
+ diff_type: Literal["perc", "abs"] = "perc",
306
+ ):
307
+ # Define grid for Gaussian fuzzy function (includes 'sigma')
308
+ grid_gauss = {
309
+ "n_lags": [1, 3, 5, 7, 9],
310
+ "n_fuzzy_sets": [4, 6, 8, 10, 12, 14, 16, 18, 20],
311
+ "sigma": [0.1, 0.5, 1, 5, 9],
312
+ "fuzzy_part_func": ["matrix_F_transform_gauss"],
313
+ }
314
+
315
+ # Define grid for non-Gaussian fuzzy functions (excludes 'sigma')
316
+ grid_non_gauss = {
317
+ "n_lags": [1, 3, 5, 7, 9],
318
+ "n_fuzzy_sets": [4, 6, 8, 10, 12, 14, 16, 18, 20],
319
+ "sigma": [None], # Set sigma to None for non-Gaussian functions
320
+ "fuzzy_part_func": ["matrix_F_transform_cosine", "matrix_F_transform_triangle"],
321
+ }
322
+
323
+ # Combine the grids
324
+ grid_gauss = list(ParameterGrid(grid_gauss))
325
+ grid_non_gauss = list(ParameterGrid(grid_non_gauss))
326
+ combined_grid = grid_gauss + grid_non_gauss
327
+
328
+ ## Run the grid search------------------------------------------------------------------------------------------------------
329
+ best_metric_value = float("inf")
330
+ best_config = None
331
+ num_evaluations = 0
332
+
333
+ for config in combined_grid:
334
+ selected_config = get_config(config)
335
+ # Count the configuration being evaluated
336
+ num_evaluations += 1
337
+
338
+ if num_evaluations >= n_trials:
339
+ break
340
+
341
+ ## Print progress every 20 evaluations
342
+ if num_evaluations % 20 == 0:
343
+ print(f"Number of evaluations done: {num_evaluations}")
344
+
345
+ # Evaluate the config on the validation set using train_val_pipeline
346
+ metric_value = train_val_pipeline(
347
+ train_set, val_set, selected_config, metric, diff_type
348
+ )
349
+
350
+ # Update best config if this one is better according to the selected metric
351
+ if metric_value < best_metric_value:
352
+ best_metric_value = metric_value
353
+ best_config = config
354
+
355
+ return best_config, best_metric_value, num_evaluations
356
+
357
+
358
+ def fit_calibrate_predict(
359
+ train_set: pd.DataFrame,
360
+ test_set: pd.DataFrame,
361
+ config: Dict,
362
+ model_type: Literal["xgb", "mlp", "tpot"] = "xgb",
363
+ number_cv_calib=5,
364
+ diff_type: Literal["perc", "abs"] = "perc",
365
+ covariates: list[str] = None,
366
+ exclude_bool: bool = False,
367
+ ) -> tuple:
368
+ """
369
+ Train a model on the train set, calibrate it with the cross-validated calibration method, and return its predictions on the test set.
370
+ """
371
+
372
+ config = get_config(config)
373
+
374
+ # Step 1: Validate and preprocess the input data
375
+ train_set = validate_and_clean_input(train_set, covariates=covariates)
376
+ test_set = validate_and_clean_input(test_set, covariates=covariates)
377
+
378
+ train_set_preprocessed, scaler_train = preprocess_data(
379
+ train_set, diff_type=diff_type
380
+ )
381
+ test_set_preprocessed = preprocess_data_val(
382
+ df=test_set, df_train=train_set, diff_type=diff_type, scaler=scaler_train
383
+ )
384
+
385
+ # Step 2: Fuzzy partition for the train and test sets
386
+ fp = FuzzyPartition(
387
+ fuzzy_function=config["fuzzy_part_func"],
388
+ n_fuzzy_sets=config["n_fuzzy_sets"],
389
+ sigma=config["sigma"],
390
+ scaler=scaler_train,
391
+ verbosity=config["verbosity"],
392
+ )
393
+
394
+ # Prepare train and test fuzzy partitions
395
+ X_training = train_set_preprocessed["diff_scaled"].values
396
+ X_test = test_set_preprocessed["diff_scaled"].values
397
+
398
+ train_fuzzy_partition, _, _ = fp.fuzzy_partition(X_training)
399
+ test_fuzzy_partition, _, center_points_unscaled_test = fp.fuzzy_partition(X_test)
400
+
401
+ if exclude_bool:
402
+ ## Remove column left from train_fuzzy_partition
403
+ train_fuzzy_partition = train_fuzzy_partition.drop(columns=["left"])
404
+ test_fuzzy_partition = test_fuzzy_partition.drop(columns=["left"])
405
+
406
+ # Prepare data for model training and testing
407
+ X_train, y_train = prepare_for_model(
408
+ train_fuzzy_partition.copy(), config["number_of_lags"]
409
+ )
410
+ X_test_final, _ = prepare_for_model_val_set(
411
+ df_val_fp=test_fuzzy_partition.copy(),
412
+ df_train_fp=train_fuzzy_partition.copy(),
413
+ n_lags=config["number_of_lags"],
414
+ )
415
+
416
+ if covariates:
417
+ train_covariates = train_set[covariates].copy()
418
+ test_covariates = test_set[covariates].copy()
419
+
420
+ X_train = pd.concat([X_train, train_covariates], axis=1)
421
+ X_test_final = pd.concat([X_test_final, test_covariates], axis=1)
422
+
423
+ # Step 3: Train the model on the train set
424
+ model = FuzzyPipelineModel(
425
+ n_fuzzy_sets=config["n_fuzzy_sets"],
426
+ number_of_lags=config["number_of_lags"],
427
+ verbosity=config["verbosity"],
428
+ )
429
+
430
+ model.fit(X_train, y_train, model_type=model_type)
431
+
432
+ try:
433
+ # Step 4: Calibrate the model using CalibratedClassifierCV
434
+ model.calibrate(X_train, y_train, method="sigmoid", cv=number_cv_calib)
435
+ except Exception:
436
+ pass
437
+
438
+ # Step 5: Predict on the test set
439
+ y_test_pred_fuzzy_set = model.predict(X_test_final)
440
+
441
+ ## Convert prediction to a crisp number using the fuzzy-set center points
442
+ y_test_pred_center_point = [
443
+ center_points_unscaled_test[i] for i in y_test_pred_fuzzy_set
444
+ ]
445
+
446
+ ## Recalculate percentage difference to actual values
447
+ y_test_pred = [None] * len(test_set)
448
+
449
+ # Set the first prediction using the last known value from the train set
450
+ last_train_value = train_set["Y"].iloc[
451
+ -1
452
+ ] # last known value from the training set
453
+ y_test_pred[0] = last_train_value * (1 + y_test_pred_center_point[0]) if diff_type == "perc" else last_train_value + y_test_pred_center_point[0]
454
+
455
+ if diff_type == "perc":
456
+ # Calculate each subsequent prediction from the previous actual value in test_set["Y"]
457
+ for i in range(1, len(test_set)):
458
+ prev_Y = test_set["Y"].iloc[
459
+ i - 1
460
+ ] # previous actual value from test_set
461
+ perc_change = y_test_pred_center_point[i]
462
+ y_test_pred[i] = prev_Y * (1 + perc_change)
463
+
464
+ elif diff_type == "abs":
465
+ for i in range(1, len(test_set)):
466
+ prev_Y = test_set["Y"].iloc[i - 1]
467
+ y_test_pred[i] = prev_Y + y_test_pred_center_point[i]
468
+
469
+ return y_test_pred_fuzzy_set, y_test_pred_center_point, y_test_pred
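For orientation, a minimal end-to-end sketch against the 0.1.3 pipeline.py shown above. It is illustrative only: the synthetic random-walk series, the split sizes, and the assumption that get_config fills in keys the search leaves unset (for example sigma and verbosity) are not taken from the package documentation; the required "Y" target column and the function signatures follow the code above.

# Illustrative usage sketch: tune on a validation split, then fit, calibrate, and predict on a test split.
import numpy as np
import pandas as pd

from autofuzzts.config import get_config
from autofuzzts.pipeline import fit_calibrate_predict, tune_hyperparameters_bayes

rng = np.random.default_rng(0)
series = 100 + np.cumsum(rng.normal(0, 1, 300))  # synthetic random-walk series
df = pd.DataFrame({"Y": series})                 # the pipeline reads the target from column "Y"

train, val, test = df.iloc[:200], df.iloc[200:250], df.iloc[250:]

# Bayesian search over n_fuzzy_sets, number_of_lags, fuzzy_part_func (and sigma for Gaussian partitions)
best_params, best_rmse = tune_hyperparameters_bayes(train, val, n_trials=20, metric="rmse")

# Assumes get_config supplies any keys the search did not set (e.g. sigma, verbosity)
config = get_config(best_params)

# Returns the predicted fuzzy sets, their unscaled center points, and the reconstructed forecasts
pred_sets, pred_centers, y_pred = fit_calibrate_predict(train, test, config, model_type="xgb")
print(f"validation RMSE: {best_rmse:.3f}; first test forecasts: {y_pred[:5]}")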