autofuzzts 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autofuzzts/pipeline.py +234 -148
- autofuzzts-0.1.2.dist-info/METADATA +146 -0
- {autofuzzts-0.1.0.dist-info → autofuzzts-0.1.2.dist-info}/RECORD +6 -6
- autofuzzts-0.1.0.dist-info/METADATA +0 -41
- {autofuzzts-0.1.0.dist-info → autofuzzts-0.1.2.dist-info}/WHEEL +0 -0
- {autofuzzts-0.1.0.dist-info → autofuzzts-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {autofuzzts-0.1.0.dist-info → autofuzzts-0.1.2.dist-info}/top_level.txt +0 -0
autofuzzts/pipeline.py
CHANGED
@@ -8,8 +8,11 @@ from autofuzzts.config import get_config
 from autofuzzts.data import data_loader
 from autofuzzts.data_validation.validate import validate_and_clean_input
 from autofuzzts.partition.partition import FuzzyPartition
-from autofuzzts.preprocess.preprocess import preprocess_data,preprocess_data_val
-from autofuzzts.preprocess.prep_for_model import
+from autofuzzts.preprocess.preprocess import preprocess_data, preprocess_data_val
+from autofuzzts.preprocess.prep_for_model import (
+    prepare_for_model,
+    prepare_for_model_val_set,
+)
 from autofuzzts.models.fuzzy_classifier import FuzzyPipelineModel

 from sklearn.model_selection import ParameterGrid
@@ -17,7 +20,11 @@ from sklearn.calibration import CalibratedClassifierCV


 ## Import RMSE and MAE
-from sklearn.metrics import
+from sklearn.metrics import (
+    root_mean_squared_error,
+    mean_absolute_error,
+    mean_squared_error,
+)

 # Example custom configuration
 custom_config = {
@@ -29,100 +36,126 @@ custom_config = {
 selected_config = get_config(custom_config)


-
 def run_pipeline(datasetet_name: str, config: dict = selected_config):
     # Load data

     data = data_loader.load_sample_data(datasetet_name)
     print(data.head(5))
-    print(
+    print("Evaluated configuration is")
     print(config)

     pass
-

-
-
+
+def train_val_pipeline(
+    train_set: pd.DataFrame,
+    val_set: pd.DataFrame,
+    config: Dict = selected_config,
+    metric: Literal["rmse", "mse", "mae"] = "rmse",
+    diff_type: Literal["perc", "abs"] = "perc",
+    covariates: list[str] = None,
+) -> float:
     train_set = validate_and_clean_input(train_set, covariates)
     val_set = validate_and_clean_input(val_set, covariates)

-    print(
+    print("train set length:", len(train_set))

-    if covariates
+    if covariates:
         train_covariates = train_set[covariates].copy()
-        val_covariates = val_set[covariates].copy()
-
+        val_covariates = val_set[covariates].copy()

     train_set_preprocessed, scaler_train = preprocess_data(train_set, diff_type)
-    val_set_preprocessed = preprocess_data_val(
-
-
-
+    val_set_preprocessed = preprocess_data_val(
+        df=val_set, df_train=train_set, diff_type=diff_type, scaler=scaler_train
+    )
+
+    fp = FuzzyPartition(
+        fuzzy_function=config["fuzzy_part_func"],
+        n_clusters=config["n_clusters"],
+        sigma=config["sigma"],
+        scaler=scaler_train,
+        verbosity=config["verbosity"],
+    )

     # Prepare train and validation fuzzy partitions
-    X_training = train_set_preprocessed[
-    X_validation = val_set_preprocessed[
-
+    X_training = train_set_preprocessed["diff_scaled"].values
+    X_validation = val_set_preprocessed["diff_scaled"].values

-    train_fuzzy_partition,_,_
-    val_fuzzy_partition, _,center_points_unscaled_test_val
-
-
-
+    train_fuzzy_partition, _, _ = fp.fuzzy_partition(X_training)
+    val_fuzzy_partition, _, center_points_unscaled_test_val = fp.fuzzy_partition(
+        X_validation
+    )
+
+    X_train, y_train = prepare_for_model(
+        train_fuzzy_partition.copy(), config["number_of_lags"]
+    )
+    X_val, y_val = prepare_for_model_val_set(
+        df_val_fp=val_fuzzy_partition.copy(),
+        df_train_fp=train_fuzzy_partition.copy(),
+        n_lags=config["number_of_lags"],
+    )

-
     if covariates:
         X_train = pd.concat([X_train, train_covariates], axis=1)
         X_val = pd.concat([X_val, val_covariates], axis=1)

-    model = FuzzyPipelineModel(
-
+    model = FuzzyPipelineModel(
+        n_clusters=config["n_clusters"],
+        number_of_lags=config["number_of_lags"],
+        verbosity=config["verbosity"],
+    )

-    model.fit(X_train, y_train, model_type=
+    model.fit(X_train, y_train, model_type="xgb")

     pred_cluster = model.predict(X_val)

-
     ## Convert prediction to crips number using center points of clusters
     y_val_pred_center_point = [center_points_unscaled_test_val[i] for i in pred_cluster]

-
-
-
     ## Recalculate percentage difference to actual values
-    y_val_pred= [None] * len(val_set)
+    y_val_pred = [None] * len(val_set)

     # Set the first prediction using the last known value from the train set
-    last_train_value = train_set[
+    last_train_value = train_set["Y"].iloc[
+        -1
+    ]  # Assuming `df_train` holds the training data
     y_val_pred[0] = last_train_value * (1 + y_val_pred_center_point[0])

     # Loop to calculate each subsequent prediction based on the actual previous value in `df_test['Y']`

-    if diff_type ==
+    if diff_type == "perc":
         for i in range(1, len(val_set)):
-            prev_Y = val_set[
+            prev_Y = val_set["Y"].iloc[
+                i - 1
+            ]  # Use the previous actual value from `df_test`
             perc_change = y_val_pred_center_point[i]
             y_val_pred[i] = prev_Y * (1 + perc_change)

-    elif diff_type ==
+    elif diff_type == "abs":
         for i in range(1, len(val_set)):
-            prev_Y = val_set[
+            prev_Y = val_set["Y"].iloc[i - 1]
             y_val_pred[i] = prev_Y + y_val_pred_center_point[i]
-
-
-
-
-
-
-
-    metric_value = mean_absolute_error(val_set['Y'], y_val_pred)
+
+    if metric == "rmse":
+        metric_value = root_mean_squared_error(val_set["Y"], y_val_pred)
+    elif metric == "mse":
+        metric_value = root_mean_squared_error(val_set["Y"], y_val_pred) ** 2
+    elif metric == "mae":
+        metric_value = mean_absolute_error(val_set["Y"], y_val_pred)
     else:
-        raise ValueError(
-
+        raise ValueError(
+            f"Invalid metric {metric}. Please choose one of 'rmse', 'mse', 'mae'"
+        )
+
     return metric_value

-
-
+
+def train_model(
+    dataset: pd.DataFrame,
+    config: Dict,
+    model_type: Literal["xgb", "mlp", "tpot"] = "xgb",
+):
+    """
     Function to train a model on the dataset provided.

     Parameters:
@@ -132,52 +165,81 @@ def train_model(dataset: pd.DataFrame, config: Dict, model_type: Literal['xgb','
         The configuration dictionary for the model.
     model_type: str
         The type of model to train. Default is 'xgb'.
-
-
+
+    """
     config = get_config(config)

     df = validate_and_clean_input(dataset)
-
-    df_preprocessed, scaler_train = preprocess_data(df, diff_type='perc')

-
-
+    df_preprocessed, scaler_train = preprocess_data(df, diff_type="perc")
+
+    fp = FuzzyPartition(
+        fuzzy_function=config["fuzzy_part_func"],
+        n_clusters=config["n_clusters"],
+        sigma=config["sigma"],
+        scaler=scaler_train,
+        verbosity=config["verbosity"],
+    )

-    X_training = df_preprocessed[
+    X_training = df_preprocessed["diff_scaled"].values

-    train_fuzzy_partition,_,_
+    train_fuzzy_partition, _, _ = fp.fuzzy_partition(X_training)

-    X_train, y_train = prepare_for_model(
+    X_train, y_train = prepare_for_model(
+        train_fuzzy_partition.copy(), config["number_of_lags"]
+    )

-    model_train = FuzzyPipelineModel(
+    model_train = FuzzyPipelineModel(
+        n_clusters=config["n_clusters"],
+        number_of_lags=config["number_of_lags"],
+        verbosity=config["verbosity"],
+    )

     model_train.fit(X_train, y_train, model_type=model_type)
-
-
+
     return model_train, scaler_train

-
-
+
+def tune_hyperparameters_bayes(
+    train_set: pd.DataFrame,
+    val_set: pd.DataFrame,
+    n_trials: int = 315,
+    metric: Literal["rmse", "mse", "mae"] = "rmse",
+    diff_type: Literal["perc", "abs"] = "perc",
+    covariates: list[str] = None,
+):
     def objective(trial):
         # Define search space based on your specifications
         config = {
-
-
-
+            "n_clusters": trial.suggest_int(
+                "n_clusters", 4, 40
+            ),  # Number of fuzzy sets
+            "number_of_lags": trial.suggest_int(
+                "number_of_lags", 1, 10
+            ),  # Number of lags
+            "fuzzy_part_func": trial.suggest_categorical(
+                "fuzzy_part_func", ["Triangle", "Cosine", "Gaussian"]
+            ),  # Partition function type
         }

-        if config[
-            config[
+        if config["fuzzy_part_func"] == "Gaussian":
+            config["sigma"] = trial.suggest_float("sigma", 0.1, 4, log=True)
         else:
-            config[
+            config["sigma"] = None

         selected_config = get_config(config)

         # Use train_val_pipeline to evaluate this configuration
-        metric_value = train_val_pipeline(
+        metric_value = train_val_pipeline(
+            train_set,
+            val_set,
+            selected_config,
+            metric,
+            diff_type,
+            covariates=covariates,
+        )
         return metric_value

-
     # Create and optimize the Optuna study
     study = optuna.create_study(direction="minimize")
     study.optimize(objective, n_trials=n_trials)
@@ -191,26 +253,37 @@ def tune_hyperparameters_bayes(train_set: pd.DataFrame, val_set: pd.DataFrame, n
     return best_config, best_metric_value


-def tune_hyperparameters_bayes_Henon(
+def tune_hyperparameters_bayes_Henon(
+    train_set: pd.DataFrame,
+    val_set: pd.DataFrame,
+    n_trials: int = 315,
+    metric: Literal["rmse", "mse", "mae"] = "rmse",
+    diff_type: Literal["perc", "abs"] = "perc",
+):
     def objective(trial):
         config = {
-
-
-
+            "n_clusters": trial.suggest_int(
+                "n_clusters", 2, 29
+            ),  # Number of fuzzy sets
+            "number_of_lags": trial.suggest_int("n_lags", 2, 5),  # Number of lags
+            "fuzzy_part_func": trial.suggest_categorical(
+                "fuzzy_part_func", ["Triangle", "Cosine", "Gaussian"]
+            ),
         }

-        if config[
-            config[
+        if config["fuzzy_part_func"] == "Gaussian":
+            config["sigma"] = trial.suggest_float("sigma", 0.1, 4, log=True)
         else:
-            config[
-
+            config["sigma"] = None
+
         selected_config = get_config(config)

         # Use train_val_pipeline to evaluate this configuration
-        metric_value = train_val_pipeline(
+        metric_value = train_val_pipeline(
+            train_set, val_set, selected_config, metric, diff_type
+        )
         return metric_value

-
     # Create and optimize the Optuna study
     study = optuna.create_study(direction="minimize")
     study.optimize(objective, n_trials=n_trials)
@@ -224,25 +297,27 @@ def tune_hyperparameters_bayes_Henon(train_set: pd.DataFrame, val_set: pd.DataFr
     return best_config, best_metric_value


-
-
-
-
-
+def tune_hyperparameters_grid(
+    train_set: pd.DataFrame,
+    val_set: pd.DataFrame,
+    n_trials: int = 315,
+    metric: Literal["rmse", "mse", "mae"] = "rmse",
+    diff_type: Literal["perc", "abs"] = "perc",
+):
     # Define grid for Gaussian fuzzy function (includes 'sigma')
     grid_gauss = {
-
-
-
-
+        "n_lags": [1, 3, 5, 7, 9],
+        "n_clusters": [4, 6, 8, 10, 12, 14, 16, 18, 20],
+        "sigma": [0.1, 0.5, 1, 5, 9],
+        "fuzzy_part_func": ["matrix_F_transform_gauss"],
     }

     # Define grid for non-Gaussian fuzzy functions (excludes 'sigma')
     grid_non_gauss = {
-
-
-
-
+        "n_lags": [1, 3, 5, 7, 9],
+        "n_clusters": [4, 6, 8, 10, 12, 14, 16, 18, 20],
+        "sigma": [None],  # Set sigma to None for non-Gaussian functions
+        "fuzzy_part_func": ["matrix_F_transform_cosine", "matrix_F_transform_triangle"],
     }

     # Combine the grids
@@ -268,84 +343,95 @@ def tune_hyperparameters_grid(train_set: pd.DataFrame, val_set: pd.DataFrame,n_t
         print(f"Number of evaluations done: {num_evaluations}")

         # Evaluate the config on the validation set using train_val_pipeline
-        metric_value = train_val_pipeline(
+        metric_value = train_val_pipeline(
+            train_set, val_set, selected_config, metric, diff_type
+        )

         # Update best config if this one is better according to the selected metric
         if metric_value < best_metric_value:
             best_metric_value = metric_value
             best_config = config

-
     return best_config, best_metric_value, num_evaluations


-
-
-
-
-
-
-
+def fit_calibrate_predict(
+    train_set: pd.DataFrame,
+    test_set: pd.DataFrame,
+    config: Dict,
+    model_type: Literal["xgb", "mlp", "tpot"] = "xgb",
+    number_cv_calib=5,
+    diff_type: Literal["perc", "abs"] = "perc",
+    covariates: list[str] = None,
+    exclude_bool: bool = False,
+) -> float:
+    """
     Aim of this question is to train a model on the train set, calibrate it using the calibration method provided, and predict it on the test set using the metric provided.
-
+    """

     config = get_config(config)
-
+
     # Step 1: Validate and preprocess the input data
     train_set = validate_and_clean_input(train_set, covariates=covariates)
     test_set = validate_and_clean_input(test_set, covariates=covariates)

-    train_set_preprocessed, scaler_train = preprocess_data(
-
+    train_set_preprocessed, scaler_train = preprocess_data(
+        train_set, diff_type=diff_type
+    )
+    test_set_preprocessed = preprocess_data_val(
+        df=test_set, df_train=train_set, diff_type=diff_type, scaler=scaler_train
+    )

     # Step 2: Fuzzy Partition for train, validation, and test sets
-    fp = FuzzyPartition(
-
-
-
-
+    fp = FuzzyPartition(
+        fuzzy_function=config["fuzzy_part_func"],
+        n_clusters=config["n_clusters"],
+        sigma=config["sigma"],
+        scaler=scaler_train,
+        verbosity=config["verbosity"],
+    )

     # Prepare train, validation, and test fuzzy partitions
-    X_training = train_set_preprocessed[
-    X_test = test_set_preprocessed[
+    X_training = train_set_preprocessed["diff_scaled"].values
+    X_test = test_set_preprocessed["diff_scaled"].values

     train_fuzzy_partition, _, _ = fp.fuzzy_partition(X_training)
     test_fuzzy_partition, _, center_points_unscaled_test = fp.fuzzy_partition(X_test)

     if exclude_bool:
         ## Remove column left from train_fuzzy_partition
-        train_fuzzy_partition = train_fuzzy_partition.drop(columns=[
-        test_fuzzy_partition = test_fuzzy_partition.drop(columns=[
-
-    train_fuzzy_partition.to_csv('train_fuzzy_partition.csv')
-    test_fuzzy_partition.to_csv('test_fuzzy_partition.csv')
-
-    print('center_points_unscaled_test:', center_points_unscaled_test)
+        train_fuzzy_partition = train_fuzzy_partition.drop(columns=["left"])
+        test_fuzzy_partition = test_fuzzy_partition.drop(columns=["left"])

     # Prepare data for model training, validation, and testing
-    X_train, y_train = prepare_for_model(
-
-
-
-
+    X_train, y_train = prepare_for_model(
+        train_fuzzy_partition.copy(), config["number_of_lags"]
+    )
+    X_test_final, _ = prepare_for_model_val_set(
+        df_val_fp=test_fuzzy_partition.copy(),
+        df_train_fp=train_fuzzy_partition.copy(),
+        n_lags=config["number_of_lags"],
+    )
+
     if covariates:
         train_covariates = train_set[covariates].copy()
-        test_covariates = test_set[covariates].copy()
+        test_covariates = test_set[covariates].copy()

         X_train = pd.concat([X_train, train_covariates], axis=1)
         X_test_final = pd.concat([X_test_final, test_covariates], axis=1)
-

     # Step 3: Train the model on the combined train and validation set
-    model = FuzzyPipelineModel(
-
-
-
+    model = FuzzyPipelineModel(
+        n_clusters=config["n_clusters"],
+        number_of_lags=config["number_of_lags"],
+        verbosity=config["verbosity"],
+    )
+
     model.fit(X_train, y_train, model_type=model_type)

     try:
         # Step 4: Calibrate the model using CalibratedClassifierCV
-        model.calibrate(X_train, y_train, method=
+        model.calibrate(X_train, y_train, method="sigmoid", cv=number_cv_calib)
     except:
         pass
@@ -353,31 +439,31 @@ def train_calib_pred_test(train_set: pd.DataFrame, test_set: pd.DataFrame,
     y_test_pred_cluster = model.predict(X_test_final)

     ## Convert prediction to crips number using center points of clusters
-    y_test_pred_center_point = [
-
-
-
+    y_test_pred_center_point = [
+        center_points_unscaled_test[i] for i in y_test_pred_cluster
+    ]

     ## Recalculate percentage difference to actual values
-    y_test_pred= [None] * len(test_set)
+    y_test_pred = [None] * len(test_set)

     # Set the first prediction using the last known value from the train set
-    last_train_value = train_set[
+    last_train_value = train_set["Y"].iloc[
+        -1
+    ]  # Assuming `df_train` holds the training data
     y_test_pred[0] = last_train_value * (1 + y_test_pred_center_point[0])

-    if diff_type ==
+    if diff_type == "perc":
         # Loop to calculate each subsequent prediction based on the actual previous value in `df_test['Y']`
         for i in range(1, len(test_set)):
-            prev_Y = test_set[
+            prev_Y = test_set["Y"].iloc[
+                i - 1
+            ]  # Use the previous actual value from `df_test`
             perc_change = y_test_pred_center_point[i]
             y_test_pred[i] = prev_Y * (1 + perc_change)

-    elif diff_type ==
+    elif diff_type == "abs":
         for i in range(1, len(test_set)):
-            prev_Y = test_set[
+            prev_Y = test_set["Y"].iloc[i - 1]
             y_test_pred[i] = prev_Y + y_test_pred_center_point[i]

-    return y_test_pred_cluster, y_test_pred_center_point,y_test_pred
-
-
-
+    return y_test_pred_cluster, y_test_pred_center_point, y_test_pred
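The core of the reworked `train_val_pipeline` and `fit_calibrate_predict` is the reconstruction loop: the predicted cluster center points are per-step differences, the first crisp value is seeded from the last training observation, and every later step anchors on the previous actual value. A minimal standalone sketch of the "perc" branch, with invented toy numbers rather than package data:

```python
# Toy sketch of the "perc" reconstruction loop from the diff (values invented).
last_train_value = 100.0              # last known Y from the train set
center_points = [0.01, -0.02, 0.03]   # predicted percentage differences
actual_Y = [101.0, 99.0, 102.0]       # observed test values

pred = [None] * len(center_points)
pred[0] = last_train_value * (1 + center_points[0])
for i in range(1, len(center_points)):
    # Later steps use the previous *actual* value, so this is a
    # one-step-ahead forecast rather than a rolled-out trajectory.
    pred[i] = actual_Y[i - 1] * (1 + center_points[i])

print(pred)  # ≈ [101.0, 98.98, 101.97]
```

The new metric dispatch then scores the reconstructed series against the actuals; note that `mse` is computed as the square of `root_mean_squared_error`, which equals `mean_squared_error` on the same inputs.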
autofuzzts-0.1.2.dist-info/METADATA
ADDED
@@ -0,0 +1,146 @@
+Metadata-Version: 2.4
+Name: autofuzzts
+Version: 0.1.2
+Summary: 'Time series forecasting using fuzzy logic and AutoML'
+Author-email: Jan Timko <jantimko16@gmail.com>
+License: MIT
+Project-URL: Homepage, https://github.com/jtimko16/AutoFuzzTS
+Project-URL: Repository, https://github.com/jtimko16/AutoFuzzTS
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: numpy>=1.26.0
+Requires-Dist: pandas>=2.2.0
+Requires-Dist: scikit-learn>=1.5.0
+Requires-Dist: scipy>=1.15.0
+Requires-Dist: xgboost>=3.0.0
+Requires-Dist: lightgbm>=4.6.0
+Requires-Dist: tpot>=1.0.0
+Requires-Dist: optuna>=4.3.0
+Requires-Dist: matplotlib>=3.10.0
+Requires-Dist: seaborn>=0.13.0
+Requires-Dist: requests>=2.32.0
+Requires-Dist: PyYAML>=6.0.0
+Requires-Dist: joblib>=1.4.0
+Requires-Dist: tqdm>=4.67.0
+Dynamic: license-file
+
+# AutoFuzzTS
+
+Time series forecasting library using fuzzy logic and automated machine learning.
+Build and evaluate time series models automatically using fuzzy logic and AutoML techniques.
+
+## Installation
+
+```bash
+pip install autofuzzts
+```
+
+## 🚀 Quick Start
+
+### Load and prepare your time series data
+```python
+import pandas as pd
+
+# Load dataset into a pandas DataFrame
+data = pd.read_csv("../clean_data/ADBE_yf_hourly_cleaned.csv").head(240)
+
+# Select the target column to forecast
+data_column_name = "close_price"
+df = data[[data_column_name]].copy()
+
+# Split into train, validation, and test sets
+test_len = len(df) // 5
+val_len = len(df) // 5
+train_len = len(df) - test_len - val_len
+
+df_train = df[:train_len]
+df_val = df[train_len:(train_len + val_len)]
+df_test = df[(train_len + val_len):]
+```
+
+---
+
+### Tune hyperparameters using Bayesian search
+```python
+from autofuzzts import pipeline
+
+# Run Bayesian optimization for fuzzy pipeline configuration
+best_config, best_rmse = pipeline.tune_hyperparameters_bayes(
+    train_set=df_train,
+    val_set=df_val,
+    n_trials=20,
+    metric="rmse"
+)
+
+print(f"Best configuration: {best_config}")
+```
+
+**Example output:**
+```
+Best configuration: {'n_clusters': 19, 'number_of_lags': 2, 'fuzzy_part_func': 'Triangle'}
+```
+
+---
+
+### Train, calibrate, and predict
+```python
+from autofuzzts import fit_calibrate_predict
+
+# Train model, calibrate, and make one-step-ahead predictions
+pred_set, pred_center_points, pred_test = fit_calibrate_predict(
+    train_set=df_train,
+    test_set=df_test,
+    config=best_config,
+    model_type="xgb"
+)
+```
+
+This returns:
+- `pred_set`: predicted fuzzy sets
+- `pred_center_points`: corresponding fuzzy center values
+- `pred_test`: crisp numeric predictions (one-step-ahead forecast)
+
+---
+
+## Function Overview
+
+### `fit_calibrate_predict()`
+
+```python
+fit_calibrate_predict(
+    train_set: pd.DataFrame,
+    test_set: pd.DataFrame,
+    config: dict,
+    model_type: Literal['xgb', 'mlp', 'tpot'] = 'xgb',
+    number_cv_calib: int = 5,
+    diff_type: Literal['perc', 'abs'] = 'perc',
+    covariates: list[str] | None = None,
+    exclude_bool: bool = False
+) -> float
+```
+
+Trains and calibrates a fuzzy time series model on the training set using
+cross-validation, then predicts on the test set and returns performance metrics.
+
+---
+
+## Description
+
+AutoFuzzTS automates the process of fuzzy time series modeling by:
+- building and testing multiple fuzzy pipelines,
+- tuning hyperparameters using Bayesian optimization, and
+- integrating tuned classification models - **XGBoost**, **MLP**, or **TPOT**.
+
+This allows for rapid experimentation and selection of optimal configurations
+for forecasting tasks.
+
+---
+
+
+---
+
+## 📄 License
+
+This project is licensed under the MIT License.
+
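One plausible follow-up to the Quick Start above, sketched here rather than taken from the package README: scoring the crisp forecasts. It assumes `df_test`, `data_column_name`, and `pred_test` from the README examples, and that `pred_test` aligns row-for-row with `df_test`.

```python
# Hedged sketch: evaluate the crisp one-step-ahead forecasts from
# fit_calibrate_predict against the held-out test values.
from sklearn.metrics import mean_absolute_error, root_mean_squared_error

rmse = root_mean_squared_error(df_test[data_column_name], pred_test)
mae = mean_absolute_error(df_test[data_column_name], pred_test)
print(f"Test RMSE: {rmse:.4f}, MAE: {mae:.4f}")
```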
{autofuzzts-0.1.0.dist-info → autofuzzts-0.1.2.dist-info}/RECORD
CHANGED
@@ -1,6 +1,6 @@
 autofuzzts/__init__.py,sha256=2k_ZeqU7FvqZMFqGm-EYRiV98uxUxmiy5wXygvIobPU,13
 autofuzzts/config.py,sha256=rzwULHfKKtf5Rdpm8pk-zwuXrkKc0dckF-xIfz1UVcY,392
-autofuzzts/pipeline.py,sha256=
+autofuzzts/pipeline.py,sha256=wwaVXBvnoAvd3MDvEaj4xKqPlBWMSyOHSR5TOTP2jTo,16189
 autofuzzts/utils.py,sha256=lywC_KhHuYgjUmXjj-ay9vZYTKUSxFgWXY2q6EdWf9s,10
 autofuzzts/data/__init__.py,sha256=2k_ZeqU7FvqZMFqGm-EYRiV98uxUxmiy5wXygvIobPU,13
 autofuzzts/data/data_loader.py,sha256=VO8V9O3WgXffyktUMSmbGTiXWBJ2kgN5wLqgFgvkE6w,266
@@ -18,8 +18,8 @@ autofuzzts/partition/visualize_partition.py,sha256=F31yovGfosqa-EmtuQdIIuF61XejH
 autofuzzts/preprocess/__init__.py,sha256=2k_ZeqU7FvqZMFqGm-EYRiV98uxUxmiy5wXygvIobPU,13
 autofuzzts/preprocess/prep_for_model.py,sha256=mp19PGo_p8YWezSny__qKnuTREhAldSlxCzIutrisGk,2565
 autofuzzts/preprocess/preprocess.py,sha256=QZ0h4bZslwOrjTUyvPQaXDT_lBlnL8nKdp545Qy3xdk,2786
-autofuzzts-0.1.
-autofuzzts-0.1.
-autofuzzts-0.1.
-autofuzzts-0.1.
-autofuzzts-0.1.
+autofuzzts-0.1.2.dist-info/licenses/LICENSE,sha256=bjnZy7iTBVYeRcAPI9NVlXeQGx62R13_t8xwoLq44Ms,1087
+autofuzzts-0.1.2.dist-info/METADATA,sha256=XuLUJuUcurF9DZE0YLGwFzkdJbIEZMXdJ3MI2KFztNk,3764
+autofuzzts-0.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+autofuzzts-0.1.2.dist-info/top_level.txt,sha256=YHgbVRUPg-x2WX7FKyJMUAeI9o46c8XFiR_eYKtXIxc,11
+autofuzzts-0.1.2.dist-info/RECORD,,
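Each RECORD row has the form `path,sha256=<digest>,<size>`, where the digest is the unpadded URL-safe base64 SHA-256 of the file, which is why the `pipeline.py` hash and size change above track the new source. A sketch for recomputing one entry locally (the path is illustrative):

```python
# Sketch: recompute a wheel RECORD hash for comparison (illustrative path).
import base64
import hashlib

def record_hash(path: str) -> str:
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    # RECORD uses URL-safe base64 with the trailing "=" padding stripped.
    return base64.urlsafe_b64encode(digest).rstrip(b"=").decode()

print(record_hash("autofuzzts/pipeline.py"))
# expected for 0.1.2: wwaVXBvnoAvd3MDvEaj4xKqPlBWMSyOHSR5TOTP2jTo
```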
autofuzzts-0.1.0.dist-info/METADATA
REMOVED
@@ -1,41 +0,0 @@
-Metadata-Version: 2.4
-Name: autofuzzts
-Version: 0.1.0
-Summary: Automated fuzzy time series forecasting library
-Home-page: https://github.com/jtimko16/AutoFuzzTS
-Author: Jan Timko
-Author-email: Jan Timko <jantimko16@gmail.com>
-License: MIT
-Project-URL: Homepage, https://github.com/jtimko16/AutoFuzzTS
-Project-URL: Bug Tracker, https://github.com/jtimko16/AutoFuzzTS/issues
-Requires-Python: >=3.11
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: numpy>=1.26.0
-Requires-Dist: pandas>=2.2.0
-Requires-Dist: scikit-learn>=1.5.0
-Requires-Dist: scipy>=1.15.0
-Requires-Dist: xgboost>=3.0.0
-Requires-Dist: lightgbm>=4.6.0
-Requires-Dist: tpot>=1.0.0
-Requires-Dist: optuna>=4.3.0
-Requires-Dist: matplotlib>=3.10.0
-Requires-Dist: seaborn>=0.13.0
-Requires-Dist: requests>=2.32.0
-Requires-Dist: PyYAML>=6.0.0
-Requires-Dist: joblib>=1.4.0
-Requires-Dist: tqdm>=4.67.0
-Dynamic: author
-Dynamic: home-page
-Dynamic: license-file
-Dynamic: requires-python
-
-# AutoFuzzTS
-
-Automated fuzzy time series forecasting library in Python.
-Build and evaluate time series models automatically using fuzzy logic and AutoML techniques.
-
-## Installation
-
-```bash
-pip install autofuzzts
{autofuzzts-0.1.0.dist-info → autofuzzts-0.1.2.dist-info}/WHEEL
File without changes
{autofuzzts-0.1.0.dist-info → autofuzzts-0.1.2.dist-info}/licenses/LICENSE
File without changes
{autofuzzts-0.1.0.dist-info → autofuzzts-0.1.2.dist-info}/top_level.txt
File without changes