lecrapaud-0.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lecrapaud might be problematic.

Files changed (60)
  1. lecrapaud/__init__.py +0 -0
  2. lecrapaud/config.py +16 -0
  3. lecrapaud/db/__init__.py +0 -0
  4. lecrapaud/db/alembic/README +1 -0
  5. lecrapaud/db/alembic/env.py +78 -0
  6. lecrapaud/db/alembic/script.py.mako +26 -0
  7. lecrapaud/db/alembic/versions/2025_04_06_1738-7390745388e4_initial_setup.py +295 -0
  8. lecrapaud/db/alembic/versions/2025_04_06_1755-40cd8d3e798e_unique_constraint_for_data.py +30 -0
  9. lecrapaud/db/alembic/versions/2025_05_23_1724-2360941fa0bd_longer_string.py +52 -0
  10. lecrapaud/db/alembic/versions/2025_05_27_1159-b96396dcfaff_add_env_to_trading_tables.py +34 -0
  11. lecrapaud/db/alembic/versions/2025_05_27_1337-40cbfc215f7c_fix_nb_character_on_portfolio.py +39 -0
  12. lecrapaud/db/alembic/versions/2025_05_27_1526-3de994115317_to_datetime.py +36 -0
  13. lecrapaud/db/alembic/versions/2025_05_27_2003-25c227c684f8_add_fees_to_transactions.py +30 -0
  14. lecrapaud/db/alembic/versions/2025_05_27_2047-6b6f2d38e9bc_double_instead_of_float.py +132 -0
  15. lecrapaud/db/alembic/versions/2025_05_31_1111-c175e4a36d68_generalise_stock_to_group.py +36 -0
  16. lecrapaud/db/alembic/versions/2025_05_31_1256-5681095bfc27_create_investment_run_and_portfolio_.py +62 -0
  17. lecrapaud/db/alembic/versions/2025_05_31_1806-339927587383_add_investment_run_id.py +107 -0
  18. lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +38 -0
  19. lecrapaud/db/alembic/versions/2025_05_31_1849-3b8550297e8e_change_date_to_datetime.py +44 -0
  20. lecrapaud/db/alembic/versions/2025_05_31_1852-e6b8c95d8243_add_date_to_portfolio_history.py +30 -0
  21. lecrapaud/db/alembic/versions/2025_06_10_1136-db8cdd83563a_addnewsandoptiontodata.py +32 -0
  22. lecrapaud/db/crud.py +179 -0
  23. lecrapaud/db/models/__init__.py +11 -0
  24. lecrapaud/db/models/base.py +6 -0
  25. lecrapaud/db/models/dataset.py +124 -0
  26. lecrapaud/db/models/feature.py +46 -0
  27. lecrapaud/db/models/feature_selection.py +126 -0
  28. lecrapaud/db/models/feature_selection_rank.py +80 -0
  29. lecrapaud/db/models/model.py +41 -0
  30. lecrapaud/db/models/model_selection.py +56 -0
  31. lecrapaud/db/models/model_training.py +54 -0
  32. lecrapaud/db/models/score.py +62 -0
  33. lecrapaud/db/models/target.py +59 -0
  34. lecrapaud/db/services.py +0 -0
  35. lecrapaud/db/setup.py +58 -0
  36. lecrapaud/directory_management.py +28 -0
  37. lecrapaud/feature_engineering.py +1119 -0
  38. lecrapaud/feature_selection.py +1229 -0
  39. lecrapaud/jobs/__init__.py +13 -0
  40. lecrapaud/jobs/config.py +17 -0
  41. lecrapaud/jobs/scheduler.py +36 -0
  42. lecrapaud/jobs/tasks.py +57 -0
  43. lecrapaud/model_selection.py +1571 -0
  44. lecrapaud/predictions.py +292 -0
  45. lecrapaud/search_space.py +844 -0
  46. lecrapaud/services/__init__.py +0 -0
  47. lecrapaud/services/embedding_categorical.py +71 -0
  48. lecrapaud/services/indicators.py +309 -0
  49. lecrapaud/speed_tests/experiments.py +139 -0
  50. lecrapaud/speed_tests/test-gpu-bilstm.ipynb +261 -0
  51. lecrapaud/speed_tests/test-gpu-resnet.ipynb +166 -0
  52. lecrapaud/speed_tests/test-gpu-transformers.ipynb +254 -0
  53. lecrapaud/speed_tests/tests.ipynb +145 -0
  54. lecrapaud/speed_tests/trash.py +37 -0
  55. lecrapaud/training.py +151 -0
  56. lecrapaud/utils.py +246 -0
  57. lecrapaud-0.4.0.dist-info/LICENSE +201 -0
  58. lecrapaud-0.4.0.dist-info/METADATA +103 -0
  59. lecrapaud-0.4.0.dist-info/RECORD +60 -0
  60. lecrapaud-0.4.0.dist-info/WHEEL +4 -0
lecrapaud/feature_selection.py
@@ -0,0 +1,1229 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from datetime import timedelta
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ import os
7
+ import time
8
+ from typing import Optional
9
+ from tqdm import tqdm
10
+ import warnings
11
+ from concurrent.futures import ProcessPoolExecutor, as_completed
12
+ import joblib
13
+ import re
14
+ from pathlib import Path
15
+
16
+ os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
17
+
18
+ # feature selection
19
+ from sklearn.feature_selection import (
20
+ f_classif,
21
+ f_regression,
22
+ mutual_info_classif,
23
+ mutual_info_regression,
24
+ chi2,
25
+ SelectPercentile,
26
+ SelectFpr,
27
+ RFE,
28
+ SelectFromModel,
29
+ )
30
+ from sklearn.decomposition import PCA
31
+ from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
32
+ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
33
+ from sklearn.model_selection import TimeSeriesSplit
34
+ from sklearn.metrics import root_mean_squared_error, log_loss, make_scorer
35
+ from mlxtend.feature_selection import SequentialFeatureSelector
36
+ from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
37
+ from sklearn.compose import ColumnTransformer
38
+ import category_encoders as ce
39
+ from scipy.stats import spearmanr, kendalltau
40
+
41
+ # Scaling
42
+ from sklearn.preprocessing import StandardScaler, MinMaxScaler
43
+
44
+ # Internal
45
+ from lecrapaud.directory_management import tmp_dir, clean_directory
46
+ from lecrapaud.utils import logger
47
+ from lecrapaud.config import PYTHON_ENV
48
+ from lecrapaud.db.models import (
49
+ Dataset,
50
+ Target,
51
+ Feature,
52
+ FeatureSelection,
53
+ FeatureSelectionRank,
54
+ )
55
+ from lecrapaud.db.setup import get_db
56
+
57
+ # Variables for targets handling
58
+ TARGETS_NUMBER = range(1, 15)
59
+ TARGETS_CLF = [2, 4, 6, 8, 9, 10, 11]
60
+ TARGETS_MCLF = [11]
61
+ GROUPING_COLUMN = "STOCK"
62
+ DATE_COLUMN = "DATE"
63
+
64
+ # Annoying Warnings
65
+ warnings.filterwarnings("ignore", category=FutureWarning)
66
+
67
+
68
+ def get_dataset_name(
69
+ df, corr_threshold: int = 80, percentile: int = 20, max_features: int = 20
70
+ ):
71
+ number_of_groups = df[GROUPING_COLUMN].nunique()
72
+
73
+ # Try to convert DATE column to datetime safely
74
+ if pd.api.types.is_integer_dtype(df[DATE_COLUMN]):
75
+ df_date = df[DATE_COLUMN].map(pd.Timestamp.fromordinal)
76
+ else:
77
+ df_date = pd.to_datetime(
78
+ df[DATE_COLUMN], errors="coerce"
79
+ ) # convert strings, datetime, etc.
80
+
81
+ name = f"data_{number_of_groups}_{corr_threshold}_{percentile}_{max_features}_{df_date.min().date()}_{df_date.max().date()}"
82
+ if PYTHON_ENV == "Test":
83
+ name = f"test_{name}"
84
+ return name
85
+
86
+
87
+ def create_sets_from_data(
88
+ df: pd.DataFrame,
89
+ corr_threshold: int = 80,
90
+ percentile: int = 20,
91
+ max_features: int = 20,
92
+ ):
93
+
94
+ df.sort_values([DATE_COLUMN, GROUPING_COLUMN], inplace=True)
95
+
96
+ # Drop non-useful column for training
97
+ if "ISIN" in df.columns:
98
+ df.drop(labels=["ISIN"], axis=1, inplace=True)
99
+ if "SECURITY" in df.columns:
100
+ df.drop(labels=["SECURITY"], axis=1, inplace=True)
101
+
102
+ dates = df[DATE_COLUMN].unique()
103
+
104
+ val_first_id = int(len(dates) * 0.6) + 1
105
+ test_first_id = int(len(dates) * 0.8) + 1
106
+
107
+ train = df[df[DATE_COLUMN].isin(dates[:val_first_id])]
108
+ val = df[df[DATE_COLUMN].isin(dates[val_first_id:test_first_id])]
109
+ test = df[df[DATE_COLUMN].isin(dates[test_first_id:])]
110
+
111
+ dates = {}
112
+ dates["start_date"] = pd.to_datetime(df[DATE_COLUMN].iat[0])
113
+ dates["end_date"] = pd.to_datetime(df[DATE_COLUMN].iat[-1])
114
+ for name, data in zip(["train", "val", "test"], [train, val, test]):
115
+ dates[f"{name}_start_date"] = pd.to_datetime(data[DATE_COLUMN].iat[0])
116
+ dates[f"{name}_end_date"] = pd.to_datetime(data[DATE_COLUMN].iat[-1])
117
+
118
+ logger.info(
119
+ f"{len(data['DATE'])} {name} data from {dates[f"{name}_start_date"].strftime('%d/%m/%Y')} to {dates[f"{name}_end_date"].strftime('%d/%m/%Y')}"
120
+ )
121
+
122
+ datasets = {}
123
+
124
+ with get_db() as db:
125
+ all_targets = Target.get_all(db=db)
126
+ matched_targets = [
127
+ target for target in all_targets if target.name in train.columns
128
+ ]
129
+ dataset_name = get_dataset_name(train, corr_threshold, percentile, max_features)
130
+ dataset_dir = f"{tmp_dir}/{dataset_name}"
131
+ preprocessing_dir = f"{dataset_dir}/preprocessing"
132
+ train_data_dir = f"{dataset_dir}/data"
133
+ os.makedirs(dataset_dir, exist_ok=True)
134
+ os.makedirs(preprocessing_dir, exist_ok=True)
135
+ os.makedirs(train_data_dir, exist_ok=True)
136
+
137
+ dataset = datasets[name] = Dataset.upsert(
138
+ match_fields=["name"],
139
+ db=db,
140
+ name=dataset_name,
141
+ path=Path(dataset_dir).resolve(),
142
+ type="training",
143
+ size=df.shape[0],
144
+ train_size=train.shape[0],
145
+ val_size=val.shape[0],
146
+ test_size=test.shape[0],
147
+ number_of_groups=data[GROUPING_COLUMN].nunique(),
148
+ list_of_groups=data[GROUPING_COLUMN].unique().tolist(),
149
+ corr_threshold=corr_threshold,
150
+ percentile=percentile,
151
+ max_features=max_features,
152
+ **dates,
153
+ targets=matched_targets,
154
+ )
155
+
156
+ # encode categoricals
157
+ train = encode_categorical_features(train, fit=True, save_dir=preprocessing_dir)
158
+ val = encode_categorical_features(val, save_dir=preprocessing_dir)
159
+ test = encode_categorical_features(test, save_dir=preprocessing_dir)
160
+
161
+ # save the full data
162
+ if PYTHON_ENV != "Test":
163
+ joblib.dump(df, f"{train_data_dir}/full.pkl")
164
+
165
+ return train, val, test, dataset
166
+
167
+
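The split above is purely chronological: unique dates are ordered and cut at roughly 60% and 80%. A minimal sketch of that logic on a toy frame (the DATE/STOCK column names match the module's constants; the values are illustrative):

# Minimal sketch of the chronological 60/20/20 split used by create_sets_from_data.
import pandas as pd

df = pd.DataFrame({
    "DATE": pd.date_range("2024-01-01", periods=10).repeat(2),
    "STOCK": ["A", "B"] * 10,
    "FEATURE_1": range(20),
})

dates = df["DATE"].unique()
val_first_id = int(len(dates) * 0.6) + 1   # first validation date index
test_first_id = int(len(dates) * 0.8) + 1  # first test date index

train = df[df["DATE"].isin(dates[:val_first_id])]
val = df[df["DATE"].isin(dates[val_first_id:test_first_id])]
test = df[df["DATE"].isin(dates[test_first_id:])]

print(len(train), len(val), len(test))  # 14 4 2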
168
+ def encode_categorical_features(df: pd.DataFrame, save_dir: str, fit: bool = False):
169
+
170
+ X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
171
+ y = df.loc[:, df.columns.str.contains("^TARGET_")]
172
+
173
+ # 1. Timestamps for 'DATE'
174
+ X.loc[:, DATE_COLUMN] = pd.to_datetime(X[DATE_COLUMN]).map(pd.Timestamp.toordinal)
175
+
176
+ if fit:
177
+ # Define columns for ordinal and binary encoding (the training set should contain all possible values, unless we accept handling of unknown values)
178
+ ordinal_encoding_features = ["STOCK"]
179
+
180
+ binary_encoding_features = ["SECTOR", "SUBINDUSTRY", "LOCATION"]
181
+
182
+ # Fit and save the ColumnTransformer with OrdinalEncoder and OneHotEncoder
183
+ column_transformer = ColumnTransformer(
184
+ transformers=[
185
+ (
186
+ "ordinal",
187
+ OrdinalEncoder(
188
+ handle_unknown="use_encoded_value",
189
+ unknown_value=-1, # rows with unseen STOCK values will be encoded as -1
190
+ ),
191
+ ordinal_encoding_features,
192
+ ),
193
+ (
194
+ "binary_encoder",
195
+ ce.BinaryEncoder(
196
+ handle_unknown="value",
197
+ ), # rows with unseen values will be encoded as all-zeros in the binary columns
198
+ binary_encoding_features,
199
+ ),
200
+ ],
201
+ remainder="passthrough", # Keep the non-encoded columns like 'DATE'
202
+ )
203
+ transformed_data = column_transformer.fit_transform(X)
204
+ if PYTHON_ENV != "Test":
205
+ joblib.dump(column_transformer, f"{save_dir}/column_transformer.pkl")
206
+ else:
207
+ # Load the ColumnTransformer and apply it
208
+ column_transformer = joblib.load(f"{save_dir}/column_transformer.pkl")
209
+
210
+ transformed_data = column_transformer.transform(X)
211
+
212
+ # Convert to DataFrame for readability and return
213
+ transformed_X = pd.DataFrame(
214
+ transformed_data,
215
+ columns=[
216
+ feature.split("__")[1]
217
+ for feature in column_transformer.get_feature_names_out()
218
+ ],
219
+ index=X.index,
220
+ )
221
+ transformed_X = transformed_X.apply(pd.to_numeric)
222
+ for col in [
223
+ feature.split("__")[1]
224
+ for feature in column_transformer.get_feature_names_out()
225
+ if "remainder" not in feature
226
+ ] + [DATE_COLUMN]:
227
+ transformed_X[col] = transformed_X[col].astype(int)
228
+
229
+ # Insert features in db
230
+ if fit:
231
+ # TODO: in bulk
232
+ for feature in transformed_X.columns:
233
+ dtype = transformed_X[feature].dtype
234
+ if pd.api.types.is_integer_dtype(dtype):
235
+ feature_type = "categorical"
236
+ elif pd.api.types.is_float_dtype(dtype):
237
+ feature_type = "numerical"
238
+ else:
239
+ feature_type = "other"
240
+ Feature.upsert(match_fields=["name"], name=feature, type=feature_type)
241
+ for target in y.columns:
242
+ type = (
243
+ "classification"
244
+ if int(target.split("_")[1]) in TARGETS_CLF
245
+ else "regression"
246
+ )
247
+ # TODO: what about description here ?
248
+ Target.upsert(match_fields=["name", "type"], name=target, type=type)
249
+
250
+ return pd.concat([transformed_X, y], axis=1)
251
+
252
+
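A minimal, self-contained sketch of the fit-once / reuse-later encoder pattern used above, with the same sklearn and category_encoders APIs; the toy column values are illustrative:

# Sketch of the ColumnTransformer pattern in encode_categorical_features:
# OrdinalEncoder for STOCK, category_encoders.BinaryEncoder for other categoricals.
import pandas as pd
import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

train = pd.DataFrame({"STOCK": ["AAPL", "MSFT"], "SECTOR": ["Tech", "Tech"], "PRICE": [1.0, 2.0]})
new = pd.DataFrame({"STOCK": ["NVDA"], "SECTOR": ["Tech"], "PRICE": [3.0]})

ct = ColumnTransformer(
    transformers=[
        ("ordinal", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), ["STOCK"]),
        ("binary_encoder", ce.BinaryEncoder(handle_unknown="value"), ["SECTOR"]),
    ],
    remainder="passthrough",  # keep numeric columns such as PRICE untouched
)
ct.fit(train)
print(ct.transform(new))  # unseen STOCK is encoded as -1; PRICE passes through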
253
+ # only works with all features from feature engineering, in the right order (unused for now)
254
+ def decode_categorical_features(df: pd.DataFrame, save_dir: str):
255
+ X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
256
+ y = df.loc[:, df.columns.str.contains("^TARGET_")]
257
+ index = X.index
258
+ original_dtypes = X.dtypes.to_dict()
259
+
260
+ column_transformer = joblib.load(f"{save_dir}/column_transformer.pkl")
261
+
262
+ X = X.to_numpy()
263
+ arrays = []
264
+ for name, indices in column_transformer.output_indices_.items():
265
+ transformer = column_transformer.named_transformers_.get(name, None)
266
+ arr = X[:, indices.start : indices.stop]
267
+
268
+ if transformer in (None, "passthrough", "drop"):
269
+ pass
270
+
271
+ else:
272
+ arr = transformer.inverse_transform(arr)
273
+
274
+ arrays.append(arr)
275
+
276
+ retarr = np.concatenate(arrays, axis=1)
277
+
278
+ columns_ordinal = [
279
+ feature.split("__")[1]
280
+ for feature in column_transformer.get_feature_names_out()
281
+ if feature.split("__")[0] == "ordinal"
282
+ ]
283
+ columns_binary_encoder = [
284
+ feature.split("__")[1]
285
+ for feature in column_transformer.get_feature_names_out()
286
+ if feature.split("__")[0] == "binary_encoder"
287
+ ]
288
+ # Remove trailing "_number" using regex
289
+ columns_binary_encoder = {
290
+ re.sub(r"_\d+$", "", col) for col in columns_binary_encoder
291
+ }
292
+ columns_binary_encoder = list(columns_binary_encoder)
293
+
294
+ columns_remainder = [
295
+ feature.split("__")[1]
296
+ for feature in column_transformer.get_feature_names_out()
297
+ if feature.split("__")[0] == "remainder"
298
+ ]
299
+ columns = columns_ordinal + columns_binary_encoder + columns_remainder
300
+ decoded_X = pd.DataFrame(
301
+ retarr,
302
+ columns=columns,
303
+ index=index,
304
+ )
305
+
306
+ for col in decoded_X.columns:
307
+ if col in columns_ordinal or col in columns_binary_encoder:
308
+ decoded_X[col] = decoded_X[col].astype(str)
309
+ elif col in original_dtypes:
310
+ decoded_X[col] = decoded_X[col].astype(original_dtypes[col])
311
+
312
+ # revert timestamps to dates
313
+ decoded_X.loc[:, DATE_COLUMN] = decoded_X[DATE_COLUMN].map(pd.Timestamp.fromordinal)
314
+
315
+ return pd.concat([decoded_X, y], axis=1)
316
+
317
+
318
+ # Filter methods
319
+ # ----------------
320
+
321
+
322
+ # Linear correlation (Pearson's R for regression and ANOVA for classification)
323
+ def select_feature_by_linear_correlation(
324
+ X, y, target_type, percentile: int = 20, save_dir: Optional[str] = None
325
+ ):
326
+ start = time.time()
327
+ test_type = "Person’s R" if target_type == "regression" else "ANOVA"
328
+ logger.debug(f"Running {test_type}...")
329
+
330
+ model = f_regression if target_type == "regression" else f_classif
331
+ feat_selector = SelectPercentile(model, percentile=percentile).fit(X, y)
332
+ feat_scores = pd.DataFrame()
333
+ feat_scores["score"] = feat_selector.scores_
334
+ feat_scores["pvalue"] = feat_selector.pvalues_
335
+ feat_scores["support"] = feat_selector.get_support()
336
+ feat_scores["features"] = X.columns
337
+ feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
338
+ feat_scores["method"] = test_type
339
+ feat_scores.sort_values("rank", ascending=True, inplace=True)
340
+ stop = time.time()
341
+ training_time = timedelta(seconds=(stop - start)).total_seconds()
342
+ feat_scores["training_time"] = training_time
343
+
344
+ logger.debug(
345
+ f"{test_type} evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
346
+ )
347
+
348
+ feat_scores.to_csv(
349
+ f"{save_dir}/{test_type}.csv",
350
+ index=True,
351
+ header=True,
352
+ index_label="ID",
353
+ )
354
+
355
+ return feat_scores
356
+
357
+
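The filter above is a thin wrapper around sklearn's SelectPercentile with f_regression / f_classif. A minimal sketch on synthetic data (the F_i feature names are illustrative):

# Sketch of the SelectPercentile filter step on synthetic regression data.
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectPercentile, f_regression

X, y = make_regression(n_samples=200, n_features=50, n_informative=5, random_state=42)
X = pd.DataFrame(X, columns=[f"F_{i}" for i in range(X.shape[1])])

selector = SelectPercentile(f_regression, percentile=20).fit(X, y)
scores = pd.DataFrame({
    "features": X.columns,
    "score": selector.scores_,
    "pvalue": selector.pvalues_,
    "support": selector.get_support(),
})
print(scores.sort_values("score", ascending=False).head(10))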
358
+ # Non-linear correlation (Spearman's R for regression and Kendall’s Tau for classification)
359
+ def select_feature_by_nonlinear_correlation(
360
+ X, y, target_type, percentile: int = 20, save_dir: Optional[str] = None
361
+ ):
362
+ start = time.time()
363
+
364
+ def model(X_model, y_model):
365
+ X_model = pd.DataFrame(X_model)
366
+ y_model = pd.Series(y_model)
367
+
368
+ method = "spearman" if target_type == "regression" else "kendall"
369
+
370
+ corr_scores = []
371
+ p_values = []
372
+
373
+ for col in X_model.columns:
374
+ if method == "spearman":
375
+ corr, pval = spearmanr(X_model[col], y_model)
376
+ else: # Kendall's Tau for classification
377
+ corr, pval = kendalltau(X_model[col], y_model)
378
+
379
+ corr_scores.append(abs(corr)) # Keeping absolute correlation
380
+ p_values.append(pval)
381
+
382
+ return np.array(corr_scores), np.array(p_values)
383
+
384
+ test_type = "Spearman’s R" if target_type == "regression" else "Kendall’s Tau"
385
+ logger.debug(f"Running {test_type}...")
386
+
387
+ feat_selector = SelectPercentile(model, percentile=percentile).fit(X, y)
388
+ feat_scores = pd.DataFrame()
389
+ feat_scores["score"] = feat_selector.scores_
390
+ feat_scores["pvalue"] = feat_selector.pvalues_
391
+ feat_scores["support"] = feat_selector.get_support()
392
+ feat_scores["features"] = X.columns
393
+ feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
394
+ feat_scores["method"] = test_type
395
+ feat_scores.sort_values("rank", ascending=True, inplace=True)
396
+ stop = time.time()
397
+ training_time = timedelta(seconds=(stop - start)).total_seconds()
398
+ feat_scores["training_time"] = training_time
399
+
400
+ logger.debug(
401
+ f"{test_type} evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
402
+ )
403
+
404
+ feat_scores.to_csv(
405
+ f"{save_dir}/{test_type}.csv",
406
+ index=True,
407
+ header=True,
408
+ index_label="ID",
409
+ )
410
+
411
+ return feat_scores
412
+
413
+
414
+ # Mutual Information
415
+ def select_feature_by_mi(
416
+ X, y, target_type, percentile: int = 20, save_dir: Optional[str] = None
417
+ ):
418
+ start = time.time()
419
+ logger.debug("Running Mutual Information...")
420
+ model = (
421
+ mutual_info_regression if target_type == "regression" else mutual_info_classif
422
+ )
423
+ feat_selector = SelectPercentile(model, percentile=percentile).fit(X, y)
424
+ feat_scores = pd.DataFrame()
425
+ feat_scores["score"] = feat_selector.scores_
426
+ feat_scores["support"] = feat_selector.get_support()
427
+ feat_scores["features"] = X.columns
428
+ feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
429
+ feat_scores["method"] = "Mutual Information"
430
+ feat_scores.sort_values("rank", ascending=True, inplace=True)
431
+ stop = time.time()
432
+ training_time = timedelta(seconds=(stop - start)).total_seconds()
433
+ feat_scores["training_time"] = training_time
434
+
435
+ logger.debug(
436
+ f"MI evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
437
+ )
438
+
439
+ feat_scores.to_csv(f"{save_dir}/MI.csv", index=True, header=True, index_label="ID")
440
+
441
+ return feat_scores
442
+
443
+
444
+ def select_categorical_features(X, y, percentile, save_dir: Optional[str] = None):
445
+ start = time.time()
446
+ logger.debug("Running Chi2 for categorical features...")
447
+ feat_selector = SelectPercentile(chi2, percentile=percentile).fit(X, y)
448
+ feat_scores = pd.DataFrame()
449
+ feat_scores["score"] = feat_selector.scores_
450
+ feat_scores["pvalue"] = feat_selector.pvalues_
451
+ feat_scores["support"] = feat_selector.get_support()
452
+ feat_scores["features"] = X.columns
453
+ feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
454
+ feat_scores["method"] = "Chi2"
455
+ feat_scores.sort_values("rank", ascending=True, inplace=True)
456
+ stop = time.time()
457
+ training_time = timedelta(seconds=(stop - start)).total_seconds()
458
+ feat_scores["training_time"] = training_time
459
+
460
+ logger.debug(
461
+ f"Chi2 evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
462
+ )
463
+
464
+ feat_scores.to_csv(
465
+ f"{save_dir}/Chi2.csv", index=True, header=True, index_label="ID"
466
+ )
467
+
468
+ return feat_scores
469
+
470
+
471
+ # Intrinsic/embedded method
472
+ # ----------------
473
+
474
+
475
+ # feature importance
476
+ def select_feature_by_feat_imp(
477
+ X, y, target_type, percentile: int = 20, save_dir: Optional[str] = None
478
+ ):
479
+ start = time.time()
480
+ logger.debug("Running Feature importance...")
481
+
482
+ params = {"n_estimators": 500, "max_depth": 2**3, "random_state": 42, "n_jobs": -1}
483
+
484
+ estimator = (
485
+ RandomForestClassifier(**params)
486
+ if target_type == "classification"
487
+ else RandomForestRegressor(**params)
488
+ )
489
+
490
+ feat_selector = SelectFromModel(
491
+ estimator=estimator,
492
+ threshold=-np.inf,
493
+ max_features=int(percentile * X.shape[1] / 100),
494
+ ).fit(X, y)
495
+
496
+ feat_scores = pd.DataFrame()
497
+ feat_scores["score"] = feat_selector.estimator_.feature_importances_
498
+ feat_scores["support"] = feat_selector.get_support()
499
+ feat_scores["features"] = X.columns
500
+ feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
501
+ feat_scores["method"] = "FI"
502
+ feat_scores.sort_values("rank", ascending=True, inplace=True)
503
+
504
+ stop = time.time()
505
+ training_time = timedelta(seconds=(stop - start)).total_seconds()
506
+ feat_scores["training_time"] = training_time
507
+
508
+ logger.debug(
509
+ f"Feat importance evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
510
+ )
511
+
512
+ feat_scores.to_csv(f"{save_dir}/FI.csv", index=True, header=True, index_label="ID")
513
+
514
+ return feat_scores
515
+
516
+
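The embedded method above relies on SelectFromModel with threshold=-inf so that exactly max_features features are kept, ranked by random-forest importance. A minimal sketch on synthetic data:

# Sketch of SelectFromModel used as a "top-k by importance" selector.
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

X, y = make_classification(n_samples=300, n_features=40, n_informative=6, random_state=42)
X = pd.DataFrame(X, columns=[f"F_{i}" for i in range(X.shape[1])])

selector = SelectFromModel(
    estimator=RandomForestClassifier(n_estimators=100, max_depth=8, random_state=42, n_jobs=-1),
    threshold=-np.inf,   # disable the importance threshold...
    max_features=8,      # ...and keep the top-k features instead
).fit(X, y)

print(X.columns[selector.get_support()].tolist())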
517
+ # Wrapper method
518
+ # ----------------
519
+
520
+
521
+ # recursive feature elimination
522
+ def select_feature_by_rfe(
523
+ X, y, target_type, percentile: int = 20, save_dir: Optional[str] = None
524
+ ):
525
+ start = time.time()
526
+ logger.debug("Running Recursive Feature Elimination...")
527
+
528
+ params = {
529
+ "max_depth": 2**3,
530
+ "random_state": 42,
531
+ }
532
+ estimator = (
533
+ DecisionTreeClassifier(**params)
534
+ if target_type == "classification"
535
+ else DecisionTreeRegressor(**params)
536
+ )
537
+ rfe = RFE(estimator, n_features_to_select=percentile / 100, step=4, verbose=0)
538
+ feat_selector = rfe.fit(X, y)
539
+
540
+ feat_scores = pd.DataFrame(
541
+ {
542
+ "score": 0.0, # Default feature importance
543
+ "support": feat_selector.get_support(),
544
+ "features": X.columns,
545
+ "rank": 0,
546
+ "method": "RFE",
547
+ }
548
+ )
549
+ feat_scores.loc[
550
+ feat_scores["features"].isin(feat_selector.get_feature_names_out()), "score"
551
+ ] = list(feat_selector.estimator_.feature_importances_)
552
+ feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
553
+ feat_scores.sort_values("rank", ascending=True, inplace=True)
554
+
555
+ stop = time.time()
556
+ training_time = timedelta(seconds=(stop - start)).total_seconds()
557
+ feat_scores["training_time"] = training_time
558
+
559
+ logger.debug(
560
+ f"RFE evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
561
+ )
562
+
563
+ feat_scores.to_csv(f"{save_dir}/RFE.csv", index=True, header=True, index_label="ID")
564
+
565
+ return feat_scores
566
+
567
+
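A minimal sketch of the RFE wrapper above: a shallow decision tree, a float n_features_to_select interpreted as a fraction, and four features eliminated per round (synthetic data, illustrative names):

# Sketch of recursive feature elimination with a shallow decision tree.
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor

X, y = make_regression(n_samples=300, n_features=30, n_informative=5, random_state=42)
X = pd.DataFrame(X, columns=[f"F_{i}" for i in range(X.shape[1])])

rfe = RFE(
    DecisionTreeRegressor(max_depth=8, random_state=42),
    n_features_to_select=0.2,  # a float is read as a fraction of the features
    step=4,                    # drop 4 features per elimination round
).fit(X, y)

print(X.columns[rfe.support_].tolist())
print(rfe.ranking_[:10])  # 1 marks a selected feature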
568
+ # SequentialFeatureSelector (loss-based; can run forward or backward selection, with optional floating removal)
569
+ def select_feature_by_sfs(
570
+ X, y, target_type, percentile: int = 20, save_dir: Optional[str] = None
571
+ ):
572
+ start = time.time()
573
+ logger.debug("Running Sequential Feature Selection...")
574
+ warnings.filterwarnings("ignore", category=FutureWarning)
575
+
576
+ params = {
577
+ "max_depth": 2**3,
578
+ "random_state": 42,
579
+ }
580
+ estimator = (
581
+ DecisionTreeClassifier(**params)
582
+ if target_type == "classification"
583
+ else DecisionTreeRegressor(**params)
584
+ )
585
+
586
+ n_splits = 3
587
+ n_samples = len(X)
588
+ test_size = int(n_samples / (n_splits + 4))
589
+ tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
590
+
591
+ score_function = (
592
+ make_scorer(
593
+ log_loss, response_method="predict_proba"
594
+ ) # logloss needs probabilities
595
+ if target_type == "classification"
596
+ else make_scorer(root_mean_squared_error)
597
+ ) # we avoid greater_is_better=False because it makes the score negative and messes up the ranking
598
+
599
+ sfs = SequentialFeatureSelector(
600
+ estimator,
601
+ k_features=int(percentile * X.shape[1] / 100),
602
+ forward=True,
603
+ floating=True, # Enables dynamic feature elimination
604
+ scoring=score_function,
605
+ cv=tscv,
606
+ n_jobs=-1,
607
+ verbose=0,
608
+ )
609
+
610
+ feat_selector = sfs.fit(X, y)
611
+
612
+ # Extract selected features and their scores
613
+ selected_features = set(feat_selector.k_feature_names_)
614
+ feat_subsets = feat_selector.subsets_
615
+
616
+ # Create DataFrame for feature scores
617
+ feat_scores = pd.DataFrame(
618
+ {
619
+ "features": X.columns,
620
+ "support": X.columns.isin(
621
+ selected_features
622
+ ), # TODO: understand why the support is not correct (the best scores are not always selected)
623
+ "score": 1000,
624
+ "rank": None,
625
+ "method": "SFS",
626
+ }
627
+ )
628
+
629
+ # Sort subsets by score (lower is better)
630
+ sorted_subsets = sorted(feat_subsets.items(), key=lambda item: item[1]["avg_score"])
631
+
632
+ # Record score per feature (first appearance)
633
+ feature_score_map = {}
634
+ for step in sorted_subsets:
635
+ step = step[1]
636
+ for feature in step["feature_names"]:
637
+ if feature not in feature_score_map:
638
+ feature_score_map[feature] = step["avg_score"]
639
+
640
+ # Assign scores
641
+ for feature, score in feature_score_map.items():
642
+ feat_scores.loc[feat_scores["features"] == feature, "score"] = score
643
+
644
+ # rank by score (lower = better)
645
+ feat_scores["rank"] = (
646
+ feat_scores["score"].rank(method="first", ascending=True).astype(int)
647
+ )
648
+
649
+ feat_scores.sort_values("rank", ascending=True, inplace=True)
650
+
651
+ stop = time.time()
652
+ training_time = timedelta(seconds=(stop - start)).total_seconds()
653
+ feat_scores["training_time"] = training_time
654
+
655
+ logger.debug(
656
+ f"SFS evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
657
+ )
658
+
659
+ feat_scores.to_csv(f"{save_dir}/SFS.csv", index=True, header=True, index_label="ID")
660
+
661
+ return feat_scores
662
+
663
+
664
+ # Remove correlation
665
+ # ------------------
666
+
667
+
668
+ def remove_correlated_features(
669
+ X: pd.DataFrame, features: list, corr_threshold: int, vizualize: bool = False
670
+ ):
671
+ # Create correlation matrix, select upper triangle & remove features with correlation greater than threshold
672
+ corr_matrix = X[features].corr().abs()
673
+
674
+ upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
675
+ features_uncorrelated = [
676
+ column
677
+ for column in upper.columns
678
+ if all(upper[column].dropna() <= corr_threshold / 100)
679
+ ]
680
+ features_correlated = [
681
+ column for column in upper.columns if any(upper[column] > corr_threshold / 100)
682
+ ]
683
+
684
+ if vizualize:
685
+ features_selected_visualization = (
686
+ X[features]
687
+ .corr()
688
+ .where(np.triu(np.ones(len(features)), k=1).astype(bool))
689
+ .fillna(0)
690
+ )
691
+ # Plot the heatmap
692
+ plt.figure(figsize=(10, 8))
693
+ sns.heatmap(
694
+ corr_matrix,
695
+ annot=True,
696
+ cmap="coolwarm",
697
+ center=0,
698
+ linewidths=1,
699
+ linecolor="black",
700
+ )
701
+ plt.title(f"Correlation Matrix")
702
+ plt.show()
703
+
704
+ logger.info(f"\n{features_selected_visualization.describe().to_string()}")
705
+ logger.info(f"\n{features_selected_visualization.to_string()}")
706
+ return features_uncorrelated, features_correlated
707
+
708
+
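The pruning above keeps only the upper triangle of the absolute correlation matrix and drops any column correlated above the threshold with an earlier one. A minimal sketch with three toy columns, two of them nearly identical:

# Sketch of the upper-triangle correlation pruning (threshold given as a percentage).
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
a = rng.normal(size=500)
X = pd.DataFrame({"A": a, "B": a + rng.normal(scale=0.01, size=500), "C": rng.normal(size=500)})

corr_threshold = 90  # percent
corr = X.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))

keep = [c for c in upper.columns if all(upper[c].dropna() <= corr_threshold / 100)]
drop = [c for c in upper.columns if any(upper[c] > corr_threshold / 100)]
print(keep, drop)  # 'B' is dropped because it is almost perfectly correlated with 'A'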
709
+ # Main feature selection function
710
+ def feature_selection(
711
+ dataset_id: int,
712
+ train: pd.DataFrame,
713
+ target_number: int,
714
+ single_process: bool = False,
715
+ ):
716
+ """Function to do feature selection with a range of different feature selection technics
717
+
718
+ Args:
719
+ - train (pd.DataFrame): a pandas train set
720
+ - target_number (int): the target number; targets must be named ``TARGET_{n}``
721
+ - single_process (bool): if True, run all feature selection methods in a single process. If False, run them in parallel.
722
+ """
723
+
724
+ # Create the feature selection in db
725
+ target = Target.find_by(name=f"TARGET_{target_number}")
726
+ dataset = Dataset.get(dataset_id)
727
+ percentile = dataset.percentile
728
+ corr_threshold = dataset.corr_threshold
729
+ max_features = dataset.max_features
730
+
731
+ feature_selection = FeatureSelection.upsert(
732
+ match_fields=["target_id", "dataset_id"],
733
+ target_id=target.id,
734
+ dataset_id=dataset.id,
735
+ )
736
+
737
+ X = train.loc[:, ~train.columns.str.contains("^TARGET_")]
738
+ y = train[f"TARGET_{target_number}"]
739
+
740
+ logger.info(f"Starting feature selection for TARGET_{target_number}...")
741
+
742
+ target_type = "classification" if target_number in TARGETS_CLF else "regression"
743
+
744
+ fs_dir_target = f"{dataset.path}/{y.name}/feature_selection"
745
+ preprocessing_dir = f"{dataset.path}/preprocessing"
746
+ os.makedirs(fs_dir_target, exist_ok=True)
747
+ clean_directory(fs_dir_target)
748
+
749
+ # Let's start by removing extremely correlated features
750
+ # This is needed to reduce the number of features, but also for methods such as ANOVA or chi2 that require independent features
751
+ # TODO: we could also remove low variance features
752
+ features_uncorrelated, features_correlated = remove_correlated_features(
753
+ X, X.columns, 90, vizualize=False
754
+ )
755
+ X = X[features_uncorrelated]
756
+
757
+ logger.debug(
758
+ f"""
759
+ \nWe first have removed {len(features_correlated)} features with correlation greater than 90%
760
+ \nWe are looking to capture {percentile}% of {len(X.columns)} features, i.e. {int(len(X.columns)*percentile/100)} features, with different feature selection methods
761
+ \nWe will then remove features correlated above {corr_threshold}%, keeping the ones with the best ranks
762
+ \nFinally, we will keep only the {max_features} best ranked features
763
+ """
764
+ )
765
+
766
+ start = time.time()
767
+
768
+ # handling categorical features (only if classification)
769
+ categorical_features = X.select_dtypes(include=["int64", "Int64"]).columns.tolist()
770
+ X_categorical = X[categorical_features]
771
+
772
+ if target_type == "classification":
773
+ feat_scores = select_categorical_features(
774
+ X_categorical, y, percentile, save_dir=fs_dir_target
775
+ )
776
+ with get_db() as db:
777
+ for row in feat_scores.itertuples(index=False):
778
+ feature = Feature.find_by(name=row.features, db=db)
779
+ FeatureSelectionRank.upsert(
780
+ ["feature_selection_id", "feature_id", "method"],
781
+ db=db,
782
+ score=row.score,
783
+ pvalue=row.pvalue,
784
+ support=row.support,
785
+ rank=row.rank,
786
+ method=row.method,
787
+ training_time=row.training_time,
788
+ feature_selection_id=feature_selection.id,
789
+ feature_id=feature.id,
790
+ )
791
+ categorical_features_selected = feat_scores[feat_scores["support"] == True][
792
+ "features"
793
+ ].values.tolist()
794
+
795
+ # removing categorical features from X
796
+ numerical_features = list(set(X.columns).difference(set(categorical_features)))
797
+ X_numerical = X[numerical_features]
798
+
799
+ results = []
800
+ if single_process:
801
+ results = [
802
+ select_feature_by_linear_correlation(
803
+ X_numerical, y, target_type, percentile, save_dir=fs_dir_target
804
+ ),
805
+ select_feature_by_nonlinear_correlation(
806
+ X_numerical, y, target_type, percentile, save_dir=fs_dir_target
807
+ ),
808
+ select_feature_by_mi(
809
+ X_numerical, y, target_type, percentile, save_dir=fs_dir_target
810
+ ),
811
+ select_feature_by_feat_imp(
812
+ X_numerical, y, target_type, percentile, save_dir=fs_dir_target
813
+ ),
814
+ select_feature_by_rfe(
815
+ X_numerical, y, target_type, percentile, save_dir=fs_dir_target
816
+ ),
817
+ # select_feature_by_sfs(
818
+ # X_numerical, y, target_type, percentile, save_dir=fs_dir_target
819
+ # ), # TODO: this is taking too long
820
+ ]
821
+ else:
822
+ # Use ProcessPoolExecutor to run tasks in parallel
823
+ with ProcessPoolExecutor() as executor:
824
+ # Submit different functions to be executed in parallel
825
+ futures = [
826
+ executor.submit(
827
+ select_feature_by_linear_correlation,
828
+ X_numerical,
829
+ y,
830
+ target_type,
831
+ percentile,
832
+ save_dir=fs_dir_target,
833
+ ),
834
+ executor.submit(
835
+ select_feature_by_nonlinear_correlation,
836
+ X_numerical,
837
+ y,
838
+ target_type,
839
+ percentile,
840
+ save_dir=fs_dir_target,
841
+ ),
842
+ executor.submit(
843
+ select_feature_by_mi,
844
+ X_numerical,
845
+ y,
846
+ target_type,
847
+ percentile,
848
+ save_dir=fs_dir_target,
849
+ ),
850
+ executor.submit(
851
+ select_feature_by_feat_imp,
852
+ X_numerical,
853
+ y,
854
+ target_type,
855
+ percentile,
856
+ save_dir=fs_dir_target,
857
+ ),
858
+ executor.submit(
859
+ select_feature_by_rfe,
860
+ X_numerical,
861
+ y,
862
+ target_type,
863
+ percentile,
864
+ save_dir=fs_dir_target,
865
+ ),
866
+ executor.submit(
867
+ select_feature_by_sfs,
868
+ X_numerical,
869
+ y,
870
+ target_type,
871
+ percentile,
872
+ save_dir=fs_dir_target,
873
+ ),
874
+ ]
875
+
876
+ # Wait for all futures to complete and gather the results
877
+ with tqdm(total=len(futures)) as pbar:
878
+ for future in as_completed(futures):
879
+ results.append(future.result())
880
+ pbar.update(1)
881
+ logger.info(f"Finished feature selection for target {target_number}")
882
+
883
+ stop = time.time()
884
+
885
+ # Once all tasks are completed, start by inserting results to db
886
+ feat_scores = pd.concat(
887
+ results,
888
+ axis=0,
889
+ )
890
+
891
+ logger.info("Inserting feature selection results to db...")
892
+ rows = []
893
+
894
+ with get_db() as db:
895
+ feature_map = {f.name: f.id for f in Feature.get_all(db=db, limit=20000)}
896
+ for row in feat_scores.itertuples(index=False):
897
+ feature_id = feature_map.get(row.features)
898
+ if not feature_id:
899
+ continue # or raise if feature must exist
900
+
901
+ rows.append(
902
+ {
903
+ "feature_selection_id": feature_selection.id,
904
+ "feature_id": feature_id,
905
+ "method": row.method,
906
+ "score": row.score,
907
+ "pvalue": None if pd.isna(row.pvalue) else row.pvalue,
908
+ "support": row.support,
909
+ "rank": row.rank,
910
+ "training_time": row.training_time,
911
+ }
912
+ )
913
+
914
+ if len(rows) == 0:
915
+ raise ValueError(f"No features selected for TARGET_{target_number}")
916
+
917
+ FeatureSelectionRank.bulk_upsert(rows=rows, db=db)
918
+
919
+ # Merge the results
920
+ features_selected = feat_scores[feat_scores["support"] == True][
921
+ ["features", "rank"]
922
+ ]
923
+ features_selected.sort_values("rank", inplace=True)
924
+ features_selected.drop_duplicates("features", inplace=True)
925
+
926
+ features_selected_list = features_selected["features"].values.tolist()
927
+
928
+ logger.info("Merging feature selection methods...")
929
+ # features_selected = list(dict.fromkeys(features_selected_by_mi + features_selected_by_nonlinear_correlation + features_selected_by_linear_correlation))
930
+ features_selected_by_every_methods = set(results[0]["features"].values.tolist())
931
+
932
+ for df in results[1:]:
933
+ features_selected_by_every_methods &= set(
934
+ df["features"].values.tolist()
935
+ ) # intersection
936
+
937
+ features_selected_by_every_methods = list(features_selected_by_every_methods)
938
+
939
+ logger.debug(
940
+ f"We selected {len(features_selected_list)} features and {len(features_selected_by_every_methods)} were selected unanimously:"
941
+ )
942
+ logger.debug(features_selected_by_every_methods)
943
+
944
+ pd.Series(features_selected_list).to_csv(
945
+ f"{fs_dir_target}/features_before_corr.csv",
946
+ index=True,
947
+ header=True,
948
+ index_label="ID",
949
+ )
950
+ features, features_correlated = remove_correlated_features(
951
+ X, features_selected_list, corr_threshold
952
+ )
953
+ pd.Series(features).to_csv(
954
+ f"{fs_dir_target}/features_before_max.csv",
955
+ index=True,
956
+ header=True,
957
+ index_label="ID",
958
+ )
959
+ features = features[:max_features]
960
+
961
+ features += categorical_features_selected if target_type == "classification" else []
962
+ logger.debug(
963
+ f"Final pre-selection: {len(features)} features below {corr_threshold}% out of {len(features_selected_list)} features, and rejected {len(features_correlated)} features, {100*len(features)/len(features_selected_list):.2f}% features selected"
964
+ )
965
+
966
+ features_selected_by_every_methods_uncorrelated = list(
967
+ set(features) & set(features_selected_by_every_methods)
968
+ )
969
+ logger.debug(
970
+ f"In this pre-selection, there is {len(features_selected_by_every_methods_uncorrelated)} features from the {len(features_selected_by_every_methods)} selected unanimously\n"
971
+ )
972
+
973
+ logger.debug(
974
+ features_selected[features_selected["features"].isin(features)].to_markdown()
975
+ )
976
+
977
+ best_features_path = Path(
978
+ f"{preprocessing_dir}/features_{target_number}.pkl"
979
+ ).resolve()
980
+ if PYTHON_ENV != "Test":
981
+ joblib.dump(features, best_features_path)
982
+
983
+ db_features = Feature.filter(name__in=features)
984
+ # Order matters, to keep the same order in db as in features, we need: map features by name
985
+ feature_by_name = {f.name: f for f in db_features}
986
+ # Reorder them according to original `features` list
987
+ ordered_db_features = [
988
+ feature_by_name[name] for name in features if name in feature_by_name
989
+ ]
990
+
991
+ feature_selection = FeatureSelection.get(feature_selection.id)
992
+ feature_selection = feature_selection.add_features(ordered_db_features)
993
+ feature_selection.training_time = stop - start
994
+ feature_selection.best_features_path = best_features_path
995
+ feature_selection.save()
996
+
997
+ return features
998
+
999
+
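A hypothetical end-to-end invocation of the two main entry points above. It assumes the database from lecrapaud.db has been initialized, that the input frame carries DATE, STOCK and TARGET_n columns, and the parquet file name is illustrative:

# Hypothetical usage sketch; not part of the package.
import pandas as pd
from lecrapaud.feature_selection import create_sets_from_data, feature_selection

df = pd.read_parquet("my_panel_data.parquet")  # hypothetical input

train, val, test, dataset = create_sets_from_data(
    df, corr_threshold=80, percentile=20, max_features=20
)
features = feature_selection(
    dataset_id=dataset.id,
    train=train,
    target_number=1,      # a regression target (not listed in TARGETS_CLF)
    single_process=True,  # avoids ProcessPoolExecutor, e.g. while debugging
)
print(features)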
1000
+ # TODO : can we use this to select the ideal number of features ?
1001
+ def feature_selection_analysis(feature_selection_id: int, n_components: int = 5):
1002
+
1003
+ feature_selection = FeatureSelection.get(feature_selection_id)
1004
+ dataset_dir = feature_selection.dataset.path
1005
+ features = [f.name for f in feature_selection.features]
1006
+ target = feature_selection.target.name
1007
+ target_number = target.split("_")[1]
1008
+
1009
+ train, val, train_scaled, val_scaled, _scaler_y = load_train_data(
1010
+ dataset_dir, target_number, target_type=feature_selection.target.type
1011
+ )
1012
+ train = train[features + [target]]
1013
+ train_scaled = train_scaled[features + [target]]
1014
+
1015
+ logger.info("Plot features correlation with target variable...")
1016
+
1017
+ correlations = train.corr()[target].sort_values(ascending=False)
1018
+
1019
+ plt.figure(figsize=(12, 6))
1020
+ sns.barplot(x=correlations.index, y=correlations.values, palette="coolwarm")
1021
+ plt.xticks(rotation=90)
1022
+ plt.title("Feature correlation with target variable")
1023
+ plt.ylabel("Correlation")
1024
+ plt.xlabel("Features")
1025
+ plt.grid(axis="y", linestyle="--", alpha=0.7)
1026
+ plt.show()
1027
+
1028
+ plt.figure(figsize=(14, 10))
1029
+ sns.heatmap(train.corr(), annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5)
1030
+ plt.title("Correlation Matrix")
1031
+ plt.show()
1032
+
1033
+ logger.info("Plot explained variance by components...")
1034
+ n_components = min(len(features), n_components)
1035
+ pca = PCA(n_components=n_components)
1036
+ X_pca = pca.fit_transform(train_scaled)
1037
+
1038
+ explained_variance = pca.explained_variance_ratio_
1039
+
1040
+ plt.figure(figsize=(10, 7))
1041
+ plt.bar(
1042
+ range(1, len(explained_variance) + 1),
1043
+ explained_variance,
1044
+ label="Explained Variance",
1045
+ )
1046
+ plt.plot(
1047
+ range(1, len(explained_variance) + 1),
1048
+ np.cumsum(explained_variance),
1049
+ label="Cumulative Explained Variance",
1050
+ color="orange",
1051
+ marker="o",
1052
+ )
1053
+ plt.title("Explained Variance by Components")
1054
+ plt.xlabel("Number of Components")
1055
+ plt.ylabel("Explained Variance")
1056
+ plt.legend()
1057
+ plt.grid(axis="y", linestyle="--", alpha=0.7)
1058
+ plt.show()
1059
+
1060
+ logger.info("Main PCA vs target variable...")
1061
+ plt.scatter(
1062
+ X_pca[:, 0],
1063
+ X_pca[:, 1],
1064
+ c=train[target],
1065
+ cmap="coolwarm",
1066
+ alpha=0.7,
1067
+ )
1068
+ plt.title("PCA of target variable")
1069
+ plt.xlabel("First Principal Component")
1070
+ plt.ylabel("Second Principal Component")
1071
+ plt.colorbar()
1072
+ plt.show()
1073
+
1074
+
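A minimal sketch of the explained-variance check performed above, printing the ratios instead of plotting them (synthetic, standardized data):

# Sketch of the PCA explained-variance diagnostic.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 10))
X[:, 1] = X[:, 0] * 2 + rng.normal(scale=0.1, size=200)  # one nearly redundant feature

pca = PCA(n_components=5).fit(StandardScaler().fit_transform(X))
print(pca.explained_variance_ratio_)
print(np.cumsum(pca.explained_variance_ratio_))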
1075
+ # scaling
1076
+ def scale_data(
1077
+ df: pd.DataFrame, save_dir: str, scaler_x=None, scalers_y: Optional[list] = None
1078
+ ):
1079
+ logger.info("Scale data...")
1080
+ X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
1081
+
1082
+ if scaler_x:
1083
+ X_scaled = pd.DataFrame(
1084
+ scaler_x.transform(X), columns=list(X.columns), index=X.index
1085
+ )
1086
+ else:
1087
+ scaler_x = StandardScaler() # MinMaxScaler(feature_range=(-1,1))
1088
+ X_scaled = pd.DataFrame(
1089
+ scaler_x.fit_transform(X), columns=list(X.columns), index=X.index
1090
+ )
1091
+ if PYTHON_ENV != "Test":
1092
+ joblib.dump(scaler_x, f"{save_dir}/scaler_x.pkl")
1093
+
1094
+ # Determine which targets need to be scaled
1095
+ targets_numbers_to_scale = [i for i in TARGETS_NUMBER if i not in TARGETS_CLF]
1096
+
1097
+ # Dictionary to store scaled target data
1098
+ scaled_targets = {}
1099
+
1100
+ if scalers_y:
1101
+ for target_number in targets_numbers_to_scale:
1102
+ y = df[[f"TARGET_{target_number}"]]
1103
+ scaled_targets[target_number] = pd.DataFrame(
1104
+ scalers_y[f"scaler_y_{target_number}"].transform(y.values),
1105
+ columns=y.columns,
1106
+ index=y.index,
1107
+ )
1108
+ else:
1109
+ scalers_y = {}
1110
+ for target_number in targets_numbers_to_scale:
1111
+ scaler_y = StandardScaler()
1112
+ y = df[[f"TARGET_{target_number}"]]
1113
+
1114
+ scaled_y = pd.DataFrame(
1115
+ scaler_y.fit_transform(y.values),
1116
+ columns=y.columns,
1117
+ index=y.index,
1118
+ )
1119
+ if PYTHON_ENV != "Test":
1120
+ joblib.dump(scaler_y, f"{save_dir}/scaler_y_{target_number}.pkl")
1121
+
1122
+ scalers_y[f"scaler_y_{target_number}"] = scaler_y
1123
+ scaled_targets[target_number] = scaled_y
1124
+
1125
+ # Reconstruct y_scaled in the original order
1126
+ y_scaled = pd.concat(
1127
+ [scaled_targets[target_number] for target_number in targets_numbers_to_scale],
1128
+ axis=1,
1129
+ )
1130
+ y_not_scaled = df[df.columns.intersection([f"TARGET_{i}" for i in TARGETS_CLF])]
1131
+
1132
+ # Ensure the final DataFrame keeps the original order
1133
+ df_scaled = pd.concat(
1134
+ [X_scaled, y_scaled, y_not_scaled],
1135
+ axis=1,
1136
+ )[
1137
+ df.columns
1138
+ ] # Reorder columns to match original `df`
1139
+
1140
+ if not df_scaled.columns.equals(df.columns):
1141
+ raise Exception("Columns are not in the same order after scaling.")
1142
+
1143
+ return df_scaled, scaler_x, scalers_y
1144
+
1145
+
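A minimal sketch of the scaling contract implied by scale_data: fit scalers when none are passed, reuse them otherwise, and keep one scaler per regression target so predictions can later be inverse-transformed (toy data, illustrative column names):

# Sketch of the fit-on-train / reuse-elsewhere scaling pattern.
import pandas as pd
from sklearn.preprocessing import StandardScaler

train = pd.DataFrame({"F_1": [1.0, 2.0, 3.0], "TARGET_1": [10.0, 20.0, 30.0]})
val = pd.DataFrame({"F_1": [4.0], "TARGET_1": [40.0]})

scaler_x = StandardScaler().fit(train[["F_1"]])
scaler_y = StandardScaler().fit(train[["TARGET_1"]])

val_scaled = val.copy()
val_scaled[["F_1"]] = scaler_x.transform(val[["F_1"]])
val_scaled[["TARGET_1"]] = scaler_y.transform(val[["TARGET_1"]])

# later, bring model predictions back to the original target scale
preds_original_scale = scaler_y.inverse_transform(val_scaled[["TARGET_1"]])
print(val_scaled, preds_original_scale)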
1146
+ # Reshape into 3D tensors for recurrent models
1147
+ def reshape_time_series(
1148
+ train: pd.DataFrame,
1149
+ val: pd.DataFrame,
1150
+ test: pd.DataFrame,
1151
+ features: list,
1152
+ timesteps: int = 120,
1153
+ ):
1154
+ # always scale for recurrent layers: the train set passed here should already be scaled
1155
+
1156
+ target_columns = train.columns.intersection([f"TARGET_{i}" for i in TARGETS_NUMBER])
1157
+
1158
+ data = pd.concat([train, val, test], axis=0)
1159
+
1160
+ data_reshaped = reshape_df(data[features], data[GROUPING_COLUMN], timesteps)
1161
+
1162
+ data_reshaped[target_columns] = data[target_columns]
1163
+
1164
+ logger.info("Separating train, val, test data and creating np arrays...")
1165
+ train_reshaped = data_reshaped.loc[train.index]
1166
+ val_reshaped = data_reshaped.loc[val.index]
1167
+ test_reshaped = data_reshaped.loc[test.index]
1168
+
1169
+ x_train_reshaped = np.array(train_reshaped["RECURRENT_FEATURES"].values.tolist())
1170
+ y_train_reshaped = np.array(train_reshaped[target_columns].reset_index())
1171
+ x_val_reshaped = np.array(val_reshaped["RECURRENT_FEATURES"].values.tolist())
1172
+ y_val_reshaped = np.array(val_reshaped[target_columns].reset_index())
1173
+ x_test_reshaped = np.array(test_reshaped["RECURRENT_FEATURES"].values.tolist())
1174
+ y_test_reshaped = np.array(test_reshaped[target_columns].reset_index())
1175
+
1176
+ reshaped_data = {
1177
+ "x_train_reshaped": x_train_reshaped,
1178
+ "y_train_reshaped": y_train_reshaped,
1179
+ "x_val_reshaped": x_val_reshaped,
1180
+ "y_val_reshaped": y_val_reshaped,
1181
+ "x_test_reshaped": x_test_reshaped,
1182
+ "y_test_reshaped": y_test_reshaped,
1183
+ }
1184
+
1185
+ return reshaped_data
1186
+
1187
+
1188
+ def reshape_df(df: pd.DataFrame, stock_column: pd.DataFrame, timesteps: int):
1189
+ fill_value = [[[0] * len(df.columns)]]
1190
+
1191
+ def shiftsum(x, timesteps: int):
1192
+ tmp = x.copy()
1193
+ for i in range(1, timesteps):
1194
+ tmp = x.shift(i, fill_value=fill_value) + tmp
1195
+ return tmp
1196
+
1197
+ logger.info("Grouping each feature in a unique column with list...")
1198
+ df_reshaped = df.apply(list, axis=1).apply(lambda x: [list(x)])
1199
+ df_reshaped = pd.concat([df_reshaped, stock_column], axis=1)
1200
+
1201
+ logger.info("Grouping method stock and creating timesteps...")
1202
+ df_reshaped = (
1203
+ df_reshaped.groupby(GROUPING_COLUMN)[0]
1204
+ .apply(lambda x: shiftsum(x, timesteps))
1205
+ .reset_index(GROUPING_COLUMN, drop=True)
1206
+ .rename("RECURRENT_FEATURES")
1207
+ )
1208
+ df_reshaped = pd.DataFrame(df_reshaped)
1209
+
1210
+ return df_reshaped
1211
+
1212
+
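reshape_df builds, for each group, a window containing the current row plus the previous timesteps-1 rows, zero-padded at the start of each group. The sketch below is an illustrative numpy equivalent of that idea, not the package's exact code path; names and toy data are assumptions:

# Illustrative per-group sliding windows of shape (samples, timesteps, features).
import numpy as np
import pandas as pd

def make_windows(group: pd.DataFrame, feature_cols: list, timesteps: int) -> np.ndarray:
    values = group[feature_cols].to_numpy()
    # left-pad with zeros so every row gets a full window, as fill_value does above
    padded = np.vstack([np.zeros((timesteps - 1, values.shape[1])), values])
    return np.stack([padded[i : i + timesteps] for i in range(len(values))])

df = pd.DataFrame({
    "STOCK": ["A"] * 5 + ["B"] * 5,
    "F1": range(10),
    "F2": range(10, 20),
})
windows = np.concatenate(
    [make_windows(g, ["F1", "F2"], timesteps=3) for _, g in df.groupby("STOCK")]
)
print(windows.shape)  # (10, 3, 2)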
1213
+ def load_train_data(dataset_dir, target_number, target_type="regression"):
1214
+ train_data_dir = f"{dataset_dir}/data"
1215
+ preprocessing_dir = f"{dataset_dir}/preprocessing"
1216
+
1217
+ _scaler_y = (
1218
+ joblib.load(f"{preprocessing_dir}/scaler_y_{target_number}.pkl")
1219
+ if target_type == "regression"
1220
+ else None
1221
+ )
1222
+
1223
+ logger.info("Loading data...")
1224
+ train = joblib.load(f"{train_data_dir}/train.pkl")
1225
+ val = joblib.load(f"{train_data_dir}/val.pkl")
1226
+ train_scaled = joblib.load(f"{train_data_dir}/train_scaled.pkl")
1227
+ val_scaled = joblib.load(f"{train_data_dir}/val_scaled.pkl")
1228
+
1229
+ return train, val, train_scaled, val_scaled, _scaler_y