lecrapaud-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lecrapaud might be problematic; see the package's registry page for more details.

Files changed (63)
  1. lecrapaud/__init__.py +1 -0
  2. lecrapaud/api.py +271 -0
  3. lecrapaud/config.py +25 -0
  4. lecrapaud/db/__init__.py +1 -0
  5. lecrapaud/db/alembic/README +1 -0
  6. lecrapaud/db/alembic/env.py +78 -0
  7. lecrapaud/db/alembic/script.py.mako +26 -0
  8. lecrapaud/db/alembic/versions/2025_04_06_1738-7390745388e4_initial_setup.py +295 -0
  9. lecrapaud/db/alembic/versions/2025_04_06_1755-40cd8d3e798e_unique_constraint_for_data.py +30 -0
  10. lecrapaud/db/alembic/versions/2025_05_23_1724-2360941fa0bd_longer_string.py +52 -0
  11. lecrapaud/db/alembic/versions/2025_05_27_1159-b96396dcfaff_add_env_to_trading_tables.py +34 -0
  12. lecrapaud/db/alembic/versions/2025_05_27_1337-40cbfc215f7c_fix_nb_character_on_portfolio.py +39 -0
  13. lecrapaud/db/alembic/versions/2025_05_27_1526-3de994115317_to_datetime.py +36 -0
  14. lecrapaud/db/alembic/versions/2025_05_27_2003-25c227c684f8_add_fees_to_transactions.py +30 -0
  15. lecrapaud/db/alembic/versions/2025_05_27_2047-6b6f2d38e9bc_double_instead_of_float.py +132 -0
  16. lecrapaud/db/alembic/versions/2025_05_31_1111-c175e4a36d68_generalise_stock_to_group.py +36 -0
  17. lecrapaud/db/alembic/versions/2025_05_31_1256-5681095bfc27_create_investment_run_and_portfolio_.py +62 -0
  18. lecrapaud/db/alembic/versions/2025_05_31_1806-339927587383_add_investment_run_id.py +107 -0
  19. lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +38 -0
  20. lecrapaud/db/alembic/versions/2025_05_31_1849-3b8550297e8e_change_date_to_datetime.py +44 -0
  21. lecrapaud/db/alembic/versions/2025_05_31_1852-e6b8c95d8243_add_date_to_portfolio_history.py +30 -0
  22. lecrapaud/db/alembic/versions/2025_06_10_1136-db8cdd83563a_addnewsandoptiontodata.py +32 -0
  23. lecrapaud/db/alembic/versions/2025_06_17_1652-c45f5e49fa2c_make_fields_nullable.py +89 -0
  24. lecrapaud/db/models/__init__.py +11 -0
  25. lecrapaud/db/models/base.py +181 -0
  26. lecrapaud/db/models/dataset.py +129 -0
  27. lecrapaud/db/models/feature.py +45 -0
  28. lecrapaud/db/models/feature_selection.py +125 -0
  29. lecrapaud/db/models/feature_selection_rank.py +79 -0
  30. lecrapaud/db/models/model.py +40 -0
  31. lecrapaud/db/models/model_selection.py +63 -0
  32. lecrapaud/db/models/model_training.py +62 -0
  33. lecrapaud/db/models/score.py +65 -0
  34. lecrapaud/db/models/target.py +67 -0
  35. lecrapaud/db/session.py +45 -0
  36. lecrapaud/directory_management.py +28 -0
  37. lecrapaud/experiment.py +64 -0
  38. lecrapaud/feature_engineering.py +846 -0
  39. lecrapaud/feature_selection.py +1167 -0
  40. lecrapaud/integrations/openai_integration.py +225 -0
  41. lecrapaud/jobs/__init__.py +13 -0
  42. lecrapaud/jobs/config.py +17 -0
  43. lecrapaud/jobs/scheduler.py +36 -0
  44. lecrapaud/jobs/tasks.py +57 -0
  45. lecrapaud/model_selection.py +1671 -0
  46. lecrapaud/predictions.py +292 -0
  47. lecrapaud/preprocessing.py +984 -0
  48. lecrapaud/search_space.py +848 -0
  49. lecrapaud/services/__init__.py +0 -0
  50. lecrapaud/services/embedding_categorical.py +71 -0
  51. lecrapaud/services/indicators.py +309 -0
  52. lecrapaud/speed_tests/experiments.py +139 -0
  53. lecrapaud/speed_tests/test-gpu-bilstm.ipynb +261 -0
  54. lecrapaud/speed_tests/test-gpu-resnet.ipynb +166 -0
  55. lecrapaud/speed_tests/test-gpu-transformers.ipynb +254 -0
  56. lecrapaud/speed_tests/tests.ipynb +145 -0
  57. lecrapaud/speed_tests/trash.py +37 -0
  58. lecrapaud/training.py +239 -0
  59. lecrapaud/utils.py +246 -0
  60. lecrapaud-0.1.0.dist-info/LICENSE +201 -0
  61. lecrapaud-0.1.0.dist-info/METADATA +105 -0
  62. lecrapaud-0.1.0.dist-info/RECORD +63 -0
  63. lecrapaud-0.1.0.dist-info/WHEEL +4 -0
lecrapaud/feature_selection.py
@@ -0,0 +1,1167 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from datetime import timedelta
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ import os
7
+ import time
8
+ from typing import Optional
9
+ from tqdm import tqdm
10
+ import warnings
11
+ from concurrent.futures import ProcessPoolExecutor, as_completed
12
+ import joblib
13
+ import re
14
+ from pathlib import Path
15
+
16
+ os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
17
+
18
+ # feature selection
19
+ from sklearn.feature_selection import (
20
+ f_classif,
21
+ f_regression,
22
+ mutual_info_classif,
23
+ mutual_info_regression,
24
+ chi2,
25
+ SelectPercentile,
26
+ SelectFpr,
27
+ RFE,
28
+ SelectFromModel,
29
+ )
30
+ from sklearn.decomposition import PCA
31
+ from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
32
+ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
33
+ from sklearn.model_selection import TimeSeriesSplit
34
+ from sklearn.metrics import root_mean_squared_error, log_loss, make_scorer
35
+ from mlxtend.feature_selection import SequentialFeatureSelector
36
+ from sklearn.preprocessing import StandardScaler, MinMaxScaler
37
+ from scipy.stats import spearmanr, kendalltau
38
+
39
+ # Internal
40
+ from lecrapaud.directory_management import tmp_dir, clean_directory
41
+ from lecrapaud.utils import logger
42
+ from lecrapaud.config import PYTHON_ENV
43
+ from lecrapaud.db import (
44
+ Dataset,
45
+ Target,
46
+ Feature,
47
+ FeatureSelection,
48
+ FeatureSelectionRank,
49
+ )
50
+ from lecrapaud.db.session import get_db
51
+ from lecrapaud.search_space import all_models
52
+
53
+ # Variables for targets handling
54
+ TARGETS_MCLF = [11]
55
+ GROUPING_COLUMN = "STOCK"
56
+
57
+ # Annoying Warnings
58
+ warnings.filterwarnings("ignore", category=FutureWarning)
59
+
60
+
61
+ def load_train_data(dataset_dir, target_number, target_type="regression"):
62
+ data_dir = f"{dataset_dir}/data"
63
+
64
+ logger.info("Loading data...")
65
+ train = joblib.load(f"{data_dir}/train.pkl")
66
+ val = joblib.load(f"{data_dir}/val.pkl")
67
+ test = joblib.load(f"{data_dir}/test.pkl")
68
+ try:
69
+ train_scaled = joblib.load(f"{data_dir}/train_scaled.pkl")
70
+ val_scaled = joblib.load(f"{data_dir}/val_scaled.pkl")
71
+ test_scaled = joblib.load(f"{data_dir}/test_scaled.pkl")
72
+ except FileNotFoundError:
73
+ train_scaled = None
74
+ val_scaled = None
75
+ test_scaled = None
76
+
77
+ return train, val, test, train_scaled, val_scaled, test_scaled
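+ # Usage sketch (the dataset path and target number are illustrative; the pickles are produced by the upstream preprocessing step):
+ # train, val, test, train_scaled, val_scaled, test_scaled = load_train_data("path/to/dataset", 11)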
78
+
79
+
80
+ class FeatureSelectionEngine:
81
+ def __init__(self, train, dataset, target_number, target_clf, **kwargs):
82
+ self.dataset = dataset
83
+ self.train = train
84
+ self.target_number = target_number
85
+ self.target_clf = target_clf
86
+
87
+ self.target_type = (
88
+ "classification" if self.target_number in self.target_clf else "regression"
89
+ )
90
+ self.percentile = self.dataset.percentile
91
+ self.corr_threshold = self.dataset.corr_threshold
92
+ self.max_features = self.dataset.max_features
93
+
94
+ self.dataset_dir = self.dataset.path
95
+ self.dataset_id = self.dataset.id
96
+ self.data_dir = f"{self.dataset_dir}/data"
97
+ self.preprocessing_dir = f"{self.dataset_dir}/preprocessing"
98
+ self.fs_dir_target = (
99
+ f"{self.dataset_dir}/{f"TARGET_{self.target_number}"}/feature_selection"
100
+ )
101
+ os.makedirs(self.fs_dir_target, exist_ok=True)
102
+
103
+ # Main feature selection function
104
+ def run(
105
+ self,
106
+ single_process: bool = False,
107
+ ):
108
+ """Function to do feature selection with a range of different feature selection technics
109
+
110
+ Args:
111
+ - train (pd.DataFrame): a pandas train set
112
+ - target_number (int): a target number; targets need to be named ``TARGET_{n}``
113
+ - single_process (bool): if True, run all feature selection methods in a single process. If False, run them in parallel.
114
+ """
115
+ target_number = self.target_number
116
+ target_type = self.target_type
117
+ if PYTHON_ENV != "Test":
118
+ fs_dir_target = self.fs_dir_target
119
+ else:
120
+ fs_dir_target = None
121
+
122
+ # Create the feature selection in db
123
+ target = Target.find_by(name=f"TARGET_{target_number}")
124
+ percentile = self.percentile
125
+ corr_threshold = self.corr_threshold
126
+ max_features = self.max_features
127
+
128
+ feature_selection = FeatureSelection.upsert(
129
+ match_fields=["target_id", "dataset_id"],
130
+ target_id=target.id,
131
+ dataset_id=self.dataset_id,
132
+ )
133
+
134
+ if feature_selection.best_features_path:
135
+ return joblib.load(feature_selection.best_features_path)
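+ # A previous run for this (target, dataset) pair already persisted its best features, so reuse them instead of recomputing.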
136
+
137
+ self.X = self.train.loc[:, ~self.train.columns.str.contains("^TARGET_")]
138
+ self.y = self.train[f"TARGET_{target_number}"]
139
+
140
+ logger.info(f"Starting feature selection for TARGET_{target_number}...")
141
+ clean_directory(self.fs_dir_target)
142
+
143
+ # Let's start by removing extremely correlated features
144
+ # This is needed to reduce the number of features, but also for methods such as ANOVA or chi2 that require independent features
145
+ # TODO: we could also remove low variance features
146
+ features_uncorrelated, features_correlated = self.remove_correlated_features(
147
+ 90, vizualize=False
148
+ )
149
+ self.X = self.X[features_uncorrelated]
150
+
151
+ logger.debug(
152
+ f"""
153
+ \nWe first removed {len(features_correlated)} features with correlation greater than 90%
154
+ \nWe are looking to capture {percentile}% of {len(self.X.columns)} features, i.e. {int(len(self.X.columns)*percentile/100)} features, with different feature selection methods
155
+ \nWe will then remove features correlated above {corr_threshold}%, keeping the ones with the best ranks
156
+ \nFinally, we will keep only the {max_features} best ranked features
157
+ """
158
+ )
159
+
160
+ start = time.time()
161
+
162
+ # handling categorical features (only if classification)
163
+ self.X_categorical, self.X_numerical = get_features_by_types(self.X)
164
+
165
+ if target_type == "classification":
166
+ feat_scores = self.select_categorical_features(
167
+ percentile=percentile, save_dir=fs_dir_target
168
+ )
169
+ with get_db() as db:
170
+ for row in feat_scores.itertuples(index=False):
171
+ feature = Feature.find_by(name=row.features, db=db)
172
+ FeatureSelectionRank.upsert(
173
+ ["feature_selection_id", "feature_id", "method"],
174
+ db=db,
175
+ score=row.score,
176
+ pvalue=row.pvalue,
177
+ support=row.support,
178
+ rank=row.rank,
179
+ method=row.method,
180
+ training_time=row.training_time,
181
+ feature_selection_id=feature_selection.id,
182
+ feature_id=feature.id,
183
+ )
184
+ categorical_features_selected = feat_scores[feat_scores["support"]][
185
+ "features"
186
+ ].values.tolist()
187
+
188
+ results = []
189
+ params = {"percentile": percentile, "save_dir": fs_dir_target}
190
+ if single_process:
191
+ results = [
192
+ self.select_feature_by_linear_correlation(**params),
193
+ self.select_feature_by_nonlinear_correlation(**params),
194
+ self.select_feature_by_mi(**params),
195
+ self.select_feature_by_feat_imp(**params),
196
+ self.select_feature_by_rfe(**params),
197
+ # self.select_feature_by_sfs(
198
+ # **params
199
+ # ), # TODO: this is taking too long
200
+ ]
201
+ else:
202
+ # Use ProcessPoolExecutor to run tasks in parallel
203
+ # TODO: not sure it's efficient from previous tests... especially because rfe and sfs methods are doing parallel processing already, this can create overhead
204
+ with ProcessPoolExecutor() as executor:
205
+ # Submit different functions to be executed in parallel
206
+ futures = [
207
+ executor.submit(
208
+ self.select_feature_by_linear_correlation,
209
+ **params,
210
+ ),
211
+ executor.submit(
212
+ self.select_feature_by_nonlinear_correlation,
213
+ **params,
214
+ ),
215
+ executor.submit(
216
+ self.select_feature_by_mi,
217
+ **params,
218
+ ),
219
+ executor.submit(
220
+ self.select_feature_by_feat_imp,
221
+ **params,
222
+ ),
223
+ executor.submit(
224
+ self.select_feature_by_rfe,
225
+ **params,
226
+ ),
227
+ # executor.submit(
228
+ # self.select_feature_by_sfs,
229
+ # **params,
230
+ # ), # TODO: this is taking too long
231
+ ]
232
+
233
+ # Wait for all futures to complete and gather the results
234
+ with tqdm(total=len(futures)) as pbar:
235
+ for future in as_completed(futures):
236
+ results.append(future.result())
237
+ pbar.update(1)
238
+
239
+ logger.info(f"Finished feature selection for target {target_number}")
240
+
241
+ stop = time.time()
242
+
243
+ # Once all tasks are completed, start by inserting results to db
244
+ feat_scores = pd.concat(
245
+ results,
246
+ axis=0,
247
+ )
248
+
249
+ logger.info("Inserting feature selection results to db...")
250
+ rows = []
251
+ with get_db() as db:
252
+ feature_map = {f.name: f.id for f in Feature.get_all(db=db, limit=20000)}
253
+ for row in feat_scores.itertuples(index=False):
254
+ feature_id = feature_map.get(row.features)
255
+ if not feature_id:
256
+ continue # or raise if feature must exist
257
+
258
+ rows.append(
259
+ {
260
+ "feature_selection_id": feature_selection.id,
261
+ "feature_id": feature_id,
262
+ "method": row.method,
263
+ "score": row.score,
264
+ "pvalue": None if pd.isna(row.pvalue) else row.pvalue,
265
+ "support": row.support,
266
+ "rank": row.rank,
267
+ "training_time": row.training_time,
268
+ }
269
+ )
270
+
271
+ if len(rows) == 0:
272
+ raise ValueError(f"No features selected for TARGET_{target_number}")
273
+
274
+ FeatureSelectionRank.bulk_upsert(rows=rows, db=db)
275
+
276
+ # Merge the results
277
+ logger.info("Merging feature selection methods...")
278
+ features_selected = feat_scores[feat_scores["support"]][["features", "rank"]]
279
+ features_selected.sort_values("rank", inplace=True)
280
+ features_selected.drop_duplicates("features", inplace=True)
281
+
282
+ features_selected_list = features_selected["features"].values.tolist()
283
+
284
+ # analysis 1
285
+ features_selected_by_every_methods = set(results[0]["features"].values.tolist())
286
+ for df in results[1:]:
287
+ features_selected_by_every_methods &= set(
288
+ df["features"].values.tolist()
289
+ ) # intersection
290
+ features_selected_by_every_methods = list(features_selected_by_every_methods)
291
+ logger.debug(
292
+ f"We selected {len(features_selected_list)} features and {len(features_selected_by_every_methods)} were selected unanimously:"
293
+ )
294
+ logger.debug(features_selected_by_every_methods)
295
+ if PYTHON_ENV != "Test":
296
+ pd.Series(features_selected_list).to_csv(
297
+ f"{fs_dir_target}/features_before_corr.csv",
298
+ index=True,
299
+ header=True,
300
+ index_label="ID",
301
+ )
302
+
303
+ # removing correlated features
304
+ self.X = self.X[features_selected_list]
305
+ features, features_correlated = self.remove_correlated_features(corr_threshold)
306
+ if PYTHON_ENV != "Test":
307
+ pd.Series(features).to_csv(
308
+ f"{fs_dir_target}/features_before_max.csv",
309
+ index=True,
310
+ header=True,
311
+ index_label="ID",
312
+ )
313
+ features = features[:max_features]
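+ # The surviving features are still in rank order (best first), so truncating keeps the max_features best-ranked ones.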
314
+
315
+ # adding categorical features selected
316
+ features += (
317
+ categorical_features_selected if target_type == "classification" else []
318
+ )
319
+ logger.debug(
320
+ f"Final pre-selection: {len(features)} features below {corr_threshold}% out of {len(features_selected_list)} features, and rejected {len(features_correlated)} features, {100*len(features)/len(features_selected_list):.2f}% features selected"
321
+ )
322
+
323
+ # analysis 2
324
+ features_selected_by_every_methods_uncorrelated = list(
325
+ set(features) & set(features_selected_by_every_methods)
326
+ )
327
+ logger.debug(
328
+ f"In this pre-selection, there is {len(features_selected_by_every_methods_uncorrelated)} features from the {len(features_selected_by_every_methods)} selected unanimously\n"
329
+ )
330
+ logger.debug(
331
+ features_selected[
332
+ features_selected["features"].isin(features)
333
+ ].to_markdown()
334
+ )
335
+
336
+ # save to path
337
+ best_features_path = Path(
338
+ f"{self.preprocessing_dir}/features_{target_number}.pkl"
339
+ ).resolve()
340
+ if PYTHON_ENV != "Test":
341
+ joblib.dump(features, best_features_path)
342
+
343
+ # save in db
344
+ db_features = Feature.filter(name__in=features)
345
+ # Order matters: to keep the same order in db as in features, we map features by name
346
+ feature_by_name = {f.name: f for f in db_features}
347
+ # Reorder them according to original `features` list
348
+ ordered_db_features = [
349
+ feature_by_name[name] for name in features if name in feature_by_name
350
+ ]
351
+
352
+ feature_selection = FeatureSelection.get(feature_selection.id)
353
+ feature_selection = feature_selection.add_features(ordered_db_features)
354
+ feature_selection.training_time = stop - start
355
+ feature_selection.best_features_path = best_features_path
356
+ feature_selection.save()
357
+
358
+ return features
359
+
360
+ # Remove correlation
361
+ # ------------------
362
+
363
+ def remove_correlated_features(self, corr_threshold: int, vizualize: bool = False):
364
+ X = self.X
365
+ features = X.columns
366
+ # Create correlation matrix, select upper triangle & remove features with correlation greater than threshold
367
+ corr_matrix = X[features].corr().abs()
368
+
369
+ upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
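+ # Keep only the upper triangle (k=1 excludes the diagonal) so each feature pair is checked once;
+ # a column is flagged as correlated if it exceeds the threshold against any earlier column.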
370
+ features_uncorrelated = [
371
+ column
372
+ for column in upper.columns
373
+ if all(upper[column].dropna() <= corr_threshold / 100)
374
+ ]
375
+ features_correlated = [
376
+ column
377
+ for column in upper.columns
378
+ if any(upper[column] > corr_threshold / 100)
379
+ ]
380
+
381
+ if vizualize:
382
+ features_selected_visualization = (
383
+ X[features]
384
+ .corr()
385
+ .where(np.triu(np.ones(len(features)), k=1).astype(bool))
386
+ .fillna(0)
387
+ )
388
+ # Plot the heatmap
389
+ plt.figure(figsize=(10, 8))
390
+ sns.heatmap(
391
+ corr_matrix,
392
+ annot=True,
393
+ cmap="coolwarm",
394
+ center=0,
395
+ linewidths=1,
396
+ linecolor="black",
397
+ )
398
+ plt.title(f"Correlation Matrix")
399
+ plt.show()
400
+
401
+ logger.info(f"\n{features_selected_visualization.describe().to_string()}")
402
+ logger.info(f"\n{features_selected_visualization.to_string()}")
403
+ return features_uncorrelated, features_correlated
404
+
405
+ # Filter methods
406
+ # ----------------
407
+
408
+ def select_categorical_features(self, percentile, save_dir: Optional[str] = None):
409
+ X, y = self.X_categorical, self.y
410
+
411
+ start = time.time()
412
+ logger.debug("Running Chi2 for categorical features...")
413
+ feat_selector = SelectPercentile(chi2, percentile=percentile).fit(X, y)
414
+ feat_scores = pd.DataFrame()
415
+ feat_scores["score"] = feat_selector.scores_
416
+ feat_scores["pvalue"] = feat_selector.pvalues_
417
+ feat_scores["support"] = feat_selector.get_support()
418
+ feat_scores["features"] = X.columns
419
+ feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
420
+ feat_scores["method"] = "Chi2"
421
+ feat_scores.sort_values("rank", ascending=True, inplace=True)
422
+ stop = time.time()
423
+ training_time = timedelta(seconds=(stop - start)).total_seconds()
424
+ feat_scores["training_time"] = training_time
425
+
426
+ logger.debug(
427
+ f"Chi2 evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
428
+ )
429
+
430
+ feat_scores.to_csv(
431
+ f"{save_dir}/Chi2.csv", index=True, header=True, index_label="ID"
432
+ )
433
+
434
+ return feat_scores
435
+
436
+ # Linear correlation (Pearson's R for regression and ANOVA for classification)
437
+ def select_feature_by_linear_correlation(
438
+ self, percentile: int = 20, save_dir: Optional[str] = None
439
+ ):
440
+ X, y, target_type = self.X_numerical, self.y, self.target_type
441
+
442
+ start = time.time()
443
+ test_type = "Person's R" if target_type == "regression" else "ANOVA"
444
+ logger.debug(f"Running {test_type}...")
445
+
446
+ model = f_regression if target_type == "regression" else f_classif
447
+ feat_selector = SelectPercentile(model, percentile=percentile).fit(X, y)
448
+ feat_scores = pd.DataFrame()
449
+ feat_scores["score"] = feat_selector.scores_
450
+ feat_scores["pvalue"] = feat_selector.pvalues_
451
+ feat_scores["support"] = feat_selector.get_support()
452
+ feat_scores["features"] = X.columns
453
+ feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
454
+ feat_scores["method"] = test_type
455
+ feat_scores.sort_values("rank", ascending=True, inplace=True)
456
+ stop = time.time()
457
+ training_time = timedelta(seconds=(stop - start)).total_seconds()
458
+ feat_scores["training_time"] = training_time
459
+
460
+ logger.debug(
461
+ f"{test_type} evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
462
+ )
463
+
464
+ feat_scores.to_csv(
465
+ f"{save_dir}/{test_type}.csv",
466
+ index=True,
467
+ header=True,
468
+ index_label="ID",
469
+ )
470
+
471
+ return feat_scores
472
+
473
+ # Non-linear correlation (Spearman's R for regression and Kendall's Tau for classification)
474
+ def select_feature_by_nonlinear_correlation(
475
+ self, percentile: int = 20, save_dir: Optional[str] = None
476
+ ):
477
+ X, y, target_type = self.X_numerical, self.y, self.target_type
478
+
479
+ start = time.time()
480
+
481
+ def model(X_model, y_model):
482
+ X_model = pd.DataFrame(X_model)
483
+ y_model = pd.Series(y_model)
484
+
485
+ method = "spearman" if target_type == "regression" else "kendall"
486
+
487
+ corr_scores = []
488
+ p_values = []
489
+
490
+ for col in X_model.columns:
491
+ if method == "spearman":
492
+ corr, pval = spearmanr(X_model[col], y_model)
493
+ else: # Kendall's Tau for classification
494
+ corr, pval = kendalltau(X_model[col], y_model)
495
+
496
+ corr_scores.append(abs(corr)) # Keeping absolute correlation
497
+ p_values.append(pval)
498
+
499
+ return np.array(corr_scores), np.array(p_values)
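+ # SelectPercentile accepts any score_func that returns (scores, pvalues) arrays, so this custom Spearman/Kendall scorer plugs in like f_regression.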
500
+
501
+ test_type = "Spearman's R" if target_type == "regression" else "Kendall's Tau"
502
+ logger.debug(f"Running {test_type}...")
503
+
504
+ feat_selector = SelectPercentile(model, percentile=percentile).fit(X, y)
505
+ feat_scores = pd.DataFrame()
506
+ feat_scores["score"] = feat_selector.scores_
507
+ feat_scores["pvalue"] = feat_selector.pvalues_
508
+ feat_scores["support"] = feat_selector.get_support()
509
+ feat_scores["features"] = X.columns
510
+ feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
511
+ feat_scores["method"] = test_type
512
+ feat_scores.sort_values("rank", ascending=True, inplace=True)
513
+ stop = time.time()
514
+ training_time = timedelta(seconds=(stop - start)).total_seconds()
515
+ feat_scores["training_time"] = training_time
516
+
517
+ logger.debug(
518
+ f"{test_type} evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
519
+ )
520
+
521
+ feat_scores.to_csv(
522
+ f"{save_dir}/{test_type}.csv",
523
+ index=True,
524
+ header=True,
525
+ index_label="ID",
526
+ )
527
+
528
+ return feat_scores
529
+
530
+ # Mutual Information
531
+ def select_feature_by_mi(
532
+ self, percentile: int = 20, save_dir: Optional[str] = None
533
+ ):
534
+ X, y, target_type = self.X_numerical, self.y, self.target_type
535
+
536
+ start = time.time()
537
+ logger.debug("Running Mutual Information...")
538
+ model = (
539
+ mutual_info_regression
540
+ if target_type == "regression"
541
+ else mutual_info_classif
542
+ )
543
+ feat_selector = SelectPercentile(model, percentile=percentile).fit(X, y)
544
+ feat_scores = pd.DataFrame()
545
+ feat_scores["score"] = feat_selector.scores_
546
+ feat_scores["support"] = feat_selector.get_support()
547
+ feat_scores["features"] = X.columns
548
+ feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
549
+ feat_scores["method"] = "Mutual Information"
550
+ feat_scores.sort_values("rank", ascending=True, inplace=True)
551
+ stop = time.time()
552
+ training_time = timedelta(seconds=(stop - start)).total_seconds()
553
+ feat_scores["training_time"] = training_time
554
+
555
+ logger.debug(
556
+ f"MI evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
557
+ )
558
+
559
+ feat_scores.to_csv(
560
+ f"{save_dir}/MI.csv", index=True, header=True, index_label="ID"
561
+ )
562
+
563
+ return feat_scores
564
+
565
+ # Intrinsic/embedded methods
566
+ # ----------------
567
+
568
+ # feature importance
569
+ def select_feature_by_feat_imp(
570
+ self, percentile: int = 20, save_dir: Optional[str] = None
571
+ ):
572
+ X, y, target_type = self.X_numerical, self.y, self.target_type
573
+
574
+ start = time.time()
575
+ logger.debug("Running Feature importance...")
576
+
577
+ params = {
578
+ "n_estimators": 500,
579
+ "max_depth": 2**3,
580
+ "random_state": 42,
581
+ "n_jobs": -1,
582
+ }
583
+
584
+ estimator = (
585
+ RandomForestClassifier(**params)
586
+ if target_type == "classification"
587
+ else RandomForestRegressor(**params)
588
+ )
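+ # With threshold=-np.inf, SelectFromModel ranks purely by impurity-based feature importances and keeps exactly max_features of them.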
589
+
590
+ feat_selector = SelectFromModel(
591
+ estimator=estimator,
592
+ threshold=-np.inf,
593
+ max_features=int(percentile * X.shape[1] / 100),
594
+ ).fit(X, y)
595
+
596
+ feat_scores = pd.DataFrame()
597
+ feat_scores["score"] = feat_selector.estimator_.feature_importances_
598
+ feat_scores["support"] = feat_selector.get_support()
599
+ feat_scores["features"] = X.columns
600
+ feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
601
+ feat_scores["method"] = "FI"
602
+ feat_scores.sort_values("rank", ascending=True, inplace=True)
603
+
604
+ stop = time.time()
605
+ training_time = timedelta(seconds=(stop - start)).total_seconds()
606
+ feat_scores["training_time"] = training_time
607
+
608
+ logger.debug(
609
+ f"Feat importance evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
610
+ )
611
+
612
+ feat_scores.to_csv(
613
+ f"{save_dir}/FI.csv", index=True, header=True, index_label="ID"
614
+ )
615
+
616
+ return feat_scores
617
+
618
+ # Wrapper method
619
+ # ----------------
620
+
621
+ # recursive feature elimination
622
+ def select_feature_by_rfe(
623
+ self, percentile: int = 20, save_dir: Optional[str] = None
624
+ ):
625
+ X, y, target_type = self.X_numerical, self.y, self.target_type
626
+
627
+ start = time.time()
628
+ logger.debug("Running Recursive Feature Elimination...")
629
+
630
+ params = {
631
+ "max_depth": 2**3,
632
+ "random_state": 42,
633
+ }
634
+ estimator = (
635
+ DecisionTreeClassifier(**params)
636
+ if target_type == "classification"
637
+ else DecisionTreeRegressor(**params)
638
+ )
639
+ rfe = RFE(estimator, n_features_to_select=percentile / 100, step=4, verbose=0)
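+ # n_features_to_select given as a float is treated by sklearn as a fraction of the features; step=4 drops four features per elimination round.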
640
+ feat_selector = rfe.fit(X, y)
641
+
642
+ feat_scores = pd.DataFrame(
643
+ {
644
+ "score": 0.0, # Default feature importance
645
+ "support": feat_selector.get_support(),
646
+ "features": X.columns,
647
+ "rank": 0,
648
+ "method": "RFE",
649
+ }
650
+ )
651
+ feat_scores.loc[
652
+ feat_scores["features"].isin(feat_selector.get_feature_names_out()), "score"
653
+ ] = list(feat_selector.estimator_.feature_importances_)
654
+ feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
655
+ feat_scores.sort_values("rank", ascending=True, inplace=True)
656
+
657
+ stop = time.time()
658
+ training_time = timedelta(seconds=(stop - start)).total_seconds()
659
+ feat_scores["training_time"] = training_time
660
+
661
+ logger.debug(
662
+ f"RFE evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
663
+ )
664
+
665
+ feat_scores.to_csv(
666
+ f"{save_dir}/RFE.csv", index=True, header=True, index_label="ID"
667
+ )
668
+
669
+ return feat_scores
670
+
671
+ # SequentialFeatureSelector (loss-based; supports forward or backward selection with floating removal)
672
+ def select_feature_by_sfs(
673
+ self, percentile: int = 20, save_dir: Optional[str] = None
674
+ ):
675
+ X, y, target_type = self.X_numerical, self.y, self.target_type
676
+
677
+ start = time.time()
678
+ logger.debug("Running Sequential Feature Selection...")
679
+ warnings.filterwarnings("ignore", category=FutureWarning)
680
+
681
+ params = {
682
+ "max_depth": 2**3,
683
+ "random_state": 42,
684
+ }
685
+ estimator = (
686
+ DecisionTreeClassifier(**params)
687
+ if target_type == "classification"
688
+ else DecisionTreeRegressor(**params)
689
+ )
690
+
691
+ n_splits = 3
692
+ n_samples = len(X)
693
+ test_size = int(n_samples / (n_splits + 4))
694
+ tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
695
+
696
+ score_function = (
697
+ make_scorer(
698
+ log_loss, response_method="predict_proba"
699
+ ) # logloss needs probabilities
700
+ if target_type == "classification"
701
+ else make_scorer(root_mean_squared_error)
702
+ ) # we avoid greater_is_better = False because it makes the score negative and messes up ranking
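+ # Caution: with greater_is_better left at its default, the scorer rewards higher loss, so SFS tends to favour worse subsets;
+ # this may be why the selected support does not always match the best-scoring subsets (see the TODO below).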
703
+
704
+ sfs = SequentialFeatureSelector(
705
+ estimator,
706
+ k_features=int(percentile * X.shape[1] / 100),
707
+ forward=True,
708
+ floating=True, # Enables dynamic feature elimination
709
+ scoring=score_function,
710
+ cv=tscv,
711
+ n_jobs=-1,
712
+ verbose=0,
713
+ )
714
+
715
+ feat_selector = sfs.fit(X, y)
716
+
717
+ # Extract selected features and their scores
718
+ selected_features = set(feat_selector.k_feature_names_)
719
+ feat_subsets = feat_selector.subsets_
720
+
721
+ # Create DataFrame for feature scores
722
+ feat_scores = pd.DataFrame(
723
+ {
724
+ "features": X.columns,
725
+ "support": X.columns.isin(
726
+ selected_features
727
+ ), # TODO: understand why the support is not correct (the best scores are not always chosen)
728
+ "score": 1000,
729
+ "rank": None,
730
+ "method": "SFS",
731
+ }
732
+ )
733
+
734
+ # Sort subsets by score (lower is better)
735
+ sorted_subsets = sorted(
736
+ feat_subsets.items(), key=lambda item: item[1]["avg_score"]
737
+ )
738
+
739
+ # Record score per feature (first appearance)
740
+ feature_score_map = {}
741
+ for step in sorted_subsets:
742
+ step = step[1]
743
+ for feature in step["feature_names"]:
744
+ if feature not in feature_score_map:
745
+ feature_score_map[feature] = step["avg_score"]
746
+
747
+ # Assign scores
748
+ for feature, score in feature_score_map.items():
749
+ feat_scores.loc[feat_scores["features"] == feature, "score"] = score
750
+
751
+ # rank by score (lower = better)
752
+ feat_scores["rank"] = (
753
+ feat_scores["score"].rank(method="first", ascending=True).astype(int)
754
+ )
755
+
756
+ feat_scores.sort_values("rank", ascending=True, inplace=True)
757
+
758
+ stop = time.time()
759
+ training_time = timedelta(seconds=(stop - start)).total_seconds()
760
+ feat_scores["training_time"] = training_time
761
+
762
+ logger.debug(
763
+ f"SFS evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
764
+ )
765
+
766
+ feat_scores.to_csv(
767
+ f"{save_dir}/SFS.csv", index=True, header=True, index_label="ID"
768
+ )
769
+
770
+ return feat_scores
771
+
772
+
773
+ class PreprocessModel:
774
+
775
+ def __init__(
776
+ self,
777
+ train,
778
+ val,
779
+ test,
780
+ dataset,
781
+ target_numbers,
782
+ target_clf,
783
+ models_idx,
784
+ time_series,
785
+ max_timesteps,
786
+ group_column,
787
+ date_column,
788
+ **kwargs,
789
+ ):
790
+ self.dataset = dataset
791
+ self.target_numbers = target_numbers
792
+ self.target_clf = target_clf
793
+ self.models_idx = models_idx
794
+ self.time_series = time_series
795
+ self.max_timesteps = max_timesteps
796
+ self.group_column = group_column
797
+ self.date_column = date_column
798
+
799
+ self.dataset_dir = dataset.path
800
+ self.data_dir = f"{self.dataset_dir}/data"
801
+
802
+ self.all_features = dataset.get_all_features(
803
+ date_column=date_column, group_column=group_column
804
+ )
805
+ columns_to_keep = self.all_features + [
806
+ f"TARGET_{i}" for i in self.target_numbers
807
+ ]
808
+ duplicates = [
809
+ col for col in set(columns_to_keep) if columns_to_keep.count(col) > 1
810
+ ]
811
+ if duplicates:
812
+ raise ValueError(f"Doublons détectés dans columns_to_keep: {duplicates}")
813
+
814
+ logger.info(self.all_features)
815
+
816
+ self.train = train[columns_to_keep]
817
+ if val is not None:
818
+ self.val = val[columns_to_keep]
819
+ if test is not None:
820
+ self.test = test[columns_to_keep]
821
+
822
+ def run(self):
823
+ # save data
824
+ if PYTHON_ENV != "Test":
825
+ joblib.dump(self.train, f"{self.data_dir}/train.pkl")
826
+ joblib.dump(self.val, f"{self.data_dir}/val.pkl")
827
+ joblib.dump(self.test, f"{self.data_dir}/test.pkl")
828
+ preprocessing_dir = f"{self.dataset_dir}/preprocessing"
829
+ else:
830
+ preprocessing_dir = None
831
+
832
+ # scaling features
833
+ if any(t not in self.target_clf for t in self.target_numbers) and any(
834
+ all_models[i].get("need_scaling") for i in self.models_idx
835
+ ):
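+ # Scale only when at least one regression target is present and at least one selected model declares need_scaling.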
836
+ logger.info("Scaling features...")
837
+ train_scaled, scaler_x, scalers_y = self.scale_data(
838
+ self.train, save_dir=preprocessing_dir
839
+ )
840
+ val_scaled, _, _ = self.scale_data(
841
+ self.val,
842
+ save_dir=preprocessing_dir,
843
+ scaler_x=scaler_x,
844
+ scalers_y=scalers_y,
845
+ )
846
+ test_scaled, _, _ = self.scale_data(
847
+ self.test,
848
+ save_dir=preprocessing_dir,
849
+ scaler_x=scaler_x,
850
+ scalers_y=scalers_y,
851
+ )
852
+ else:
853
+ train_scaled = None
854
+ val_scaled = None
855
+ test_scaled = None
+ scalers_y = None
856
+
857
+ # save data
858
+ if PYTHON_ENV != "Test":
859
+ joblib.dump(train_scaled, f"{self.data_dir}/train_scaled.pkl")
860
+ joblib.dump(val_scaled, f"{self.data_dir}/val_scaled.pkl")
861
+ joblib.dump(test_scaled, f"{self.data_dir}/test_scaled.pkl")
862
+
863
+ data = {
864
+ "train": self.train,
865
+ "val": self.val,
866
+ "test": self.test,
867
+ "train_scaled": train_scaled,
868
+ "val_scaled": val_scaled,
869
+ "test_scaled": test_scaled,
870
+ "scalers_y": scalers_y,
871
+ }
872
+
873
+ # reshape data for time series
874
+ reshaped_data = None
875
+ if (
876
+ any(all_models[i].get("recurrent") for i in self.models_idx)
877
+ and self.time_series
878
+ ):
879
+ # reshaping data for recurrent models
880
+ logger.info("Reshaping data for recurrent models...")
881
+ reshaped_data = self.reshape_time_series(
882
+ train_scaled,
883
+ val_scaled,
884
+ test_scaled,
885
+ features=self.all_features,
886
+ timesteps=self.max_timesteps,
887
+ )
888
+
889
+ return data, reshaped_data
890
+
891
+ def inference(self):
892
+ # self.train is new data here
893
+ scaler_x = joblib.load(f"{self.preprocessing_dir}/scaler_x.pkl")
894
+ scaled_data = scaler_x.transform(self.train)
895
+ scaled_data = pd.DataFrame(
896
+ scaled_data, columns=self.train.columns, index=self.train.index
897
+ )
898
+
899
+ reshaped_data = None
900
+ if (
901
+ any(all_models[i].get("recurrent") for i in self.models_idx)
902
+ and self.time_series
903
+ ):
904
+ # we need to make sure we have max_timesteps of data after grouping by group_column
905
+ if (
906
+ self.group_column
907
+ and scaled_data.groupby(self.group_column).size().min()
908
+ < self.max_timesteps
909
+ ) or scaled_data.shape[0] < self.max_timesteps:
910
+ raise ValueError(
911
+ f"Not enough data for group_column {self.group_column} to reshape data for recurrent models"
912
+ )
913
+
914
+ # reshaping data for recurrent models
915
+ logger.info("Reshaping data for recurrent models...")
916
+ reshaped_data = self.reshape_time_series(
917
+ scaled_data,
918
+ features=self.all_features,
919
+ timesteps=self.max_timesteps,
920
+ )
921
+
922
+ return self.train, scaled_data, reshaped_data
923
+
924
+ # scaling
925
+ def scale_data(
926
+ self,
927
+ df: pd.DataFrame,
928
+ save_dir: str,
929
+ scaler_x=None,
930
+ scalers_y: Optional[dict] = None,
931
+ ):
932
+ logger.info("Scale data...")
933
+ X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
934
+
935
+ if scaler_x:
936
+ X_scaled = pd.DataFrame(
937
+ scaler_x.transform(X), columns=list(X.columns), index=X.index
938
+ )
939
+ else:
940
+ scaler_x = StandardScaler() # MinMaxScaler(feature_range=(-1,1))
941
+ X_scaled = pd.DataFrame(
942
+ scaler_x.fit_transform(X), columns=list(X.columns), index=X.index
943
+ )
944
+ if save_dir:
945
+ joblib.dump(scaler_x, f"{save_dir}/scaler_x.pkl")
946
+
947
+ # Determine which targets need to be scaled
948
+ targets_numbers_to_scale = [
949
+ i for i in self.target_numbers if i not in self.target_clf
950
+ ]
951
+
952
+ # Dictionary to store scaled target data
953
+ scaled_targets = {}
954
+
955
+ if scalers_y:
956
+ for target_number in targets_numbers_to_scale:
957
+ y = df[[f"TARGET_{target_number}"]]
958
+ scaled_targets[target_number] = pd.DataFrame(
959
+ scalers_y[f"scaler_y_{target_number}"].transform(y.values),
960
+ columns=y.columns,
961
+ index=y.index,
962
+ )
963
+ else:
964
+ scalers_y = {}
965
+ for target_number in targets_numbers_to_scale:
966
+ scaler_y = StandardScaler()
967
+ y = df[[f"TARGET_{target_number}"]]
968
+
969
+ scaled_y = pd.DataFrame(
970
+ scaler_y.fit_transform(y.values),
971
+ columns=y.columns,
972
+ index=y.index,
973
+ )
974
+ if save_dir:
975
+ joblib.dump(scaler_y, f"{save_dir}/scaler_y_{target_number}.pkl")
976
+
977
+ scalers_y[f"scaler_y_{target_number}"] = scaler_y
978
+ scaled_targets[target_number] = scaled_y
979
+
980
+ # Reconstruct y_scaled in the original order
981
+ y_scaled = pd.concat(
982
+ [
983
+ scaled_targets[target_number]
984
+ for target_number in targets_numbers_to_scale
985
+ ],
986
+ axis=1,
987
+ )
988
+ y_not_scaled = df[
989
+ df.columns.intersection([f"TARGET_{i}" for i in self.target_clf])
990
+ ]
991
+
992
+ # Ensure the final DataFrame keeps the original order
993
+ df_scaled = pd.concat(
994
+ [X_scaled, y_scaled, y_not_scaled],
995
+ axis=1,
996
+ )[
997
+ df.columns
998
+ ] # Reorder columns to match original `df`
999
+
1000
+ if not df_scaled.columns.equals(df.columns):
1001
+ raise Exception("Columns are not in the same order after scaling.")
1002
+
1003
+ return df_scaled, scaler_x, scalers_y
1004
+
1005
+ # Reshape into 3D tensors for recurrent models
1006
+ def reshape_time_series(
1007
+ self,
1008
+ train: pd.DataFrame,
1009
+ val: pd.DataFrame,
1010
+ test: pd.DataFrame,
1011
+ features: list,
1012
+ timesteps: int = 120,
1013
+ ):
1014
+ # always scale for recurrent layers: train should already be scaled here
1015
+ group_column = self.group_column
1016
+
1017
+ target_columns = train.columns.intersection(
1018
+ [f"TARGET_{i}" for i in self.target_numbers]
1019
+ )
1020
+
1021
+ data = pd.concat([train, val, test], axis=0)
1022
+
1023
+ def reshape_df(df: pd.DataFrame, group_series: pd.Series, timesteps: int):
1024
+ fill_value = [[[0] * len(df.columns)]]
1025
+
1026
+ def shiftsum(x, timesteps: int):
1027
+ tmp = x.copy()
1028
+ for i in range(1, timesteps):
1029
+ tmp = x.shift(i, fill_value=fill_value) + tmp
1030
+ return tmp
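+ # shiftsum concatenates, for each row, the feature-vector lists of the previous `timesteps` rows (oldest first, current row last);
+ # rows near the start of a group are padded with zero vectors via fill_value.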
1031
+
1032
+ logger.info("Grouping each feature in a unique column with list...")
1033
+ df_reshaped = df.apply(list, axis=1).apply(lambda x: [list(x)])
1034
+ df_reshaped = pd.concat([df_reshaped, group_series], axis=1)
1035
+
1036
+ logger.info("Grouping method stock and creating timesteps...")
1037
+ df_reshaped = (
1038
+ df_reshaped.groupby(group_column)[0]
1039
+ .apply(lambda x: shiftsum(x, timesteps))
1040
+ .reset_index(group_column, drop=True)
1041
+ .rename("RECURRENT_FEATURES")
1042
+ )
1043
+ df_reshaped = pd.DataFrame(df_reshaped)
1044
+
1045
+ return df_reshaped
1046
+
1047
+ data_reshaped = reshape_df(data[features], data[group_column], timesteps)
1048
+
1049
+ data_reshaped[target_columns] = data[target_columns]
1050
+
1051
+ logger.info("Separating train, val, test data and creating np arrays...")
1052
+ train_reshaped = data_reshaped.loc[train.index]
1053
+ val_reshaped = data_reshaped.loc[val.index]
1054
+ test_reshaped = data_reshaped.loc[test.index]
1055
+
1056
+ x_train_reshaped = np.array(
1057
+ train_reshaped["RECURRENT_FEATURES"].values.tolist()
1058
+ )
1059
+ y_train_reshaped = np.array(train_reshaped[target_columns].reset_index())
1060
+ x_val_reshaped = np.array(val_reshaped["RECURRENT_FEATURES"].values.tolist())
1061
+ y_val_reshaped = np.array(val_reshaped[target_columns].reset_index())
1062
+ x_test_reshaped = np.array(test_reshaped["RECURRENT_FEATURES"].values.tolist())
1063
+ y_test_reshaped = np.array(test_reshaped[target_columns].reset_index())
1064
+
1065
+ reshaped_data = {
1066
+ "x_train_reshaped": x_train_reshaped,
1067
+ "y_train_reshaped": y_train_reshaped,
1068
+ "x_val_reshaped": x_val_reshaped,
1069
+ "y_val_reshaped": y_val_reshaped,
1070
+ "x_test_reshaped": x_test_reshaped,
1071
+ "y_test_reshaped": y_test_reshaped,
1072
+ }
1073
+
1074
+ return reshaped_data
1075
+
1076
+
1077
+ # utils
1078
+ # TODO: can we use this to select the ideal number of features?
1079
+ def feature_selection_analysis(feature_selection_id: int, n_components: int = 5):
1080
+
1081
+ feature_selection = FeatureSelection.get(feature_selection_id)
1082
+ dataset_dir = feature_selection.dataset.path
1083
+ features = [f.name for f in feature_selection.features]
1084
+ target = feature_selection.target.name
1085
+ target_number = target.split("_")[1]
1086
+
1087
+ train, val, _test, train_scaled, val_scaled, _test_scaled = load_train_data(
1088
+ dataset_dir, target_number, target_type=feature_selection.target.type
1089
+ )
1090
+ train = train[features + [target]]
1091
+ train_scaled = train_scaled[features + [target]]
1092
+
1093
+ logger.info("Plot features correlation with target variable...")
1094
+
1095
+ correlations = train.corr()[target].sort_values(ascending=False)
1096
+
1097
+ plt.figure(figsize=(12, 6))
1098
+ sns.barplot(x=correlations.index, y=correlations.values, palette="coolwarm")
1099
+ plt.xticks(rotation=90)
1100
+ plt.title("Feature correlation with target variable")
1101
+ plt.ylabel("Correlation")
1102
+ plt.xlabel("Features")
1103
+ plt.grid(axis="y", linestyle="--", alpha=0.7)
1104
+ plt.show()
1105
+
1106
+ plt.figure(figsize=(14, 10))
1107
+ sns.heatmap(train.corr(), annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5)
1108
+ plt.title("Correlation Matrix")
1109
+ plt.show()
1110
+
1111
+ logger.info("Plot explained variance by components...")
1112
+ n_components = min(len(features), n_components)
1113
+ pca = PCA(n_components=n_components)
1114
+ X_pca = pca.fit_transform(train_scaled)
1115
+
1116
+ explained_variance = pca.explained_variance_ratio_
1117
+
1118
+ plt.figure(figsize=(10, 7))
1119
+ plt.bar(
1120
+ range(1, len(explained_variance) + 1),
1121
+ explained_variance,
1122
+ label="Explained Variance",
1123
+ )
1124
+ plt.plot(
1125
+ range(1, len(explained_variance) + 1),
1126
+ np.cumsum(explained_variance),
1127
+ label="Cumulative Explained Variance",
1128
+ color="orange",
1129
+ marker="o",
1130
+ )
1131
+ plt.title("Explained Variance by Components")
1132
+ plt.xlabel("Number of Components")
1133
+ plt.ylabel("Explained Variance")
1134
+ plt.legend()
1135
+ plt.grid(axis="y", linestyle="--", alpha=0.7)
1136
+ plt.show()
1137
+
1138
+ logger.info("Main PCA vs target variable...")
1139
+ plt.scatter(
1140
+ X_pca[:, 0],
1141
+ X_pca[:, 1],
1142
+ c=train[target],
1143
+ cmap="coolwarm",
1144
+ alpha=0.7,
1145
+ )
1146
+ plt.title("PCA of target variable")
1147
+ plt.xlabel("First Principal Component")
1148
+ plt.ylabel("Second Principal Component")
1149
+ plt.colorbar()
1150
+ plt.show()
1151
+
1152
+
1153
+ def get_features_by_types(df: pd.DataFrame, sample_categorical_threshold: int = 15):
1154
+ categorical_features = [
1155
+ col
1156
+ for col in df.columns
1157
+ if df[col].nunique() <= sample_categorical_threshold
1158
+ and df[col].dtype in ["int64", "Int64"]
1159
+ ]
1160
+ df_categorical = df[categorical_features]
1161
+ logger.info(f"Number of categorical features: {len(categorical_features)}")
1162
+
1163
+ numerical_features = list(set(df.columns).difference(set(categorical_features)))
1164
+ df_numerical = df[numerical_features]
1165
+ logger.info(f"Number of numerical features: {len(numerical_features)}")
1166
+
1167
+ return df_categorical, df_numerical
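+ # Usage sketch (illustrative names; a real Dataset row and train DataFrame come from the upstream preprocessing pipeline):
+ # engine = FeatureSelectionEngine(train=train, dataset=dataset, target_number=11, target_clf=TARGETS_MCLF)
+ # best_features = engine.run(single_process=True)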