lecrapaud 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lecrapaud/__init__.py +1 -0
- lecrapaud/api.py +271 -0
- lecrapaud/config.py +25 -0
- lecrapaud/db/__init__.py +1 -0
- lecrapaud/db/alembic/README +1 -0
- lecrapaud/db/alembic/env.py +78 -0
- lecrapaud/db/alembic/script.py.mako +26 -0
- lecrapaud/db/alembic/versions/2025_04_06_1738-7390745388e4_initial_setup.py +295 -0
- lecrapaud/db/alembic/versions/2025_04_06_1755-40cd8d3e798e_unique_constraint_for_data.py +30 -0
- lecrapaud/db/alembic/versions/2025_05_23_1724-2360941fa0bd_longer_string.py +52 -0
- lecrapaud/db/alembic/versions/2025_05_27_1159-b96396dcfaff_add_env_to_trading_tables.py +34 -0
- lecrapaud/db/alembic/versions/2025_05_27_1337-40cbfc215f7c_fix_nb_character_on_portfolio.py +39 -0
- lecrapaud/db/alembic/versions/2025_05_27_1526-3de994115317_to_datetime.py +36 -0
- lecrapaud/db/alembic/versions/2025_05_27_2003-25c227c684f8_add_fees_to_transactions.py +30 -0
- lecrapaud/db/alembic/versions/2025_05_27_2047-6b6f2d38e9bc_double_instead_of_float.py +132 -0
- lecrapaud/db/alembic/versions/2025_05_31_1111-c175e4a36d68_generalise_stock_to_group.py +36 -0
- lecrapaud/db/alembic/versions/2025_05_31_1256-5681095bfc27_create_investment_run_and_portfolio_.py +62 -0
- lecrapaud/db/alembic/versions/2025_05_31_1806-339927587383_add_investment_run_id.py +107 -0
- lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +38 -0
- lecrapaud/db/alembic/versions/2025_05_31_1849-3b8550297e8e_change_date_to_datetime.py +44 -0
- lecrapaud/db/alembic/versions/2025_05_31_1852-e6b8c95d8243_add_date_to_portfolio_history.py +30 -0
- lecrapaud/db/alembic/versions/2025_06_10_1136-db8cdd83563a_addnewsandoptiontodata.py +32 -0
- lecrapaud/db/alembic/versions/2025_06_17_1652-c45f5e49fa2c_make_fields_nullable.py +89 -0
- lecrapaud/db/models/__init__.py +11 -0
- lecrapaud/db/models/base.py +181 -0
- lecrapaud/db/models/dataset.py +129 -0
- lecrapaud/db/models/feature.py +45 -0
- lecrapaud/db/models/feature_selection.py +125 -0
- lecrapaud/db/models/feature_selection_rank.py +79 -0
- lecrapaud/db/models/model.py +40 -0
- lecrapaud/db/models/model_selection.py +63 -0
- lecrapaud/db/models/model_training.py +62 -0
- lecrapaud/db/models/score.py +65 -0
- lecrapaud/db/models/target.py +67 -0
- lecrapaud/db/session.py +45 -0
- lecrapaud/directory_management.py +28 -0
- lecrapaud/experiment.py +64 -0
- lecrapaud/feature_engineering.py +846 -0
- lecrapaud/feature_selection.py +1167 -0
- lecrapaud/integrations/openai_integration.py +225 -0
- lecrapaud/jobs/__init__.py +13 -0
- lecrapaud/jobs/config.py +17 -0
- lecrapaud/jobs/scheduler.py +36 -0
- lecrapaud/jobs/tasks.py +57 -0
- lecrapaud/model_selection.py +1671 -0
- lecrapaud/predictions.py +292 -0
- lecrapaud/preprocessing.py +984 -0
- lecrapaud/search_space.py +848 -0
- lecrapaud/services/__init__.py +0 -0
- lecrapaud/services/embedding_categorical.py +71 -0
- lecrapaud/services/indicators.py +309 -0
- lecrapaud/speed_tests/experiments.py +139 -0
- lecrapaud/speed_tests/test-gpu-bilstm.ipynb +261 -0
- lecrapaud/speed_tests/test-gpu-resnet.ipynb +166 -0
- lecrapaud/speed_tests/test-gpu-transformers.ipynb +254 -0
- lecrapaud/speed_tests/tests.ipynb +145 -0
- lecrapaud/speed_tests/trash.py +37 -0
- lecrapaud/training.py +239 -0
- lecrapaud/utils.py +246 -0
- lecrapaud-0.1.0.dist-info/LICENSE +201 -0
- lecrapaud-0.1.0.dist-info/METADATA +105 -0
- lecrapaud-0.1.0.dist-info/RECORD +63 -0
- lecrapaud-0.1.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,1167 @@
import pandas as pd
import numpy as np
from datetime import timedelta
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
from typing import Optional
from tqdm import tqdm
import warnings
from concurrent.futures import ProcessPoolExecutor, as_completed
import joblib
import re
from pathlib import Path

os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())

# feature selection
from sklearn.feature_selection import (
    f_classif,
    f_regression,
    mutual_info_classif,
    mutual_info_regression,
    chi2,
    SelectPercentile,
    SelectFpr,
    RFE,
    SelectFromModel,
)
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import root_mean_squared_error, log_loss, make_scorer
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.stats import spearmanr, kendalltau

# Internal
from lecrapaud.directory_management import tmp_dir, clean_directory
from lecrapaud.utils import logger
from lecrapaud.config import PYTHON_ENV
from lecrapaud.db import (
    Dataset,
    Target,
    Feature,
    FeatureSelection,
    FeatureSelectionRank,
)
from lecrapaud.db.session import get_db
from lecrapaud.search_space import all_models

# Variables for targets handling
TARGETS_MCLF = [11]
GROUPING_COLUMN = "STOCK"

# Annoying Warnings
warnings.filterwarnings("ignore", category=FutureWarning)


def load_train_data(dataset_dir, target_number, target_type="regression"):
    data_dir = f"{dataset_dir}/data"

    logger.info("Loading data...")
    train = joblib.load(f"{data_dir}/train.pkl")
    val = joblib.load(f"{data_dir}/val.pkl")
    test = joblib.load(f"{data_dir}/test.pkl")
    try:
        train_scaled = joblib.load(f"{data_dir}/train_scaled.pkl")
        val_scaled = joblib.load(f"{data_dir}/val_scaled.pkl")
        test_scaled = joblib.load(f"{data_dir}/test_scaled.pkl")
    except FileNotFoundError:
        train_scaled = None
        val_scaled = None
        test_scaled = None

    return train, val, test, train_scaled, val_scaled, test_scaled


class FeatureSelectionEngine:
    def __init__(self, train, dataset, target_number, target_clf, **kwargs):
        self.dataset = dataset
        self.train = train
        self.target_number = target_number
        self.target_clf = target_clf

        self.target_type = (
            "classification" if self.target_number in self.target_clf else "regression"
        )
        self.percentile = self.dataset.percentile
        self.corr_threshold = self.dataset.corr_threshold
        self.max_features = self.dataset.max_features

        self.dataset_dir = self.dataset.path
        self.dataset_id = self.dataset.id
        self.data_dir = f"{self.dataset_dir}/data"
        self.preprocessing_dir = f"{self.dataset_dir}/preprocessing"
        self.fs_dir_target = (
            f"{self.dataset_dir}/{f"TARGET_{self.target_number}"}/feature_selection"
        )
        os.makedirs(self.fs_dir_target, exist_ok=True)

    # Main feature selection function
    def run(
        self,
        single_process: bool = False,
    ):
        """Function to do feature selection with a range of different feature selection technics

        Args:
            - train (pd.DataFrame): a pandas train set
            - target_number (in): a target, targets need to be name ``TARGET_{n}```
            - single_process (bool): if True, run all feature selection methods in a single process. If False, run them in parallel.
        """
        target_number = self.target_number
        target_type = self.target_type
        if PYTHON_ENV != "Test":
            fs_dir_target = self.fs_dir_target
        else:
            fs_dir_target = None

        # Create the feature selection in db
        target = Target.find_by(name=f"TARGET_{target_number}")
        percentile = self.percentile
        corr_threshold = self.corr_threshold
        max_features = self.max_features

        feature_selection = FeatureSelection.upsert(
            match_fields=["target_id", "dataset_id"],
            target_id=target.id,
            dataset_id=self.dataset_id,
        )

        if feature_selection.best_features_path:
            return joblib.load(feature_selection.best_features_path)

        self.X = self.train.loc[:, ~self.train.columns.str.contains("^TARGET_")]
        self.y = self.train[f"TARGET_{target_number}"]

        logger.info(f"Starting feature selection for TARGET_{target_number}...")
        clean_directory(self.fs_dir_target)

        # Let's start by removing extremly correlated features
        # This is needed to reduce nb of feature but also for methods such as anova or chi2 that requires independent features
        # TODO: we could also remove low variance features
        features_uncorrelated, features_correlated = self.remove_correlated_features(
            90, vizualize=False
        )
        self.X = self.X[features_uncorrelated]

        logger.debug(
            f"""
            \nWe first have removed {len(features_correlated)} features with correlation greater than 90%
            \nWe are looking to capture {percentile}% of {len(self.X.columns)} features, i.e. {int(len(self.X.columns)*percentile/100)} features, with different feature selection methods
            \nWe will then remove above {corr_threshold}% correlated features, keeping the one with the best ranks
            \nFinally, we will keep only the {max_features} best ranked features
            """
        )

        start = time.time()

        # handling categorical features (only if classification)
        self.X_categorical, self.X_numerical = get_features_by_types(self.X)

        if target_type == "classification":
            feat_scores = self.select_categorical_features(
                percentile=percentile, save_dir=fs_dir_target
            )
            with get_db() as db:
                for row in feat_scores.itertuples(index=False):
                    feature = Feature.find_by(name=row.features, db=db)
                    FeatureSelectionRank.upsert(
                        ["feature_selection_id", "feature_id", "method"],
                        db=db,
                        score=row.score,
                        pvalue=row.pvalue,
                        support=row.support,
                        rank=row.rank,
                        method=row.method,
                        training_time=row.training_time,
                        feature_selection_id=feature_selection.id,
                        feature_id=feature.id,
                    )
            categorical_features_selected = feat_scores[feat_scores["support"]][
                "features"
            ].values.tolist()

        results = []
        params = {"percentile": percentile, "save_dir": fs_dir_target}
        if single_process:
            results = [
                self.select_feature_by_linear_correlation(**params),
                self.select_feature_by_nonlinear_correlation(**params),
                self.select_feature_by_mi(**params),
                self.select_feature_by_feat_imp(**params),
                self.select_feature_by_rfe(**params),
                # self.select_feature_by_sfs(
                #     **params
                # ),  # TODO: this is taking too long
            ]
        else:
            # Use ProcessPoolExecutor to run tasks in parallel
            # TODO: not sure it's efficient from previous tests... especially because rfe and sfs methods are doing parallel processing already, this can create overhead
            with ProcessPoolExecutor() as executor:
                # Submit different functions to be executed in parallel
                futures = [
                    executor.submit(
                        self.select_feature_by_linear_correlation,
                        **params,
                    ),
                    executor.submit(
                        self.select_feature_by_nonlinear_correlation,
                        **params,
                    ),
                    executor.submit(
                        self.select_feature_by_mi,
                        **params,
                    ),
                    executor.submit(
                        self.select_feature_by_feat_imp,
                        **params,
                    ),
                    executor.submit(
                        self.select_feature_by_rfe,
                        **params,
                    ),
                    # executor.submit(
                    #     self.select_feature_by_sfs,
                    #     **params,
                    # ),  # TODO: this is taking too long
                ]

                # Wait for all futures to complete and gather the results
                with tqdm(total=len(futures)) as pbar:
                    for future in as_completed(futures):
                        results.append(future.result())
                        pbar.update(1)

        logger.info(f"Finished feature selection for target {target_number}")

        stop = time.time()

        # Once all tasks are completed, start by inserting results to db
        feat_scores = pd.concat(
            results,
            axis=0,
        )

        logger.info("Inserting feature selection results to db...")
        rows = []
        with get_db() as db:
            feature_map = {f.name: f.id for f in Feature.get_all(db=db, limit=20000)}
            for row in feat_scores.itertuples(index=False):
                feature_id = feature_map.get(row.features)
                if not feature_id:
                    continue  # or raise if feature must exist

                rows.append(
                    {
                        "feature_selection_id": feature_selection.id,
                        "feature_id": feature_id,
                        "method": row.method,
                        "score": row.score,
                        "pvalue": None if pd.isna(row.pvalue) else row.pvalue,
                        "support": row.support,
                        "rank": row.rank,
                        "training_time": row.training_time,
                    }
                )

            if len(rows) == 0:
                raise ValueError(f"No features selected for TARGET_{target_number}")

            FeatureSelectionRank.bulk_upsert(rows=rows, db=db)

        # Merge the results
        logger.info("Merging feature selection methods...")
        features_selected = feat_scores[feat_scores["support"]][["features", "rank"]]
        features_selected.sort_values("rank", inplace=True)
        features_selected.drop_duplicates("features", inplace=True)

        features_selected_list = features_selected["features"].values.tolist()

        # analysis 1
        features_selected_by_every_methods = set(results[0]["features"].values.tolist())
        for df in results[1:]:
            features_selected_by_every_methods &= set(
                df["features"].values.tolist()
            )  # intersection
        features_selected_by_every_methods = list(features_selected_by_every_methods)
        logger.debug(
            f"We selected {len(features_selected_list)} features and {len(features_selected_by_every_methods)} were selected unanimously:"
        )
        logger.debug(features_selected_by_every_methods)
        if PYTHON_ENV != "Test":
            pd.Series(features_selected_list).to_csv(
                f"{fs_dir_target}/features_before_corr.csv",
                index=True,
                header=True,
                index_label="ID",
            )

        # removing correlated features
        self.X = self.X[features_selected_list]
        features, features_correlated = self.remove_correlated_features(corr_threshold)
        if PYTHON_ENV != "Test":
            pd.Series(features).to_csv(
                f"{fs_dir_target}/features_before_max.csv",
                index=True,
                header=True,
                index_label="ID",
            )
        features = features[:max_features]

        # adding categorical features selected
        features += (
            categorical_features_selected if target_type == "classification" else []
        )
        logger.debug(
            f"Final pre-selection: {len(features)} features below {corr_threshold}% out of {len(features_selected_list)} features, and rejected {len(features_correlated)} features, {100*len(features)/len(features_selected_list):.2f}% features selected"
        )

        # analysis 2
        features_selected_by_every_methods_uncorrelated = list(
            set(features) & set(features_selected_by_every_methods)
        )
        logger.debug(
            f"In this pre-selection, there is {len(features_selected_by_every_methods_uncorrelated)} features from the {len(features_selected_by_every_methods)} selected unanimously\n"
        )
        logger.debug(
            features_selected[
                features_selected["features"].isin(features)
            ].to_markdown()
        )

        # save to path
        best_features_path = Path(
            f"{self.preprocessing_dir}/features_{target_number}.pkl"
        ).resolve()
        if PYTHON_ENV != "Test":
            joblib.dump(features, best_features_path)

        # save in db
        db_features = Feature.filter(name__in=features)
        # Order matters, to keep the same order in db as in features, we need: map features by name
        feature_by_name = {f.name: f for f in db_features}
        # Reorder them according to original `features` list
        ordered_db_features = [
            feature_by_name[name] for name in features if name in feature_by_name
        ]

        feature_selection = FeatureSelection.get(feature_selection.id)
        feature_selection = feature_selection.add_features(ordered_db_features)
        feature_selection.training_time = stop - start
        feature_selection.best_features_path = best_features_path
        feature_selection.save()

        return features

    # Remove correlation
    # ------------------

    def remove_correlated_features(self, corr_threshold: int, vizualize: bool = False):
        X = self.X
        features = X.columns
        # Create correlation matrix, select upper triangle & remove features with correlation greater than threshold
        corr_matrix = X[features].corr().abs()

        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        features_uncorrelated = [
            column
            for column in upper.columns
            if all(upper[column].dropna() <= corr_threshold / 100)
        ]
        features_correlated = [
            column
            for column in upper.columns
            if any(upper[column] > corr_threshold / 100)
        ]

        if vizualize:
            features_selected_visualization = (
                X[features]
                .corr()
                .where(np.triu(np.ones(len(features)), k=1).astype(bool))
                .fillna(0)
            )
            # Plot the heatmap
            plt.figure(figsize=(10, 8))
            sns.heatmap(
                corr_matrix,
                annot=True,
                cmap="coolwarm",
                center=0,
                linewidths=1,
                linecolor="black",
            )
            plt.title(f"Correlation Matrix")
            plt.show()

            logger.info(f"\n{features_selected_visualization.describe().to_string()}")
            logger.info(f"\n{features_selected_visualization.to_string()}")
        return features_uncorrelated, features_correlated

    # Filter methods
    # ----------------

    def select_categorical_features(self, percentile, save_dir: Optional[str] = None):
        X, y = self.X_categorical, self.y

        start = time.time()
        logger.debug("Running Chi2 for categorical features...")
        feat_selector = SelectPercentile(chi2, percentile=percentile).fit(X, y)
        feat_scores = pd.DataFrame()
        feat_scores["score"] = feat_selector.scores_
        feat_scores["pvalue"] = feat_selector.pvalues_
        feat_scores["support"] = feat_selector.get_support()
        feat_scores["features"] = X.columns
        feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
        feat_scores["method"] = "Chi2"
        feat_scores.sort_values("rank", ascending=True, inplace=True)
        stop = time.time()
        training_time = timedelta(seconds=(stop - start)).total_seconds()
        feat_scores["training_time"] = training_time

        logger.debug(
            f"Chi2 evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
        )

        feat_scores.to_csv(
            f"{save_dir}/Chi2.csv", index=True, header=True, index_label="ID"
        )

        return feat_scores

    # Linear correlation (Person's R for regression and ANOVA for classification)
    def select_feature_by_linear_correlation(
        self, percentile: int = 20, save_dir: Optional[str] = None
    ):
        X, y, target_type = self.X_numerical, self.y, self.target_type

        start = time.time()
        test_type = "Person's R" if target_type == "regression" else "ANOVA"
        logger.debug(f"Running {test_type}...")

        model = f_regression if target_type == "regression" else f_classif
        feat_selector = SelectPercentile(model, percentile=percentile).fit(X, y)
        feat_scores = pd.DataFrame()
        feat_scores["score"] = feat_selector.scores_
        feat_scores["pvalue"] = feat_selector.pvalues_
        feat_scores["support"] = feat_selector.get_support()
        feat_scores["features"] = X.columns
        feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
        feat_scores["method"] = test_type
        feat_scores.sort_values("rank", ascending=True, inplace=True)
        stop = time.time()
        training_time = timedelta(seconds=(stop - start)).total_seconds()
        feat_scores["training_time"] = training_time

        logger.debug(
            f"{test_type} evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
        )

        feat_scores.to_csv(
            f"{save_dir}/{test_type}.csv",
            index=True,
            header=True,
            index_label="ID",
        )

        return feat_scores

    # Non-Linear correlation (Spearsman's R for regression and Kendall's Tau for classification)
    def select_feature_by_nonlinear_correlation(
        self, percentile: int = 20, save_dir: Optional[str] = None
    ):
        X, y, target_type = self.X_numerical, self.y, self.target_type

        start = time.time()

        def model(X_model, y_model):
            X_model = pd.DataFrame(X_model)
            y_model = pd.Series(y_model)

            method = "spearman" if target_type == "regression" else "kendall"

            corr_scores = []
            p_values = []

            for col in X_model.columns:
                if method == "spearman":
                    corr, pval = spearmanr(X_model[col], y_model)
                else:  # Kendall's Tau for classification
                    corr, pval = kendalltau(X_model[col], y_model)

                corr_scores.append(abs(corr))  # Keeping absolute correlation
                p_values.append(pval)

            return np.array(corr_scores), np.array(p_values)

        test_type = "Spearman's R" if target_type == "regression" else "Kendall's Tau"
        logger.debug(f"Running {test_type}...")

        feat_selector = SelectPercentile(model, percentile=percentile).fit(X, y)
        feat_scores = pd.DataFrame()
        feat_scores["score"] = feat_selector.scores_
        feat_scores["pvalue"] = feat_selector.pvalues_
        feat_scores["support"] = feat_selector.get_support()
        feat_scores["features"] = X.columns
        feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
        feat_scores["method"] = test_type
        feat_scores.sort_values("rank", ascending=True, inplace=True)
        stop = time.time()
        training_time = timedelta(seconds=(stop - start)).total_seconds()
        feat_scores["training_time"] = training_time

        logger.debug(
            f"{test_type} evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
        )

        feat_scores.to_csv(
            f"{save_dir}/{test_type}.csv",
            index=True,
            header=True,
            index_label="ID",
        )

        return feat_scores

    # Mutual Information
    def select_feature_by_mi(
        self, percentile: int = 20, save_dir: Optional[str] = None
    ):
        X, y, target_type = self.X_numerical, self.y, self.target_type

        start = time.time()
        logger.debug("Running Mutual Information...")
        model = (
            mutual_info_regression
            if target_type == "regression"
            else mutual_info_classif
        )
        feat_selector = SelectPercentile(model, percentile=percentile).fit(X, y)
        feat_scores = pd.DataFrame()
        feat_scores["score"] = feat_selector.scores_
        feat_scores["support"] = feat_selector.get_support()
        feat_scores["features"] = X.columns
        feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
        feat_scores["method"] = "Mutual Information"
        feat_scores.sort_values("rank", ascending=True, inplace=True)
        stop = time.time()
        training_time = timedelta(seconds=(stop - start)).total_seconds()
        feat_scores["training_time"] = training_time

        logger.debug(
            f"MI evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
        )

        feat_scores.to_csv(
            f"{save_dir}/MI.csv", index=True, header=True, index_label="ID"
        )

        return feat_scores

    # Intrisic/embeedded method
    # ----------------

    # feature importance
    def select_feature_by_feat_imp(
        self, percentile: int = 20, save_dir: Optional[str] = None
    ):
        X, y, target_type = self.X_numerical, self.y, self.target_type

        start = time.time()
        logger.debug("Running Feature importance...")

        params = {
            "n_estimators": 500,
            "max_depth": 2**3,
            "random_state": 42,
            "n_jobs": -1,
        }

        estimator = (
            RandomForestClassifier(**params)
            if target_type == "classification"
            else RandomForestRegressor(**params)
        )

        feat_selector = SelectFromModel(
            estimator=estimator,
            threshold=-np.inf,
            max_features=int(percentile * X.shape[1] / 100),
        ).fit(X, y)

        feat_scores = pd.DataFrame()
        feat_scores["score"] = feat_selector.estimator_.feature_importances_
        feat_scores["support"] = feat_selector.get_support()
        feat_scores["features"] = X.columns
        feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
        feat_scores["method"] = "FI"
        feat_scores.sort_values("rank", ascending=True, inplace=True)

        stop = time.time()
        training_time = timedelta(seconds=(stop - start)).total_seconds()
        feat_scores["training_time"] = training_time

        logger.debug(
            f"Feat importance evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
        )

        feat_scores.to_csv(
            f"{save_dir}/FI.csv", index=True, header=True, index_label="ID"
        )

        return feat_scores

    # Wrapper method
    # ----------------

    # recursive feature elimination
    def select_feature_by_rfe(
        self, percentile: int = 20, save_dir: Optional[str] = None
    ):
        X, y, target_type = self.X_numerical, self.y, self.target_type

        start = time.time()
        logger.debug("Running Recursive Feature Elimination...")

        params = {
            "max_depth": 2**3,
            "random_state": 42,
        }
        estimator = (
            DecisionTreeClassifier(**params)
            if target_type == "classification"
            else DecisionTreeRegressor(**params)
        )
        rfe = RFE(estimator, n_features_to_select=percentile / 100, step=4, verbose=0)
        feat_selector = rfe.fit(X, y)

        feat_scores = pd.DataFrame(
            {
                "score": 0.0,  # Default feature importance
                "support": feat_selector.get_support(),
                "features": X.columns,
                "rank": 0,
                "method": "RFE",
            }
        )
        feat_scores.loc[
            feat_scores["features"].isin(feat_selector.get_feature_names_out()), "score"
        ] = list(feat_selector.estimator_.feature_importances_)
        feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
        feat_scores.sort_values("rank", ascending=True, inplace=True)

        stop = time.time()
        training_time = timedelta(seconds=(stop - start)).total_seconds()
        feat_scores["training_time"] = training_time

        logger.debug(
            f"RFE evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
        )

        feat_scores.to_csv(
            f"{save_dir}/RFE.csv", index=True, header=True, index_label="ID"
        )

        return feat_scores

    # SequentialFeatureSelector (loss based, possibility to do forwards or backwards selection or removal)
    def select_feature_by_sfs(
        self, percentile: int = 20, save_dir: Optional[str] = None
    ):
        X, y, target_type = self.X_numerical, self.y, self.target_type

        start = time.time()
        logger.debug("Running Sequential Feature Selection...")
        warnings.filterwarnings("ignore", category=FutureWarning)

        params = {
            "max_depth": 2**3,
            "random_state": 42,
        }
        estimator = (
            DecisionTreeClassifier(**params)
            if target_type == "classification"
            else DecisionTreeRegressor(**params)
        )

        n_splits = 3
        n_samples = len(X)
        test_size = int(n_samples / (n_splits + 4))
        tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)

        score_function = (
            make_scorer(
                log_loss, response_method="predict_proba"
            )  # logloss needs probabilities
            if target_type == "classification"
            else make_scorer(root_mean_squared_error)
        )  # we avoid greater_is_better = False because it make the score negative and mess up ranking

        sfs = SequentialFeatureSelector(
            estimator,
            k_features=int(percentile * X.shape[1] / 100),
            forward=True,
            floating=True,  # Enables dynamic feature elimination
            scoring=score_function,
            cv=tscv,
            n_jobs=-1,
            verbose=0,
        )

        feat_selector = sfs.fit(X, y)

        # Extract selected features and their scores
        selected_features = set(feat_selector.k_feature_names_)
        feat_subsets = feat_selector.subsets_

        # Create DataFrame for feature scores
        feat_scores = pd.DataFrame(
            {
                "features": X.columns,
                "support": X.columns.isin(
                    selected_features
                ),  # TODO: comprendre pourquoi le support n'est pas correct (les bons scores ne sont pas toujours choisis)
                "score": 1000,
                "rank": None,
                "method": "SFS",
            }
        )

        # Sort subsets by score (lower is better)
        sorted_subsets = sorted(
            feat_subsets.items(), key=lambda item: item[1]["avg_score"]
        )

        # Record score per feature (first appearance)
        feature_score_map = {}
        for step in sorted_subsets:
            step = step[1]
            for feature in step["feature_names"]:
                if feature not in feature_score_map:
                    feature_score_map[feature] = step["avg_score"]

        # Assign scores
        for feature, score in feature_score_map.items():
            feat_scores.loc[feat_scores["features"] == feature, "score"] = score

        # rank by score (lower = better)
        feat_scores["rank"] = (
            feat_scores["score"].rank(method="first", ascending=True).astype(int)
        )

        feat_scores.sort_values("rank", ascending=True, inplace=True)

        stop = time.time()
        training_time = timedelta(seconds=(stop - start)).total_seconds()
        feat_scores["training_time"] = training_time

        logger.debug(
            f"SFS evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
        )

        feat_scores.to_csv(
            f"{save_dir}/SFS.csv", index=True, header=True, index_label="ID"
        )

        return feat_scores


class PreprocessModel:

    def __init__(
        self,
        train,
        val,
        test,
        dataset,
        target_numbers,
        target_clf,
        models_idx,
        time_series,
        max_timesteps,
        group_column,
        date_column,
        **kwargs,
    ):
        self.dataset = dataset
        self.target_numbers = target_numbers
        self.target_clf = target_clf
        self.models_idx = models_idx
        self.time_series = time_series
        self.max_timesteps = max_timesteps
        self.group_column = group_column
        self.date_column = date_column

        self.dataset_dir = dataset.path
        self.data_dir = f"{self.dataset_dir}/data"

        self.all_features = dataset.get_all_features(
            date_column=date_column, group_column=group_column
        )
        columns_to_keep = self.all_features + [
            f"TARGET_{i}" for i in self.target_numbers
        ]
        duplicates = [
            col for col in set(columns_to_keep) if columns_to_keep.count(col) > 1
        ]
        if duplicates:
            raise ValueError(f"Doublons détectés dans columns_to_keep: {duplicates}")

        logger.info(self.all_features)

        self.train = train[columns_to_keep]
        if val:
            self.val = val[columns_to_keep]
        if test:
            self.test = test[columns_to_keep]

    def run(self):
        # save data
        if PYTHON_ENV != "Test":
            joblib.dump(self.train, f"{self.data_dir}/train.pkl")
            joblib.dump(self.val, f"{self.data_dir}/val.pkl")
            joblib.dump(self.test, f"{self.data_dir}/test.pkl")
            preprocessing_dir = f"{self.dataset_dir}/preprocessing"
        else:
            preprocessing_dir = None

        # scaling features
        if any(t not in self.target_clf for t in self.target_numbers) and any(
            all_models[i].get("need_scaling") for i in self.models_idx
        ):
            logger.info("Scaling features...")
            train_scaled, scaler_x, scalers_y = self.scale_data(
                self.train, save_dir=preprocessing_dir
            )
            val_scaled, _, _ = self.scale_data(
                self.val,
                save_dir=preprocessing_dir,
                scaler_x=scaler_x,
                scalers_y=scalers_y,
            )
            test_scaled, _, _ = self.scale_data(
                self.test,
                save_dir=preprocessing_dir,
                scaler_x=scaler_x,
                scalers_y=scalers_y,
            )
        else:
            train_scaled = None
            val_scaled = None
            test_scaled = None

        # save data
        if PYTHON_ENV != "Test":
            joblib.dump(train_scaled, f"{self.data_dir}/train_scaled.pkl")
            joblib.dump(val_scaled, f"{self.data_dir}/val_scaled.pkl")
            joblib.dump(test_scaled, f"{self.data_dir}/test_scaled.pkl")

        data = {
            "train": self.train,
            "val": self.val,
            "test": self.test,
            "train_scaled": train_scaled,
            "val_scaled": val_scaled,
            "test_scaled": test_scaled,
            "scalers_y": scalers_y,
        }

        # reshape data for time series
        reshaped_data = None
        if (
            any(all_models[i].get("recurrent") for i in self.models_idx)
            and self.time_series
        ):
            # reshaping data for recurrent models
            logger.info("Reshaping data for recurrent models...")
            reshaped_data = self.reshape_time_series(
                train_scaled,
                val_scaled,
                test_scaled,
                features=self.all_features,
                timesteps=self.max_timesteps,
            )

        return data, reshaped_data

    def inference(self):
        # self.train is new data here
        scaler_x = joblib.load(f"{self.preprocessing_dir}/scaler_x.pkl")
        scaled_data = scaler_x.transform(self.train)
        scaled_data = pd.DataFrame(
            scaled_data, columns=self.train.columns, index=self.train.index
        )

        reshaped_data = None
        if (
            any(all_models[i].get("recurrent") for i in self.models_idx)
            and self.time_series
        ):
            # we need to make sur we have max_timesteps of data after grouping by group_column
            if (
                self.group_column
                and scaled_data.groupby(self.group_column).size().min()
                < self.max_timesteps
            ) or scaled_data.shape[0] < self.max_timesteps:
                raise ValueError(
                    f"Not enough data for group_column {self.group_column} to reshape data for recurrent models"
                )

            # reshaping data for recurrent models
            logger.info("Reshaping data for recurrent models...")
            reshaped_data = self.reshape_time_series(
                scaled_data,
                features=self.all_features,
                timesteps=self.max_timesteps,
            )

        return self.train, scaled_data, reshaped_data

    # scaling
    def scale_data(
        self,
        df: pd.DataFrame,
        save_dir: str,
        scaler_x=None,
        scalers_y: Optional[list] = None,
    ):
        logger.info("Scale data...")
        X = df.loc[:, ~df.columns.str.contains("^TARGET_")]

        if scaler_x:
            X_scaled = pd.DataFrame(
                scaler_x.transform(X), columns=list(X.columns), index=X.index
            )
        else:
            scaler_x = StandardScaler()  # MinMaxScaler(feature_range=(-1,1))
            X_scaled = pd.DataFrame(
                scaler_x.fit_transform(X), columns=list(X.columns), index=X.index
            )
            if save_dir:
                joblib.dump(scaler_x, f"{save_dir}/scaler_x.pkl")

        # Determine which targets need to be scaled
        targets_numbers_to_scale = [
            i for i in self.target_numbers if i not in self.target_clf
        ]

        # Dictionary to store scaled target data
        scaled_targets = {}

        if scalers_y:
            for target_number in targets_numbers_to_scale:
                y = df[[f"TARGET_{target_number}"]]
                scaled_targets[target_number] = pd.DataFrame(
                    scalers_y[f"scaler_y_{target_number}"].transform(y.values),
                    columns=y.columns,
                    index=y.index,
                )
        else:
            scalers_y = {}
            for target_number in targets_numbers_to_scale:
                scaler_y = StandardScaler()
                y = df[[f"TARGET_{target_number}"]]

                scaled_y = pd.DataFrame(
                    scaler_y.fit_transform(y.values),
                    columns=y.columns,
                    index=y.index,
                )
                if save_dir:
                    joblib.dump(scaler_y, f"{save_dir}/scaler_y_{target_number}.pkl")

                scalers_y[f"scaler_y_{target_number}"] = scaler_y
                scaled_targets[target_number] = scaled_y

        # Reconstruct y_scaled in the original order
        y_scaled = pd.concat(
            [
                scaled_targets[target_number]
                for target_number in targets_numbers_to_scale
            ],
            axis=1,
        )
        y_not_scaled = df[
            df.columns.intersection([f"TARGET_{i}" for i in self.target_clf])
        ]

        # Ensure the final DataFrame keeps the original order
        df_scaled = pd.concat(
            [X_scaled, y_scaled, y_not_scaled],
            axis=1,
        )[
            df.columns
        ]  # Reorder columns to match original `df`

        if not df_scaled.columns.equals(df.columns):
            raise Exception("Columns are not in the same order after scaling.")

        return df_scaled, scaler_x, scalers_y

    # Reshape into 3D tensors for recurrent models
    def reshape_time_series(
        self,
        train: pd.DataFrame,
        val: pd.DataFrame,
        test: pd.DataFrame,
        features: list,
        timesteps: int = 120,
    ):
        # always scale for recurrent layers : train should be scaled
        group_column = self.group_column

        target_columns = train.columns.intersection(
            [f"TARGET_{i}" for i in self.target_numbers]
        )

        data = pd.concat([train, val, test], axis=0)

        def reshape_df(df: pd.DataFrame, group_series: pd.Series, timesteps: int):
            fill_value = [[[0] * len(df.columns)]]

            def shiftsum(x, timesteps: int):
                tmp = x.copy()
                for i in range(1, timesteps):
                    tmp = x.shift(i, fill_value=fill_value) + tmp
                return tmp

            logger.info("Grouping each feature in a unique column with list...")
            df_reshaped = df.apply(list, axis=1).apply(lambda x: [list(x)])
            df_reshaped = pd.concat([df_reshaped, group_series], axis=1)

            logger.info("Grouping method stock and creating timesteps...")
            df_reshaped = (
                df_reshaped.groupby(group_column)[0]
                .apply(lambda x: shiftsum(x, timesteps))
                .reset_index(group_column, drop=True)
                .rename("RECURRENT_FEATURES")
            )
            df_reshaped = pd.DataFrame(df_reshaped)

            return df_reshaped

        data_reshaped = reshape_df(data[features], data[group_column], timesteps)

        data_reshaped[target_columns] = data[target_columns]

        logger.info("Separating train, val, test data and creating np arrays...")
        train_reshaped = data_reshaped.loc[train.index]
        val_reshaped = data_reshaped.loc[val.index]
        test_reshaped = data_reshaped.loc[test.index]

        x_train_reshaped = np.array(
            train_reshaped["RECURRENT_FEATURES"].values.tolist()
        )
        y_train_reshaped = np.array(train_reshaped[target_columns].reset_index())
        x_val_reshaped = np.array(val_reshaped["RECURRENT_FEATURES"].values.tolist())
        y_val_reshaped = np.array(val_reshaped[target_columns].reset_index())
        x_test_reshaped = np.array(test_reshaped["RECURRENT_FEATURES"].values.tolist())
        y_test_reshaped = np.array(test_reshaped[target_columns].reset_index())

        reshaped_data = {
            "x_train_reshaped": x_train_reshaped,
            "y_train_reshaped": y_train_reshaped,
            "x_val_reshaped": x_val_reshaped,
            "y_val_reshaped": y_val_reshaped,
            "x_test_reshaped": x_test_reshaped,
            "y_test_reshaped": y_test_reshaped,
        }

        return reshaped_data


# utils
# TODO : can we use this to select the ideal number of features ?
def feature_selection_analysis(feature_selection_id: int, n_components: int = 5):

    feature_selection = FeatureSelection.get(feature_selection_id)
    dataset_dir = feature_selection.dataset.path
    features = [f.name for f in feature_selection.features]
    target = feature_selection.target.name
    target_number = target.split("_")[1]

    train, val, train_scaled, val_scaled, _scaler_y = load_train_data(
        dataset_dir, target_number, target_type=feature_selection.target.type
    )
    train = train[features + [target]]
    train_scaled = train_scaled[features + [target]]

    logger.info("Plot features correlation with target variable...")

    correlations = train.corr()[target].sort_values(ascending=False)

    plt.figure(figsize=(12, 6))
    sns.barplot(x=correlations.index, y=correlations.values, palette="coolwarm")
    plt.xticks(rotation=90)
    plt.title("Feature correlation with target variable")
    plt.ylabel("Correlation")
    plt.xlabel("Features")
    plt.grid(axis="y", linestyle="--", alpha=0.7)
    plt.show()

    plt.figure(figsize=(14, 10))
    sns.heatmap(train.corr(), annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5)
    plt.title("Correlation Matrix")
    plt.show()

    logger.info("Plot explained variance by components...")
    n_components = min(len(features), n_components)
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(train_scaled)

    explained_variance = pca.explained_variance_ratio_

    plt.figure(figsize=(10, 7))
    plt.bar(
        range(1, len(explained_variance) + 1),
        explained_variance,
        label="Explained Variance",
    )
    plt.plot(
        range(1, len(explained_variance) + 1),
        np.cumsum(explained_variance),
        label="Cumulative Explained Variance",
        color="orange",
        marker="o",
    )
    plt.title("Explained Variance by Components")
    plt.xlabel("Number of Components")
    plt.ylabel("Explained Variance")
    plt.legend()
    plt.grid(axis="y", linestyle="--", alpha=0.7)
    plt.show()

    logger.info("Main PCA vs target variable...")
    plt.scatter(
        X_pca[:, 0],
        X_pca[:, 1],
        c=train[target],
        cmap="coolwarm",
        alpha=0.7,
    )
    plt.title("PCA of target variable")
    plt.xlabel("First Principal Component")
    plt.ylabel("Second Principal Component")
    plt.colorbar()
    plt.show()


def get_features_by_types(df: pd.DataFrame, sample_categorical_threshold: int = 15):
    categorical_features = [
        col
        for col in df.columns
        if df[col].nunique() <= sample_categorical_threshold
        and df[col].dtype in ["int64", "Int64"]
    ]
    df_categorical = df[categorical_features]
    logger.info(f"Number of categorical features: {len(categorical_features)}")

    numerical_features = list(set(df.columns).difference(set(categorical_features)))
    df_numerical = df[numerical_features]
    logger.info(f"Number of numerical features: {len(numerical_features)}")

    return df_categorical, df_numerical