lecrapaud 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of lecrapaud might be problematic.
- lecrapaud/__init__.py +0 -0
- lecrapaud/config.py +16 -0
- lecrapaud/db/__init__.py +0 -0
- lecrapaud/db/alembic/README +1 -0
- lecrapaud/db/alembic/env.py +78 -0
- lecrapaud/db/alembic/script.py.mako +26 -0
- lecrapaud/db/alembic/versions/2025_04_06_1738-7390745388e4_initial_setup.py +295 -0
- lecrapaud/db/alembic/versions/2025_04_06_1755-40cd8d3e798e_unique_constraint_for_data.py +30 -0
- lecrapaud/db/alembic/versions/2025_05_23_1724-2360941fa0bd_longer_string.py +52 -0
- lecrapaud/db/alembic/versions/2025_05_27_1159-b96396dcfaff_add_env_to_trading_tables.py +34 -0
- lecrapaud/db/alembic/versions/2025_05_27_1337-40cbfc215f7c_fix_nb_character_on_portfolio.py +39 -0
- lecrapaud/db/alembic/versions/2025_05_27_1526-3de994115317_to_datetime.py +36 -0
- lecrapaud/db/alembic/versions/2025_05_27_2003-25c227c684f8_add_fees_to_transactions.py +30 -0
- lecrapaud/db/alembic/versions/2025_05_27_2047-6b6f2d38e9bc_double_instead_of_float.py +132 -0
- lecrapaud/db/alembic/versions/2025_05_31_1111-c175e4a36d68_generalise_stock_to_group.py +36 -0
- lecrapaud/db/alembic/versions/2025_05_31_1256-5681095bfc27_create_investment_run_and_portfolio_.py +62 -0
- lecrapaud/db/alembic/versions/2025_05_31_1806-339927587383_add_investment_run_id.py +107 -0
- lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +38 -0
- lecrapaud/db/alembic/versions/2025_05_31_1849-3b8550297e8e_change_date_to_datetime.py +44 -0
- lecrapaud/db/alembic/versions/2025_05_31_1852-e6b8c95d8243_add_date_to_portfolio_history.py +30 -0
- lecrapaud/db/alembic/versions/2025_06_10_1136-db8cdd83563a_addnewsandoptiontodata.py +32 -0
- lecrapaud/db/crud.py +179 -0
- lecrapaud/db/models/__init__.py +11 -0
- lecrapaud/db/models/base.py +6 -0
- lecrapaud/db/models/dataset.py +124 -0
- lecrapaud/db/models/feature.py +46 -0
- lecrapaud/db/models/feature_selection.py +126 -0
- lecrapaud/db/models/feature_selection_rank.py +80 -0
- lecrapaud/db/models/model.py +41 -0
- lecrapaud/db/models/model_selection.py +56 -0
- lecrapaud/db/models/model_training.py +54 -0
- lecrapaud/db/models/score.py +62 -0
- lecrapaud/db/models/target.py +59 -0
- lecrapaud/db/services.py +0 -0
- lecrapaud/db/setup.py +58 -0
- lecrapaud/directory_management.py +28 -0
- lecrapaud/feature_engineering.py +1119 -0
- lecrapaud/feature_selection.py +1229 -0
- lecrapaud/jobs/__init__.py +13 -0
- lecrapaud/jobs/config.py +17 -0
- lecrapaud/jobs/scheduler.py +36 -0
- lecrapaud/jobs/tasks.py +57 -0
- lecrapaud/model_selection.py +1571 -0
- lecrapaud/predictions.py +292 -0
- lecrapaud/search_space.py +844 -0
- lecrapaud/services/__init__.py +0 -0
- lecrapaud/services/embedding_categorical.py +71 -0
- lecrapaud/services/indicators.py +309 -0
- lecrapaud/speed_tests/experiments.py +139 -0
- lecrapaud/speed_tests/test-gpu-bilstm.ipynb +261 -0
- lecrapaud/speed_tests/test-gpu-resnet.ipynb +166 -0
- lecrapaud/speed_tests/test-gpu-transformers.ipynb +254 -0
- lecrapaud/speed_tests/tests.ipynb +145 -0
- lecrapaud/speed_tests/trash.py +37 -0
- lecrapaud/training.py +151 -0
- lecrapaud/utils.py +246 -0
- lecrapaud-0.4.0.dist-info/LICENSE +201 -0
- lecrapaud-0.4.0.dist-info/METADATA +103 -0
- lecrapaud-0.4.0.dist-info/RECORD +60 -0
- lecrapaud-0.4.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,1229 @@
import pandas as pd
import numpy as np
from datetime import timedelta
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
from typing import Optional
from tqdm import tqdm
import warnings
from concurrent.futures import ProcessPoolExecutor, as_completed
import joblib
import re
from pathlib import Path

os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())

# feature selection
from sklearn.feature_selection import (
    f_classif,
    f_regression,
    mutual_info_classif,
    mutual_info_regression,
    chi2,
    SelectPercentile,
    SelectFpr,
    RFE,
    SelectFromModel,
)
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import root_mean_squared_error, log_loss, make_scorer
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
import category_encoders as ce
from scipy.stats import spearmanr, kendalltau

# Scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Internal
from src.directory_management import tmp_dir, clean_directory
from src.utils import logger
from src.config import PYTHON_ENV
from src.db.models import (
    Dataset,
    Target,
    Feature,
    FeatureSelection,
    FeatureSelectionRank,
)
from src.db.setup import get_db

# Variables for targets handling
TARGETS_NUMBER = range(1, 15)
TARGETS_CLF = [2, 4, 6, 8, 9, 10, 11]
TARGETS_MCLF = [11]
GROUPING_COLUMN = "STOCK"
DATE_COLUMN = "DATE"

# Annoying Warnings
warnings.filterwarnings("ignore", category=FutureWarning)

def get_dataset_name(
    df, corr_threshold: int = 80, percentile: int = 20, max_features: int = 20
):
    number_of_groups = df[GROUPING_COLUMN].nunique()

    # Try to convert DATE column to datetime safely
    if pd.api.types.is_integer_dtype(df[DATE_COLUMN]):
        df_date = df[DATE_COLUMN].map(pd.Timestamp.fromordinal)
    else:
        df_date = pd.to_datetime(
            df[DATE_COLUMN], errors="coerce"
        )  # convert strings, datetime, etc.

    name = f"data_{number_of_groups}_{corr_threshold}_{percentile}_{max_features}_{df_date.min().date()}_{df_date.max().date()}"
    if PYTHON_ENV == "Test":
        name = f"test_{name}"
    return name

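For reference, a minimal sketch of the naming convention this function produces. The toy frame below is hypothetical and not part of the package; it assumes `PYTHON_ENV` is not `"Test"` (which would prepend `test_`).

```python
import pandas as pd

# Hypothetical two-stock frame spanning two dates
toy = pd.DataFrame(
    {
        "STOCK": ["AAA", "BBB", "AAA"],
        "DATE": pd.to_datetime(["2024-01-02", "2024-01-02", "2024-01-03"]),
    }
)
# With the defaults (corr_threshold=80, percentile=20, max_features=20),
# get_dataset_name(toy) evaluates to:
# "data_2_80_20_20_2024-01-02_2024-01-03"
```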
def create_sets_from_data(
    df: pd.DataFrame,
    corr_threshold: int = 80,
    percentile: int = 20,
    max_features: int = 20,
):

    df.sort_values([DATE_COLUMN, GROUPING_COLUMN], inplace=True)

    # Drop non-useful columns for training
    if "ISIN" in df.columns:
        df.drop(labels=["ISIN"], axis=1, inplace=True)
    if "SECURITY" in df.columns:
        df.drop(labels=["SECURITY"], axis=1, inplace=True)

    dates = df[DATE_COLUMN].unique()

    val_first_id = int(len(dates) * 0.6) + 1
    test_first_id = int(len(dates) * 0.8) + 1

    train = df[df[DATE_COLUMN].isin(dates[:val_first_id])]
    val = df[df[DATE_COLUMN].isin(dates[val_first_id:test_first_id])]
    test = df[df[DATE_COLUMN].isin(dates[test_first_id:])]

    dates = {}
    dates["start_date"] = pd.to_datetime(df[DATE_COLUMN].iat[0])
    dates["end_date"] = pd.to_datetime(df[DATE_COLUMN].iat[-1])
    for name, data in zip(["train", "val", "test"], [train, val, test]):
        dates[f"{name}_start_date"] = pd.to_datetime(data[DATE_COLUMN].iat[0])
        dates[f"{name}_end_date"] = pd.to_datetime(data[DATE_COLUMN].iat[-1])

        logger.info(
            f"{len(data['DATE'])} {name} data from {dates[f'{name}_start_date'].strftime('%d/%m/%Y')} to {dates[f'{name}_end_date'].strftime('%d/%m/%Y')}"
        )

    datasets = {}

    with get_db() as db:
        all_targets = Target.get_all(db=db)
        matched_targets = [
            target for target in all_targets if target.name in train.columns
        ]
        dataset_name = get_dataset_name(train, corr_threshold, percentile, max_features)
        dataset_dir = f"{tmp_dir}/{dataset_name}"
        preprocessing_dir = f"{dataset_dir}/preprocessing"
        train_data_dir = f"{dataset_dir}/data"
        os.makedirs(dataset_dir, exist_ok=True)
        os.makedirs(preprocessing_dir, exist_ok=True)
        os.makedirs(train_data_dir, exist_ok=True)

        dataset = datasets[name] = Dataset.upsert(
            match_fields=["name"],
            db=db,
            name=dataset_name,
            path=Path(dataset_dir).resolve(),
            type="training",
            size=df.shape[0],
            train_size=train.shape[0],
            val_size=val.shape[0],
            test_size=test.shape[0],
            number_of_groups=data[GROUPING_COLUMN].nunique(),
            list_of_groups=data[GROUPING_COLUMN].unique().tolist(),
            corr_threshold=corr_threshold,
            percentile=percentile,
            max_features=max_features,
            **dates,
            targets=matched_targets,
        )

    # encode categoricals
    train = encode_categorical_features(train, fit=True, save_dir=preprocessing_dir)
    val = encode_categorical_features(val, save_dir=preprocessing_dir)
    test = encode_categorical_features(test, save_dir=preprocessing_dir)

    # save the full data
    if PYTHON_ENV != "Test":
        joblib.dump(df, f"{train_data_dir}/full.pkl")

    return train, val, test, dataset

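A small sketch of the 60/20/20 chronological split used above: the cutoffs are computed over unique dates rather than rows, so every stock shares the same train/val/test boundaries. The date count below is illustrative.

```python
# Illustrative cutoff arithmetic for ten distinct dates
n_dates = 10
val_first_id = int(n_dates * 0.6) + 1   # -> 7
test_first_id = int(n_dates * 0.8) + 1  # -> 9
# train uses dates[:7], val uses dates[7:9], test uses dates[9:]
```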
def encode_categorical_features(df: pd.DataFrame, save_dir: str, fit: bool = False):

    X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
    y = df.loc[:, df.columns.str.contains("^TARGET_")]

    # 1. Timestamps for 'DATE'
    X.loc[:, DATE_COLUMN] = pd.to_datetime(X[DATE_COLUMN]).map(pd.Timestamp.toordinal)

    if fit:
        # Define columns for ordinal and binary encoding (we should have all possible values in the training set, unless we accept unknown-value processing)
        ordinal_encoding_features = ["STOCK"]

        binary_encoding_features = ["SECTOR", "SUBINDUSTRY", "LOCATION"]

        # Fit and save the ColumnTransformer with OrdinalEncoder and BinaryEncoder
        column_transformer = ColumnTransformer(
            transformers=[
                (
                    "ordinal",
                    OrdinalEncoder(
                        handle_unknown="use_encoded_value",
                        unknown_value=-1,  # rows with unseen STOCK values will be encoded as -1
                    ),
                    ordinal_encoding_features,
                ),
                (
                    "binary_encoder",
                    ce.BinaryEncoder(
                        handle_unknown="value",
                    ),  # rows with unseen values will be encoded as all-zeros in the binary columns
                    binary_encoding_features,
                ),
            ],
            remainder="passthrough",  # Keep the non-encoded columns like 'DATE'
        )
        transformed_data = column_transformer.fit_transform(X)
        if PYTHON_ENV != "Test":
            joblib.dump(column_transformer, f"{save_dir}/column_transformer.pkl")
    else:
        # Load the ColumnTransformer and apply it
        column_transformer = joblib.load(f"{save_dir}/column_transformer.pkl")

        transformed_data = column_transformer.transform(X)

    # Convert to DataFrame for readability and return
    transformed_X = pd.DataFrame(
        transformed_data,
        columns=[
            feature.split("__")[1]
            for feature in column_transformer.get_feature_names_out()
        ],
        index=X.index,
    )
    transformed_X = transformed_X.apply(pd.to_numeric)
    for col in [
        feature.split("__")[1]
        for feature in column_transformer.get_feature_names_out()
        if "remainder" not in feature
    ] + [DATE_COLUMN]:
        transformed_X[col] = transformed_X[col].astype(int)

    # Insert features in db
    if fit:
        # TODO: in bulk
        for feature in transformed_X.columns:
            dtype = transformed_X[feature].dtype
            if pd.api.types.is_integer_dtype(dtype):
                feature_type = "categorical"
            elif pd.api.types.is_float_dtype(dtype):
                feature_type = "numerical"
            else:
                feature_type = "other"
            Feature.upsert(match_fields=["name"], name=feature, type=feature_type)
        for target in y.columns:
            type = (
                "classification"
                if int(target.split("_")[1]) in TARGETS_CLF
                else "regression"
            )
            # TODO: what about the description here?
            Target.upsert(match_fields=["name", "type"], name=target, type=type)

    return pd.concat([transformed_X, y], axis=1)

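A sketch of the fit/reuse contract, mirroring the calls in `create_sets_from_data` above (`train`, `val` and `preprocessing_dir` come from that function); the comment on the resulting column layout assumes category_encoders' usual `COL_0, COL_1, ...` naming for binary-encoded columns.

```python
# Fit on train (persists column_transformer.pkl under save_dir), reuse on val
train_enc = encode_categorical_features(train, fit=True, save_dir=preprocessing_dir)
val_enc = encode_categorical_features(val, save_dir=preprocessing_dir)
# Resulting columns: STOCK as an integer code (unseen values -> -1),
# SECTOR/SUBINDUSTRY/LOCATION as binary-encoded SECTOR_0, SECTOR_1, ... columns,
# DATE as a proleptic ordinal int, everything else passed through, TARGET_* untouched.
```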
# only works with all features from feat eng in the right order (unused for now)
def decode_categorical_features(df: pd.DataFrame, save_dir: str):
    X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
    y = df.loc[:, df.columns.str.contains("^TARGET_")]
    index = X.index
    original_dtypes = X.dtypes.to_dict()

    column_transformer = joblib.load(f"{save_dir}/column_transformer.pkl")

    X = X.to_numpy()
    arrays = []
    for name, indices in column_transformer.output_indices_.items():
        transformer = column_transformer.named_transformers_.get(name, None)
        arr = X[:, indices.start : indices.stop]

        if transformer in (None, "passthrough", "drop"):
            pass

        else:
            arr = transformer.inverse_transform(arr)

        arrays.append(arr)

    retarr = np.concatenate(arrays, axis=1)

    columns_ordinal = [
        feature.split("__")[1]
        for feature in column_transformer.get_feature_names_out()
        if feature.split("__")[0] == "ordinal"
    ]
    columns_binary_encoder = [
        feature.split("__")[1]
        for feature in column_transformer.get_feature_names_out()
        if feature.split("__")[0] == "binary_encoder"
    ]
    # Remove trailing "_number" using regex
    columns_binary_encoder = {
        re.sub(r"_\d+$", "", col) for col in columns_binary_encoder
    }
    columns_binary_encoder = list(columns_binary_encoder)

    columns_remainder = [
        feature.split("__")[1]
        for feature in column_transformer.get_feature_names_out()
        if feature.split("__")[0] == "remainder"
    ]
    columns = columns_ordinal + columns_binary_encoder + columns_remainder
    decoded_X = pd.DataFrame(
        retarr,
        columns=columns,
        index=index,
    )

    for col in decoded_X.columns:
        if col in columns_ordinal or col in columns_binary_encoder:
            decoded_X[col] = decoded_X[col].astype(str)
        elif col in original_dtypes:
            decoded_X[col] = decoded_X[col].astype(original_dtypes[col])

    # revert timestamps to dates
    decoded_X.loc[:, DATE_COLUMN] = decoded_X[DATE_COLUMN].map(pd.Timestamp.fromordinal)

    return pd.concat([decoded_X, y], axis=1)


# Filter methods
# ----------------

# Linear correlation (Pearson's R for regression and ANOVA for classification)
def select_feature_by_linear_correlation(
    X, y, target_type, percentile: int = 20, save_dir: Optional[str] = None
):
    start = time.time()
    test_type = "Pearson’s R" if target_type == "regression" else "ANOVA"
    logger.debug(f"Running {test_type}...")

    model = f_regression if target_type == "regression" else f_classif
    feat_selector = SelectPercentile(model, percentile=percentile).fit(X, y)
    feat_scores = pd.DataFrame()
    feat_scores["score"] = feat_selector.scores_
    feat_scores["pvalue"] = feat_selector.pvalues_
    feat_scores["support"] = feat_selector.get_support()
    feat_scores["features"] = X.columns
    feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
    feat_scores["method"] = test_type
    feat_scores.sort_values("rank", ascending=True, inplace=True)
    stop = time.time()
    training_time = timedelta(seconds=(stop - start)).total_seconds()
    feat_scores["training_time"] = training_time

    logger.debug(
        f"{test_type} evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
    )

    feat_scores.to_csv(
        f"{save_dir}/{test_type}.csv",
        index=True,
        header=True,
        index_label="ID",
    )

    return feat_scores

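A self-contained sketch of the `SelectPercentile` pattern shared by the filter methods in this module (synthetic data, independent of the package):

```python
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_regression

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 10)), columns=[f"f{i}" for i in range(10)])
y = 2.0 * X["f0"] + rng.normal(scale=0.1, size=200)

# percentile=20 keeps the top 20% of columns by F-score, i.e. 2 of 10 here
sel = SelectPercentile(f_regression, percentile=20).fit(X, y)
print(X.columns[sel.get_support()].tolist())  # "f0" is expected to be among them
```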
# Non-linear correlation (Spearman's R for regression and Kendall’s Tau for classification)
def select_feature_by_nonlinear_correlation(
    X, y, target_type, percentile: int = 20, save_dir: Optional[str] = None
):
    start = time.time()

    def model(X_model, y_model):
        X_model = pd.DataFrame(X_model)
        y_model = pd.Series(y_model)

        method = "spearman" if target_type == "regression" else "kendall"

        corr_scores = []
        p_values = []

        for col in X_model.columns:
            if method == "spearman":
                corr, pval = spearmanr(X_model[col], y_model)
            else:  # Kendall's Tau for classification
                corr, pval = kendalltau(X_model[col], y_model)

            corr_scores.append(abs(corr))  # Keeping absolute correlation
            p_values.append(pval)

        return np.array(corr_scores), np.array(p_values)

    test_type = "Spearman’s R" if target_type == "regression" else "Kendall’s Tau"
    logger.debug(f"Running {test_type}...")

    feat_selector = SelectPercentile(model, percentile=percentile).fit(X, y)
    feat_scores = pd.DataFrame()
    feat_scores["score"] = feat_selector.scores_
    feat_scores["pvalue"] = feat_selector.pvalues_
    feat_scores["support"] = feat_selector.get_support()
    feat_scores["features"] = X.columns
    feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
    feat_scores["method"] = test_type
    feat_scores.sort_values("rank", ascending=True, inplace=True)
    stop = time.time()
    training_time = timedelta(seconds=(stop - start)).total_seconds()
    feat_scores["training_time"] = training_time

    logger.debug(
        f"{test_type} evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
    )

    feat_scores.to_csv(
        f"{save_dir}/{test_type}.csv",
        index=True,
        header=True,
        index_label="ID",
    )

    return feat_scores

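Worth noting: `SelectPercentile` accepts any callable `score_func` returning `(scores, pvalues)`, which is how the Spearman/Kendall scorer above is plugged in. A minimal standalone version of that contract:

```python
import numpy as np
from scipy.stats import spearmanr
from sklearn.feature_selection import SelectPercentile

def spearman_score(X, y):
    # score_func contract: return (scores, pvalues), one entry per column
    X = np.asarray(X)
    results = [spearmanr(X[:, j], y) for j in range(X.shape[1])]
    scores = np.array([abs(corr) for corr, _ in results])
    pvalues = np.array([p for _, p in results])
    return scores, pvalues

# selector = SelectPercentile(spearman_score, percentile=20).fit(X, y)
```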
# Mutual Information
def select_feature_by_mi(
    X, y, target_type, percentile: int = 20, save_dir: Optional[str] = None
):
    start = time.time()
    logger.debug("Running Mutual Information...")
    model = (
        mutual_info_regression if target_type == "regression" else mutual_info_classif
    )
    feat_selector = SelectPercentile(model, percentile=percentile).fit(X, y)
    feat_scores = pd.DataFrame()
    feat_scores["score"] = feat_selector.scores_
    feat_scores["support"] = feat_selector.get_support()
    feat_scores["features"] = X.columns
    feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
    feat_scores["method"] = "Mutual Information"
    feat_scores.sort_values("rank", ascending=True, inplace=True)
    stop = time.time()
    training_time = timedelta(seconds=(stop - start)).total_seconds()
    feat_scores["training_time"] = training_time

    logger.debug(
        f"MI evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
    )

    feat_scores.to_csv(f"{save_dir}/MI.csv", index=True, header=True, index_label="ID")

    return feat_scores

def select_categorical_features(X, y, percentile, save_dir: Optional[str] = None):
    start = time.time()
    logger.debug("Running Chi2 for categorical features...")
    feat_selector = SelectPercentile(chi2, percentile=percentile).fit(X, y)
    feat_scores = pd.DataFrame()
    feat_scores["score"] = feat_selector.scores_
    feat_scores["pvalue"] = feat_selector.pvalues_
    feat_scores["support"] = feat_selector.get_support()
    feat_scores["features"] = X.columns
    feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
    feat_scores["method"] = "Chi2"
    feat_scores.sort_values("rank", ascending=True, inplace=True)
    stop = time.time()
    training_time = timedelta(seconds=(stop - start)).total_seconds()
    feat_scores["training_time"] = training_time

    logger.debug(
        f"Chi2 evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
    )

    feat_scores.to_csv(
        f"{save_dir}/Chi2.csv", index=True, header=True, index_label="ID"
    )

    return feat_scores


# Intrinsic/embedded method
# ----------------

# feature importance
def select_feature_by_feat_imp(
    X, y, target_type, percentile: int = 20, save_dir: Optional[str] = None
):
    start = time.time()
    logger.debug("Running Feature importance...")

    params = {"n_estimators": 500, "max_depth": 2**3, "random_state": 42, "n_jobs": -1}

    estimator = (
        RandomForestClassifier(**params)
        if target_type == "classification"
        else RandomForestRegressor(**params)
    )

    feat_selector = SelectFromModel(
        estimator=estimator,
        threshold=-np.inf,
        max_features=int(percentile * X.shape[1] / 100),
    ).fit(X, y)

    feat_scores = pd.DataFrame()
    feat_scores["score"] = feat_selector.estimator_.feature_importances_
    feat_scores["support"] = feat_selector.get_support()
    feat_scores["features"] = X.columns
    feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
    feat_scores["method"] = "FI"
    feat_scores.sort_values("rank", ascending=True, inplace=True)

    stop = time.time()
    training_time = timedelta(seconds=(stop - start)).total_seconds()
    feat_scores["training_time"] = training_time

    logger.debug(
        f"Feat importance evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
    )

    feat_scores.to_csv(f"{save_dir}/FI.csv", index=True, header=True, index_label="ID")

    return feat_scores

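The `threshold=-np.inf` plus `max_features` combination used above is the usual "keep the top-k features by importance" idiom in scikit-learn; a standalone sketch:

```python
import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

X, y = make_regression(n_samples=200, n_features=10, n_informative=3, random_state=0)
sel = SelectFromModel(
    RandomForestRegressor(n_estimators=50, random_state=0),
    threshold=-np.inf,  # disable the importance threshold...
    max_features=2,     # ...so exactly the 2 most important features are kept
).fit(X, y)
print(sel.get_support().sum())  # -> 2
```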
# Wrapper method
# ----------------


# recursive feature elimination
def select_feature_by_rfe(
    X, y, target_type, percentile: int = 20, save_dir: Optional[str] = None
):
    start = time.time()
    logger.debug("Running Recursive Feature Elimination...")

    params = {
        "max_depth": 2**3,
        "random_state": 42,
    }
    estimator = (
        DecisionTreeClassifier(**params)
        if target_type == "classification"
        else DecisionTreeRegressor(**params)
    )
    rfe = RFE(estimator, n_features_to_select=percentile / 100, step=4, verbose=0)
    feat_selector = rfe.fit(X, y)

    feat_scores = pd.DataFrame(
        {
            "score": 0.0,  # Default feature importance
            "support": feat_selector.get_support(),
            "features": X.columns,
            "rank": 0,
            "method": "RFE",
        }
    )
    feat_scores.loc[
        feat_scores["features"].isin(feat_selector.get_feature_names_out()), "score"
    ] = list(feat_selector.estimator_.feature_importances_)
    feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
    feat_scores.sort_values("rank", ascending=True, inplace=True)

    stop = time.time()
    training_time = timedelta(seconds=(stop - start)).total_seconds()
    feat_scores["training_time"] = training_time

    logger.debug(
        f"RFE evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
    )

    feat_scores.to_csv(f"{save_dir}/RFE.csv", index=True, header=True, index_label="ID")

    return feat_scores

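A note on `n_features_to_select=percentile / 100` above: in recent scikit-learn releases a float in (0, 1) is read as a fraction of the input columns, so this keeps roughly `percentile`% of the features. A small sketch of that behaviour:

```python
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor

X, y = make_regression(n_samples=150, n_features=20, random_state=0)
rfe = RFE(
    DecisionTreeRegressor(max_depth=3, random_state=0),
    n_features_to_select=0.2,  # float -> fraction: 4 of the 20 columns
    step=4,
).fit(X, y)
print(rfe.get_support().sum())  # -> 4
```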
# SequentialFeatureSelector (loss based, possibility to do forward or backward selection or removal)
def select_feature_by_sfs(
    X, y, target_type, percentile: int = 20, save_dir: Optional[str] = None
):
    start = time.time()
    logger.debug("Running Sequential Feature Selection...")
    warnings.filterwarnings("ignore", category=FutureWarning)

    params = {
        "max_depth": 2**3,
        "random_state": 42,
    }
    estimator = (
        DecisionTreeClassifier(**params)
        if target_type == "classification"
        else DecisionTreeRegressor(**params)
    )

    n_splits = 3
    n_samples = len(X)
    test_size = int(n_samples / (n_splits + 4))
    tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)

    score_function = (
        make_scorer(
            log_loss, response_method="predict_proba"
        )  # log loss needs probabilities
        if target_type == "classification"
        else make_scorer(root_mean_squared_error)
    )  # we avoid greater_is_better=False because it makes the score negative and messes up the ranking

    sfs = SequentialFeatureSelector(
        estimator,
        k_features=int(percentile * X.shape[1] / 100),
        forward=True,
        floating=True,  # Enables dynamic feature elimination
        scoring=score_function,
        cv=tscv,
        n_jobs=-1,
        verbose=0,
    )

    feat_selector = sfs.fit(X, y)

    # Extract selected features and their scores
    selected_features = set(feat_selector.k_feature_names_)
    feat_subsets = feat_selector.subsets_

    # Create DataFrame for feature scores
    feat_scores = pd.DataFrame(
        {
            "features": X.columns,
            "support": X.columns.isin(
                selected_features
            ),  # TODO: understand why the support is not correct (the best-scoring features are not always the ones chosen)
            "score": 1000,
            "rank": None,
            "method": "SFS",
        }
    )

    # Sort subsets by score (lower is better)
    sorted_subsets = sorted(feat_subsets.items(), key=lambda item: item[1]["avg_score"])

    # Record score per feature (first appearance)
    feature_score_map = {}
    for step in sorted_subsets:
        step = step[1]
        for feature in step["feature_names"]:
            if feature not in feature_score_map:
                feature_score_map[feature] = step["avg_score"]

    # Assign scores
    for feature, score in feature_score_map.items():
        feat_scores.loc[feat_scores["features"] == feature, "score"] = score

    # rank by score (lower = better)
    feat_scores["rank"] = (
        feat_scores["score"].rank(method="first", ascending=True).astype(int)
    )

    feat_scores.sort_values("rank", ascending=True, inplace=True)

    stop = time.time()
    training_time = timedelta(seconds=(stop - start)).total_seconds()
    feat_scores["training_time"] = training_time

    logger.debug(
        f"SFS evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
    )

    feat_scores.to_csv(f"{save_dir}/SFS.csv", index=True, header=True, index_label="ID")

    return feat_scores


# Remove correlation
# ------------------

def remove_correlated_features(
    X: pd.DataFrame, features: list, corr_threshold: int, vizualize: bool = False
):
    # Create correlation matrix, select upper triangle & remove features with correlation greater than threshold
    corr_matrix = X[features].corr().abs()

    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    features_uncorrelated = [
        column
        for column in upper.columns
        if all(upper[column].dropna() <= corr_threshold / 100)
    ]
    features_correlated = [
        column for column in upper.columns if any(upper[column] > corr_threshold / 100)
    ]

    if vizualize:
        features_selected_visualization = (
            X[features]
            .corr()
            .where(np.triu(np.ones(len(features)), k=1).astype(bool))
            .fillna(0)
        )
        # Plot the heatmap
        plt.figure(figsize=(10, 8))
        sns.heatmap(
            corr_matrix,
            annot=True,
            cmap="coolwarm",
            center=0,
            linewidths=1,
            linecolor="black",
        )
        plt.title(f"Correlation Matrix")
        plt.show()

        logger.info(f"\n{features_selected_visualization.describe().to_string()}")
        logger.info(f"\n{features_selected_visualization.to_string()}")
    return features_uncorrelated, features_correlated

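A tiny worked example of the upper-triangle filter above (synthetic columns; "B" is a near-copy of "A", so only "A" and "C" survive a 90% threshold):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
a = rng.normal(size=300)
X_demo = pd.DataFrame(
    {"A": a, "B": a + rng.normal(scale=0.01, size=300), "C": rng.normal(size=300)}
)
keep, drop = remove_correlated_features(X_demo, ["A", "B", "C"], corr_threshold=90)
# keep == ["A", "C"], drop == ["B"]
```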
# Main feature selection function
def feature_selection(
    dataset_id: int,
    train: pd.DataFrame,
    target_number: int,
    single_process: bool = False,
):
    """Function to do feature selection with a range of different feature selection techniques

    Args:
        - train (pd.DataFrame): a pandas train set
        - target_number (int): a target; targets need to be named ``TARGET_{n}``
        - single_process (bool): if True, run all feature selection methods in a single process. If False, run them in parallel.
    """

    # Create the feature selection in db
    target = Target.find_by(name=f"TARGET_{target_number}")
    dataset = Dataset.get(dataset_id)
    percentile = dataset.percentile
    corr_threshold = dataset.corr_threshold
    max_features = dataset.max_features

    feature_selection = FeatureSelection.upsert(
        match_fields=["target_id", "dataset_id"],
        target_id=target.id,
        dataset_id=dataset.id,
    )

    X = train.loc[:, ~train.columns.str.contains("^TARGET_")]
    y = train[f"TARGET_{target_number}"]

    logger.info(f"Starting feature selection for TARGET_{target_number}...")

    target_type = "classification" if target_number in TARGETS_CLF else "regression"

    fs_dir_target = f"{dataset.path}/{y.name}/feature_selection"
    preprocessing_dir = f"{dataset.path}/preprocessing"
    os.makedirs(fs_dir_target, exist_ok=True)
    clean_directory(fs_dir_target)

    # Let's start by removing extremely correlated features
    # This is needed to reduce the number of features, but also for methods such as ANOVA or chi2 that require independent features
    # TODO: we could also remove low-variance features
    features_uncorrelated, features_correlated = remove_correlated_features(
        X, X.columns, 90, vizualize=False
    )
    X = X[features_uncorrelated]

    logger.debug(
        f"""
    \nWe first removed {len(features_correlated)} features with correlation greater than 90%
    \nWe are looking to capture {percentile}% of {len(X.columns)} features, i.e. {int(len(X.columns)*percentile/100)} features, with different feature selection methods
    \nWe will then remove features correlated above {corr_threshold}%, keeping the ones with the best ranks
    \nFinally, we will keep only the {max_features} best-ranked features
    """
    )

    start = time.time()

    # handling categorical features (only if classification)
    categorical_features = X.select_dtypes(include=["int64", "Int64"]).columns.tolist()
    X_categorical = X[categorical_features]

    if target_type == "classification":
        feat_scores = select_categorical_features(
            X_categorical, y, percentile, save_dir=fs_dir_target
        )
        with get_db() as db:
            for row in feat_scores.itertuples(index=False):
                feature = Feature.find_by(name=row.features, db=db)
                FeatureSelectionRank.upsert(
                    ["feature_selection_id", "feature_id", "method"],
                    db=db,
                    score=row.score,
                    pvalue=row.pvalue,
                    support=row.support,
                    rank=row.rank,
                    method=row.method,
                    training_time=row.training_time,
                    feature_selection_id=feature_selection.id,
                    feature_id=feature.id,
                )
        categorical_features_selected = feat_scores[feat_scores["support"] == True][
            "features"
        ].values.tolist()

    # removing categorical features from X
    numerical_features = list(set(X.columns).difference(set(categorical_features)))
    X_numerical = X[numerical_features]

    results = []
    if single_process:
        results = [
            select_feature_by_linear_correlation(
                X_numerical, y, target_type, percentile, save_dir=fs_dir_target
            ),
            select_feature_by_nonlinear_correlation(
                X_numerical, y, target_type, percentile, save_dir=fs_dir_target
            ),
            select_feature_by_mi(
                X_numerical, y, target_type, percentile, save_dir=fs_dir_target
            ),
            select_feature_by_feat_imp(
                X_numerical, y, target_type, percentile, save_dir=fs_dir_target
            ),
            select_feature_by_rfe(
                X_numerical, y, target_type, percentile, save_dir=fs_dir_target
            ),
            # select_feature_by_sfs(
            #     X_numerical, y, target_type, percentile, save_dir=fs_dir_target
            # ),  # TODO: this is taking too long
        ]
    else:
        # Use ProcessPoolExecutor to run tasks in parallel
        with ProcessPoolExecutor() as executor:
            # Submit different functions to be executed in parallel
            futures = [
                executor.submit(
                    select_feature_by_linear_correlation,
                    X_numerical,
                    y,
                    target_type,
                    percentile,
                    save_dir=fs_dir_target,
                ),
                executor.submit(
                    select_feature_by_nonlinear_correlation,
                    X_numerical,
                    y,
                    target_type,
                    percentile,
                    save_dir=fs_dir_target,
                ),
                executor.submit(
                    select_feature_by_mi,
                    X_numerical,
                    y,
                    target_type,
                    percentile,
                    save_dir=fs_dir_target,
                ),
                executor.submit(
                    select_feature_by_feat_imp,
                    X_numerical,
                    y,
                    target_type,
                    percentile,
                    save_dir=fs_dir_target,
                ),
                executor.submit(
                    select_feature_by_rfe,
                    X_numerical,
                    y,
                    target_type,
                    percentile,
                    save_dir=fs_dir_target,
                ),
                executor.submit(
                    select_feature_by_sfs,
                    X_numerical,
                    y,
                    target_type,
                    percentile,
                    save_dir=fs_dir_target,
                ),
            ]

            # Wait for all futures to complete and gather the results
            with tqdm(total=len(futures)) as pbar:
                for future in as_completed(futures):
                    results.append(future.result())
                    pbar.update(1)
    logger.info(f"Finished feature selection for target {target_number}")

    stop = time.time()

    # Once all tasks are completed, start by inserting results to db
    feat_scores = pd.concat(
        results,
        axis=0,
    )

    logger.info("Inserting feature selection results to db...")
    rows = []

    with get_db() as db:
        feature_map = {f.name: f.id for f in Feature.get_all(db=db, limit=20000)}
        for row in feat_scores.itertuples(index=False):
            feature_id = feature_map.get(row.features)
            if not feature_id:
                continue  # or raise if feature must exist

            rows.append(
                {
                    "feature_selection_id": feature_selection.id,
                    "feature_id": feature_id,
                    "method": row.method,
                    "score": row.score,
                    "pvalue": None if pd.isna(row.pvalue) else row.pvalue,
                    "support": row.support,
                    "rank": row.rank,
                    "training_time": row.training_time,
                }
            )

        if len(rows) == 0:
            raise ValueError(f"No features selected for TARGET_{target_number}")

        FeatureSelectionRank.bulk_upsert(rows=rows, db=db)

    # Merge the results
    features_selected = feat_scores[feat_scores["support"] == True][
        ["features", "rank"]
    ]
    features_selected.sort_values("rank", inplace=True)
    features_selected.drop_duplicates("features", inplace=True)

    features_selected_list = features_selected["features"].values.tolist()

    logger.info("Merging feature selection methods...")
    # features_selected = list(dict.fromkeys(features_selected_by_mi + features_selected_by_nonlinear_correlation + features_selected_by_linear_correlation))
    features_selected_by_every_methods = set(results[0]["features"].values.tolist())

    for df in results[1:]:
        features_selected_by_every_methods &= set(
            df["features"].values.tolist()
        )  # intersection

    features_selected_by_every_methods = list(features_selected_by_every_methods)

    logger.debug(
        f"We selected {len(features_selected_list)} features and {len(features_selected_by_every_methods)} were selected unanimously:"
    )
    logger.debug(features_selected_by_every_methods)

    pd.Series(features_selected_list).to_csv(
        f"{fs_dir_target}/features_before_corr.csv",
        index=True,
        header=True,
        index_label="ID",
    )
    features, features_correlated = remove_correlated_features(
        X, features_selected_list, corr_threshold
    )
    pd.Series(features).to_csv(
        f"{fs_dir_target}/features_before_max.csv",
        index=True,
        header=True,
        index_label="ID",
    )
    features = features[:max_features]

    features += categorical_features_selected if target_type == "classification" else []
    logger.debug(
        f"Final pre-selection: {len(features)} features below {corr_threshold}% out of {len(features_selected_list)} features, and rejected {len(features_correlated)} features, {100*len(features)/len(features_selected_list):.2f}% features selected"
    )

    features_selected_by_every_methods_uncorrelated = list(
        set(features) & set(features_selected_by_every_methods)
    )
    logger.debug(
        f"In this pre-selection, there are {len(features_selected_by_every_methods_uncorrelated)} features from the {len(features_selected_by_every_methods)} selected unanimously\n"
    )

    logger.debug(
        features_selected[features_selected["features"].isin(features)].to_markdown()
    )

    best_features_path = Path(
        f"{preprocessing_dir}/features_{target_number}.pkl"
    ).resolve()
    if PYTHON_ENV != "Test":
        joblib.dump(features, best_features_path)

    db_features = Feature.filter(name__in=features)
    # Order matters: to keep the same order in db as in `features`, map features by name
    feature_by_name = {f.name: f for f in db_features}
    # Reorder them according to the original `features` list
    ordered_db_features = [
        feature_by_name[name] for name in features if name in feature_by_name
    ]

    feature_selection = FeatureSelection.get(feature_selection.id)
    feature_selection = feature_selection.add_features(ordered_db_features)
    feature_selection.training_time = stop - start
    feature_selection.best_features_path = best_features_path
    feature_selection.save()

    return features

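Expected calling convention, as a sketch (the input frame `df` is hypothetical; the Dataset/Target rows are assumed to have been created beforehand by `create_sets_from_data` and `encode_categorical_features` above):

```python
train, val, test, dataset = create_sets_from_data(df)

selected = feature_selection(
    dataset_id=dataset.id,
    train=train,
    target_number=1,       # 1 is not in TARGETS_CLF, so this runs the regression path
    single_process=True,   # skip ProcessPoolExecutor; easier to debug
)
```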
# TODO: can we use this to select the ideal number of features?
def feature_selection_analysis(feature_selection_id: int, n_components: int = 5):

    feature_selection = FeatureSelection.get(feature_selection_id)
    dataset_dir = feature_selection.dataset.path
    features = [f.name for f in feature_selection.features]
    target = feature_selection.target.name
    target_number = target.split("_")[1]

    train, val, train_scaled, val_scaled, _scaler_y = load_train_data(
        dataset_dir, target_number, target_type=feature_selection.target.type
    )
    train = train[features + [target]]
    train_scaled = train_scaled[features + [target]]

    logger.info("Plot features correlation with target variable...")

    correlations = train.corr()[target].sort_values(ascending=False)

    plt.figure(figsize=(12, 6))
    sns.barplot(x=correlations.index, y=correlations.values, palette="coolwarm")
    plt.xticks(rotation=90)
    plt.title("Feature correlation with target variable")
    plt.ylabel("Correlation")
    plt.xlabel("Features")
    plt.grid(axis="y", linestyle="--", alpha=0.7)
    plt.show()

    plt.figure(figsize=(14, 10))
    sns.heatmap(train.corr(), annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5)
    plt.title("Correlation Matrix")
    plt.show()

    logger.info("Plot explained variance by components...")
    n_components = min(len(features), n_components)
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(train_scaled)

    explained_variance = pca.explained_variance_ratio_

    plt.figure(figsize=(10, 7))
    plt.bar(
        range(1, len(explained_variance) + 1),
        explained_variance,
        label="Explained Variance",
    )
    plt.plot(
        range(1, len(explained_variance) + 1),
        np.cumsum(explained_variance),
        label="Cumulative Explained Variance",
        color="orange",
        marker="o",
    )
    plt.title("Explained Variance by Components")
    plt.xlabel("Number of Components")
    plt.ylabel("Explained Variance")
    plt.legend()
    plt.grid(axis="y", linestyle="--", alpha=0.7)
    plt.show()

    logger.info("Main PCA vs target variable...")
    plt.scatter(
        X_pca[:, 0],
        X_pca[:, 1],
        c=train[target],
        cmap="coolwarm",
        alpha=0.7,
    )
    plt.title("PCA of target variable")
    plt.xlabel("First Principal Component")
    plt.ylabel("Second Principal Component")
    plt.colorbar()
    plt.show()

# scaling
def scale_data(
    df: pd.DataFrame, save_dir: str, scaler_x=None, scalers_y: Optional[list] = None
):
    logger.info("Scale data...")
    X = df.loc[:, ~df.columns.str.contains("^TARGET_")]

    if scaler_x:
        X_scaled = pd.DataFrame(
            scaler_x.transform(X), columns=list(X.columns), index=X.index
        )
    else:
        scaler_x = StandardScaler()  # MinMaxScaler(feature_range=(-1,1))
        X_scaled = pd.DataFrame(
            scaler_x.fit_transform(X), columns=list(X.columns), index=X.index
        )
        if PYTHON_ENV != "Test":
            joblib.dump(scaler_x, f"{save_dir}/scaler_x.pkl")

    # Determine which targets need to be scaled
    targets_numbers_to_scale = [i for i in TARGETS_NUMBER if i not in TARGETS_CLF]

    # Dictionary to store scaled target data
    scaled_targets = {}

    if scalers_y:
        for target_number in targets_numbers_to_scale:
            y = df[[f"TARGET_{target_number}"]]
            scaled_targets[target_number] = pd.DataFrame(
                scalers_y[f"scaler_y_{target_number}"].transform(y.values),
                columns=y.columns,
                index=y.index,
            )
    else:
        scalers_y = {}
        for target_number in targets_numbers_to_scale:
            scaler_y = StandardScaler()
            y = df[[f"TARGET_{target_number}"]]

            scaled_y = pd.DataFrame(
                scaler_y.fit_transform(y.values),
                columns=y.columns,
                index=y.index,
            )
            if PYTHON_ENV != "Test":
                joblib.dump(scaler_y, f"{save_dir}/scaler_y_{target_number}.pkl")

            scalers_y[f"scaler_y_{target_number}"] = scaler_y
            scaled_targets[target_number] = scaled_y

    # Reconstruct y_scaled in the original order
    y_scaled = pd.concat(
        [scaled_targets[target_number] for target_number in targets_numbers_to_scale],
        axis=1,
    )
    y_not_scaled = df[df.columns.intersection([f"TARGET_{i}" for i in TARGETS_CLF])]

    # Ensure the final DataFrame keeps the original order
    df_scaled = pd.concat(
        [X_scaled, y_scaled, y_not_scaled],
        axis=1,
    )[
        df.columns
    ]  # Reorder columns to match original `df`

    if not df_scaled.columns.equals(df.columns):
        raise Exception("Columns are not in the same order after scaling.")

    return df_scaled, scaler_x, scalers_y

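The expected fit/reuse pattern for `scale_data`, sketched with names used elsewhere in this module (`train`, `val`, `test` and `preprocessing_dir` come from `create_sets_from_data`):

```python
# Fit scalers on the training split only, then pass them back in for the other splits
train_scaled, scaler_x, scalers_y = scale_data(train, save_dir=preprocessing_dir)
val_scaled, _, _ = scale_data(val, save_dir=preprocessing_dir, scaler_x=scaler_x, scalers_y=scalers_y)
test_scaled, _, _ = scale_data(test, save_dir=preprocessing_dir, scaler_x=scaler_x, scalers_y=scalers_y)
```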
# Reshape into 3D tensors for recurrent models
def reshape_time_series(
    train: pd.DataFrame,
    val: pd.DataFrame,
    test: pd.DataFrame,
    features: list,
    timesteps: int = 120,
):
    # always scale for recurrent layers: train should be scaled

    target_columns = train.columns.intersection([f"TARGET_{i}" for i in TARGETS_NUMBER])

    data = pd.concat([train, val, test], axis=0)

    data_reshaped = reshape_df(data[features], data[GROUPING_COLUMN], timesteps)

    data_reshaped[target_columns] = data[target_columns]

    logger.info("Separating train, val, test data and creating np arrays...")
    train_reshaped = data_reshaped.loc[train.index]
    val_reshaped = data_reshaped.loc[val.index]
    test_reshaped = data_reshaped.loc[test.index]

    x_train_reshaped = np.array(train_reshaped["RECURRENT_FEATURES"].values.tolist())
    y_train_reshaped = np.array(train_reshaped[target_columns].reset_index())
    x_val_reshaped = np.array(val_reshaped["RECURRENT_FEATURES"].values.tolist())
    y_val_reshaped = np.array(val_reshaped[target_columns].reset_index())
    x_test_reshaped = np.array(test_reshaped["RECURRENT_FEATURES"].values.tolist())
    y_test_reshaped = np.array(test_reshaped[target_columns].reset_index())

    reshaped_data = {
        "x_train_reshaped": x_train_reshaped,
        "y_train_reshaped": y_train_reshaped,
        "x_val_reshaped": x_val_reshaped,
        "y_val_reshaped": y_val_reshaped,
        "x_test_reshaped": x_test_reshaped,
        "y_test_reshaped": y_test_reshaped,
    }

    return reshaped_data

def reshape_df(df: pd.DataFrame, stock_column: pd.DataFrame, timesteps: int):
    fill_value = [[[0] * len(df.columns)]]

    def shiftsum(x, timesteps: int):
        tmp = x.copy()
        for i in range(1, timesteps):
            tmp = x.shift(i, fill_value=fill_value) + tmp
        return tmp

    logger.info("Grouping each row's features into a single list column...")
    df_reshaped = df.apply(list, axis=1).apply(lambda x: [list(x)])
    df_reshaped = pd.concat([df_reshaped, stock_column], axis=1)

    logger.info("Grouping by stock and creating timesteps...")
    df_reshaped = (
        df_reshaped.groupby(GROUPING_COLUMN)[0]
        .apply(lambda x: shiftsum(x, timesteps))
        .reset_index(GROUPING_COLUMN, drop=True)
        .rename("RECURRENT_FEATURES")
    )
    df_reshaped = pd.DataFrame(df_reshaped)

    return df_reshaped

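How the nested-list cells built by `reshape_df` turn into the 3-D arrays returned by `reshape_time_series` (toy numbers; timesteps=2, three features):

```python
import numpy as np

recurrent_cells = [
    [[0.0, 0.0, 0.0], [1.0, 2.0, 3.0]],  # first row of a stock: zero padding + its features
    [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]],  # second row: previous features + current features
]
x = np.array(recurrent_cells)
print(x.shape)  # (2, 2, 3) -> (rows, timesteps, features), the layout recurrent layers expect
```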
def load_train_data(dataset_dir, target_number, target_type="regression"):
    train_data_dir = f"{dataset_dir}/data"
    preprocessing_dir = f"{dataset_dir}/preprocessing"

    _scaler_y = (
        joblib.load(f"{preprocessing_dir}/scaler_y_{target_number}.pkl")
        if target_type == "regression"
        else None
    )

    logger.info("Loading data...")
    train = joblib.load(f"{train_data_dir}/train.pkl")
    val = joblib.load(f"{train_data_dir}/val.pkl")
    train_scaled = joblib.load(f"{train_data_dir}/train_scaled.pkl")
    val_scaled = joblib.load(f"{train_data_dir}/val_scaled.pkl")

    return train, val, train_scaled, val_scaled, _scaler_y
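Example call, as a sketch (the pickles under `<dataset_dir>/data` are assumed to be written by the training pipeline rather than by this module, and `dataset.path` is the directory created in `create_sets_from_data`):

```python
train, val, train_scaled, val_scaled, scaler_y = load_train_data(
    dataset.path, target_number=1, target_type="regression"
)
```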