lecrapaud 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lecrapaud might be problematic.
- lecrapaud/__init__.py +1 -0
- lecrapaud/api.py +277 -0
- lecrapaud/config.py +10 -0
- lecrapaud/db/__init__.py +1 -0
- lecrapaud/db/alembic/env.py +2 -2
- lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +24 -12
- lecrapaud/db/alembic/versions/2025_06_17_1652-c45f5e49fa2c_make_fields_nullable.py +89 -0
- lecrapaud/db/alembic.ini +116 -0
- lecrapaud/db/models/__init__.py +10 -10
- lecrapaud/db/models/base.py +176 -1
- lecrapaud/db/models/dataset.py +25 -20
- lecrapaud/db/models/feature.py +5 -6
- lecrapaud/db/models/feature_selection.py +3 -4
- lecrapaud/db/models/feature_selection_rank.py +3 -4
- lecrapaud/db/models/model.py +3 -4
- lecrapaud/db/models/model_selection.py +15 -8
- lecrapaud/db/models/model_training.py +15 -7
- lecrapaud/db/models/score.py +9 -6
- lecrapaud/db/models/target.py +16 -8
- lecrapaud/db/session.py +66 -0
- lecrapaud/experiment.py +64 -0
- lecrapaud/feature_engineering.py +747 -1022
- lecrapaud/feature_selection.py +915 -998
- lecrapaud/integrations/openai_integration.py +225 -0
- lecrapaud/jobs/__init__.py +2 -2
- lecrapaud/jobs/config.py +1 -1
- lecrapaud/jobs/scheduler.py +1 -1
- lecrapaud/jobs/tasks.py +6 -6
- lecrapaud/model_selection.py +1060 -960
- lecrapaud/search_space.py +4 -0
- lecrapaud/utils.py +2 -2
- lecrapaud-0.4.1.dist-info/METADATA +171 -0
- {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.1.dist-info}/RECORD +36 -35
- {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.1.dist-info}/WHEEL +1 -1
- lecrapaud/db/crud.py +0 -179
- lecrapaud/db/services.py +0 -0
- lecrapaud/db/setup.py +0 -58
- lecrapaud/predictions.py +0 -292
- lecrapaud/training.py +0 -151
- lecrapaud-0.4.0.dist-info/METADATA +0 -103
- /lecrapaud/{directory_management.py → directories.py} +0 -0
- {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.1.dist-info}/LICENSE +0 -0
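The largest change in this release is lecrapaud/feature_selection.py, where module-level helpers are replaced by a FeatureSelectionEngine class (plus a new PreprocessModel class and a load_train_data helper) that read their settings from a Dataset record. The sketch below is only an illustration of how the new class appears to be called, inferred from the diff that follows; the dataset object, target numbers, and argument values are assumptions, not documented API.

    # Hypothetical usage sketch inferred from the 0.4.1 diff below -- not documented API.
    from lecrapaud.feature_selection import FeatureSelectionEngine, load_train_data

    # Assumed: a Dataset record (with .path, .id, .percentile, .corr_threshold,
    # .max_features) and a train DataFrame whose target columns are named TARGET_{n}.
    train, val, test, *_ = load_train_data(dataset.path, target_number=1)

    engine = FeatureSelectionEngine(
        train=train,
        dataset=dataset,        # Dataset ORM record from lecrapaud.db
        target_number=1,        # selects column TARGET_1
        target_clf=[2, 4, 6],   # target numbers treated as classification
    )
    features = engine.run(single_process=True)  # returns the selected feature names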
lecrapaud/feature_selection.py
CHANGED
@@ -33,970 +33,1027 @@ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 from sklearn.model_selection import TimeSeriesSplit
 from sklearn.metrics import root_mean_squared_error, log_loss, make_scorer
 from mlxtend.feature_selection import SequentialFeatureSelector
-from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
-from sklearn.compose import ColumnTransformer
-import category_encoders as ce
-from scipy.stats import spearmanr, kendalltau
-
-# Scaling
 from sklearn.preprocessing import StandardScaler, MinMaxScaler
+from scipy.stats import spearmanr, kendalltau

 # Internal
-from
-from
-from
-from
+from lecrapaud.directories import tmp_dir, clean_directory
+from lecrapaud.utils import logger
+from lecrapaud.config import PYTHON_ENV
+from lecrapaud.db import (
     Dataset,
     Target,
     Feature,
     FeatureSelection,
     FeatureSelectionRank,
 )
-from
-
-# Variables for targets handling
-TARGETS_NUMBER = range(1, 15)
-TARGETS_CLF = [2, 4, 6, 8, 9, 10, 11]
-TARGETS_MCLF = [11]
-GROUPING_COLUMN = "STOCK"
-DATE_COLUMN = "DATE"
+from lecrapaud.db.session import get_db
+from lecrapaud.search_space import all_models

 # Annoying Warnings
 warnings.filterwarnings("ignore", category=FutureWarning)


-def
-
-):
-    number_of_groups = df[GROUPING_COLUMN].nunique()
-
-    # Try to convert DATE column to datetime safely
-    if pd.api.types.is_integer_dtype(df[DATE_COLUMN]):
-        df_date = df[DATE_COLUMN].map(pd.Timestamp.fromordinal)
-    else:
-        df_date = pd.to_datetime(
-            df[DATE_COLUMN], errors="coerce"
-        )  # convert strings, datetime, etc.
-
-    name = f"data_{number_of_groups}_{corr_threshold}_{percentile}_{max_features}_{df_date.min().date()}_{df_date.max().date()}"
-    if PYTHON_ENV == "Test":
-        name = f"test_{name}"
-    return name
-
-
-def create_sets_from_data(
-    df: pd.DataFrame,
-    corr_threshold: int = 80,
-    percentile: int = 20,
-    max_features: int = 20,
-):
-
-    df.sort_values([DATE_COLUMN, GROUPING_COLUMN], inplace=True)
-
-    # Drop non-useful column for training
-    if "ISIN" in df.columns:
-        df.drop(labels=["ISIN"], axis=1, inplace=True)
-    if "SECURITY" in df.columns:
-        df.drop(labels=["SECURITY"], axis=1, inplace=True)
+def load_train_data(dataset_dir, target_number, target_type="regression"):
+    data_dir = f"{dataset_dir}/data"

-
+    logger.info("Loading data...")
+    train = joblib.load(f"{data_dir}/train.pkl")
+    val = joblib.load(f"{data_dir}/val.pkl")
+    test = joblib.load(f"{data_dir}/test.pkl")
+    try:
+        train_scaled = joblib.load(f"{data_dir}/train_scaled.pkl")
+        val_scaled = joblib.load(f"{data_dir}/val_scaled.pkl")
+        test_scaled = joblib.load(f"{data_dir}/test_scaled.pkl")
+    except FileNotFoundError:
+        train_scaled = None
+        val_scaled = None
+        test_scaled = None
+
+    return train, val, test, train_scaled, val_scaled, test_scaled
+
+
+class FeatureSelectionEngine:
+    def __init__(self, train, dataset, target_number, target_clf, **kwargs):
+        self.dataset = dataset
+        self.train = train
+        self.target_number = target_number
+        self.target_clf = target_clf
+
+        self.target_type = (
+            "classification" if self.target_number in self.target_clf else "regression"
+        )
+        self.percentile = self.dataset.percentile
+        self.corr_threshold = self.dataset.corr_threshold
+        self.max_features = self.dataset.max_features
+
+        self.dataset_dir = self.dataset.path
+        self.dataset_id = self.dataset.id
+        self.data_dir = f"{self.dataset_dir}/data"
+        self.preprocessing_dir = f"{self.dataset_dir}/preprocessing"
+        self.fs_dir_target = (
+            f"{self.dataset_dir}/{f"TARGET_{self.target_number}"}/feature_selection"
+        )
+        os.makedirs(self.fs_dir_target, exist_ok=True)
+
+    # Main feature selection function
+    def run(
+        self,
+        single_process: bool = True,
+    ):
+        """Function to do feature selection with a range of different feature selection technics
+
+        Args:
+            - train (pd.DataFrame): a pandas train set
+            - target_number (in): a target, targets need to be name ``TARGET_{n}```
+            - single_process (bool): if True, run all feature selection methods in a single process. If False, run them in parallel.
+        """
+        target_number = self.target_number
+        target_type = self.target_type
+        fs_dir_target = self.fs_dir_target
+
+        # Create the feature selection in db
+        target = Target.find_by(name=f"TARGET_{target_number}")
+        percentile = self.percentile
+        corr_threshold = self.corr_threshold
+        max_features = self.max_features
+
+        feature_selection = FeatureSelection.upsert(
+            match_fields=["target_id", "dataset_id"],
+            target_id=target.id,
+            dataset_id=self.dataset_id,
+        )

-
-
+        if feature_selection.best_features_path:
+            return joblib.load(feature_selection.best_features_path)

-
-
-    test = df[df[DATE_COLUMN].isin(dates[test_first_id:])]
+        self.X = self.train.loc[:, ~self.train.columns.str.contains("^TARGET_")]
+        self.y = self.train[f"TARGET_{target_number}"]

-
-
-    dates["end_date"] = pd.to_datetime(df[DATE_COLUMN].iat[-1])
-    for name, data in zip(["train", "val", "test"], [train, val, test]):
-        dates[f"{name}_start_date"] = pd.to_datetime(data[DATE_COLUMN].iat[0])
-        dates[f"{name}_end_date"] = pd.to_datetime(data[DATE_COLUMN].iat[-1])
+        logger.info(f"Starting feature selection for TARGET_{target_number}...")
+        clean_directory(self.fs_dir_target)

-
-
+        # Let's start by removing extremly correlated features
+        # This is needed to reduce nb of feature but also for methods such as anova or chi2 that requires independent features
+        # TODO: we could also remove low variance features
+        features_uncorrelated, features_correlated = self.remove_correlated_features(
+            90, vizualize=False
         )
-
-
-
-
-
-
-
-
-
-    dataset_dir = f"{tmp_dir}/{dataset_name}"
-    preprocessing_dir = f"{dataset_dir}/preprocessing"
-    train_data_dir = f"{dataset_dir}/data"
-    os.makedirs(dataset_dir, exist_ok=True)
-    os.makedirs(preprocessing_dir, exist_ok=True)
-    os.makedirs(train_data_dir, exist_ok=True)
-
-    dataset = datasets[name] = Dataset.upsert(
-        match_fields=["name"],
-        db=db,
-        name=dataset_name,
-        path=Path(dataset_dir).resolve(),
-        type="training",
-        size=df.shape[0],
-        train_size=train.shape[0],
-        val_size=val.shape[0],
-        test_size=test.shape[0],
-        number_of_groups=data[GROUPING_COLUMN].nunique(),
-        list_of_groups=data[GROUPING_COLUMN].unique().tolist(),
-        corr_threshold=corr_threshold,
-        percentile=percentile,
-        max_features=max_features,
-        **dates,
-        targets=matched_targets,
+        self.X = self.X[features_uncorrelated]
+
+        logger.debug(
+            f"""
+            \nWe first have removed {len(features_correlated)} features with correlation greater than 90%
+            \nWe are looking to capture {percentile}% of {len(self.X.columns)} features, i.e. {int(len(self.X.columns)*percentile/100)} features, with different feature selection methods
+            \nWe will then remove above {corr_threshold}% correlated features, keeping the one with the best ranks
+            \nFinally, we will keep only the {max_features} best ranked features
+            """
         )

-
-    train = encode_categorical_features(train, fit=True, save_dir=preprocessing_dir)
-    val = encode_categorical_features(val, save_dir=preprocessing_dir)
-    test = encode_categorical_features(test, save_dir=preprocessing_dir)
-
-    # save the full data
-    if PYTHON_ENV != "Test":
-        joblib.dump(df, f"{train_data_dir}/full.pkl")
+        start = time.time()

-
+        # handling categorical features (only if classification)
+        self.X_categorical, self.X_numerical = get_features_by_types(self.X)

+        if target_type == "classification" and self.X_categorical.shape[1] > 0:
+            feat_scores = self.select_categorical_features(
+                percentile=percentile, save_dir=fs_dir_target
+            )
+            with get_db() as db:
+                for row in feat_scores.itertuples(index=False):
+                    feature = Feature.find_by(name=row.features, db=db)
+                    FeatureSelectionRank.upsert(
+                        ["feature_selection_id", "feature_id", "method"],
+                        db=db,
+                        score=row.score,
+                        pvalue=row.pvalue,
+                        support=row.support,
+                        rank=row.rank,
+                        method=row.method,
+                        training_time=row.training_time,
+                        feature_selection_id=feature_selection.id,
+                        feature_id=feature.id,
+                    )
+            categorical_features_selected = feat_scores[feat_scores["support"]][
+                "features"
+            ].values.tolist()
+
+        results = []
+        params = {"percentile": percentile, "save_dir": fs_dir_target}
+        if single_process:
+            results = [
+                self.select_feature_by_linear_correlation(**params),
+                self.select_feature_by_nonlinear_correlation(**params),
+                self.select_feature_by_mi(**params),
+                self.select_feature_by_feat_imp(**params),
+                self.select_feature_by_rfe(**params),
+                # self.select_feature_by_sfs(
+                #     **params
+                # ),  # TODO: this is taking too long
+            ]
+        else:
+            # Use ProcessPoolExecutor to run tasks in parallel
+            # TODO: not sure it's efficient from previous tests... especially because rfe and sfs methods are doing parallel processing already, this can create overhead
+            with ProcessPoolExecutor() as executor:
+                # Submit different functions to be executed in parallel
+                futures = [
+                    executor.submit(
+                        self.select_feature_by_linear_correlation,
+                        **params,
+                    ),
+                    executor.submit(
+                        self.select_feature_by_nonlinear_correlation,
+                        **params,
+                    ),
+                    executor.submit(
+                        self.select_feature_by_mi,
+                        **params,
+                    ),
+                    executor.submit(
+                        self.select_feature_by_feat_imp,
+                        **params,
+                    ),
+                    executor.submit(
+                        self.select_feature_by_rfe,
+                        **params,
+                    ),
+                    # executor.submit(
+                    #     self.select_feature_by_sfs,
+                    #     **params,
+                    # ),  # TODO: this is taking too long
+                ]
+
+                # Wait for all futures to complete and gather the results
+                with tqdm(total=len(futures)) as pbar:
+                    for future in as_completed(futures):
+                        results.append(future.result())
+                        pbar.update(1)
+
+        logger.info(f"Finished feature selection for target {target_number}")
+
+        stop = time.time()
+
+        # Once all tasks are completed, start by inserting results to db
+        feat_scores = pd.concat(
+            results,
+            axis=0,
+        )

-
+        logger.info("Inserting feature selection results to db...")
+        rows = []
+        with get_db() as db:
+            feature_map = {f.name: f.id for f in Feature.get_all(db=db, limit=20000)}
+            for row in feat_scores.itertuples(index=False):
+                feature_id = feature_map.get(row.features)
+                if not feature_id:
+                    continue  # or raise if feature must exist
+
+                rows.append(
+                    {
+                        "feature_selection_id": feature_selection.id,
+                        "feature_id": feature_id,
+                        "method": row.method,
+                        "score": row.score,
+                        "pvalue": None if pd.isna(row.pvalue) else row.pvalue,
+                        "support": row.support,
+                        "rank": row.rank,
+                        "training_time": row.training_time,
+                    }
+                )

-
-
+            if len(rows) == 0:
+                raise ValueError(f"No features selected for TARGET_{target_number}")

-
-    X.loc[:, DATE_COLUMN] = pd.to_datetime(X[DATE_COLUMN]).map(pd.Timestamp.toordinal)
+            FeatureSelectionRank.bulk_upsert(rows=rows, db=db)

-
-
-
+        # Merge the results
+        logger.info("Merging feature selection methods...")
+        features_selected = feat_scores[feat_scores["support"]][["features", "rank"]]
+        features_selected.sort_values("rank", inplace=True)
+        features_selected.drop_duplicates("features", inplace=True)

-
+        features_selected_list = features_selected["features"].values.tolist()

-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            binary_encoding_features,
-        ),
-    ],
-    remainder="passthrough",  # Keep the non-encoded columns like 'DATE'
+        # analysis 1
+        features_selected_by_every_methods = set(results[0]["features"].values.tolist())
+        for df in results[1:]:
+            features_selected_by_every_methods &= set(
+                df["features"].values.tolist()
+            )  # intersection
+        features_selected_by_every_methods = list(features_selected_by_every_methods)
+        logger.debug(
+            f"We selected {len(features_selected_list)} features and {len(features_selected_by_every_methods)} were selected unanimously:"
+        )
+        logger.debug(features_selected_by_every_methods)
+        pd.Series(features_selected_list).to_csv(
+            f"{fs_dir_target}/features_before_corr.csv",
+            index=True,
+            header=True,
+            index_label="ID",
         )
-    transformed_data = column_transformer.fit_transform(X)
-    if PYTHON_ENV != "Test":
-        joblib.dump(column_transformer, f"{save_dir}/column_transformer.pkl")
-    else:
-        # Load the ColumnTransformer and apply it
-        column_transformer = joblib.load(f"{save_dir}/column_transformer.pkl")
-
-        transformed_data = column_transformer.transform(X)
-
-    # Convert to DataFrame for readability and return
-    transformed_X = pd.DataFrame(
-        transformed_data,
-        columns=[
-            feature.split("__")[1]
-            for feature in column_transformer.get_feature_names_out()
-        ],
-        index=X.index,
-    )
-    transformed_X = transformed_X.apply(pd.to_numeric)
-    for col in [
-        feature.split("__")[1]
-        for feature in column_transformer.get_feature_names_out()
-        if "remainder" not in feature
-    ] + [DATE_COLUMN]:
-        transformed_X[col] = transformed_X[col].astype(int)
-
-    # Insert features in db
-    if fit:
-        # TODO: in bulk
-        for feature in transformed_X.columns:
-            dtype = transformed_X[feature].dtype
-            if pd.api.types.is_integer_dtype(dtype):
-                feature_type = "categorical"
-            elif pd.api.types.is_float_dtype(dtype):
-                feature_type = "numerical"
-            else:
-                feature_type = "other"
-            Feature.upsert(match_fields=["name"], name=feature, type=feature_type)
-        for target in y.columns:
-            type = (
-                "classification"
-                if int(target.split("_")[1]) in TARGETS_CLF
-                else "regression"
-            )
-            # TODO: what about description here ?
-            Target.upsert(match_fields=["name", "type"], name=target, type=type)
-
-    return pd.concat([transformed_X, y], axis=1)

+        # removing correlated features
+        self.X = self.X[features_selected_list]
+        features, features_correlated = self.remove_correlated_features(corr_threshold)
+        pd.Series(features).to_csv(
+            f"{fs_dir_target}/features_before_max.csv",
+            index=True,
+            header=True,
+            index_label="ID",
+        )
+        features = features[:max_features]

-    #
-
-
-
-
-
+        # adding categorical features selected
+        features += (
+            categorical_features_selected if target_type == "classification" else []
+        )
+        logger.debug(
+            f"Final pre-selection: {len(features)} features below {corr_threshold}% out of {len(features_selected_list)} features, and rejected {len(features_correlated)} features, {100*len(features)/len(features_selected_list):.2f}% features selected"
+        )

-
+        # analysis 2
+        features_selected_by_every_methods_uncorrelated = list(
+            set(features) & set(features_selected_by_every_methods)
+        )
+        logger.debug(
+            f"In this pre-selection, there is {len(features_selected_by_every_methods_uncorrelated)} features from the {len(features_selected_by_every_methods)} selected unanimously\n"
+        )
+        logger.debug(
+            features_selected[
+                features_selected["features"].isin(features)
+            ].to_markdown()
+        )

-
-
-
-
-
+        # save to path
+        best_features_path = Path(
+            f"{self.preprocessing_dir}/features_{target_number}.pkl"
+        ).resolve()
+        joblib.dump(features, best_features_path)

-
-
+        # save in db
+        db_features = Feature.filter(name__in=features)
+        # Order matters, to keep the same order in db as in features, we need: map features by name
+        feature_by_name = {f.name: f for f in db_features}
+        # Reorder them according to original `features` list
+        ordered_db_features = [
+            feature_by_name[name] for name in features if name in feature_by_name
+        ]

-
-
+        feature_selection = FeatureSelection.get(feature_selection.id)
+        feature_selection = feature_selection.add_features(ordered_db_features)
+        feature_selection.training_time = stop - start
+        feature_selection.best_features_path = best_features_path
+        feature_selection.save()

-
+        return features

-
+    # Remove correlation
+    # ------------------

-
-
-
-
-
-    columns_binary_encoder = [
-        feature.split("__")[1]
-        for feature in column_transformer.get_feature_names_out()
-        if feature.split("__")[0] == "binary_encoder"
-    ]
-    # Remove trailing "_number" using regex
-    columns_binary_encoder = {
-        re.sub(r"_\d+$", "", col) for col in columns_binary_encoder
-    }
-    columns_binary_encoder = list(columns_binary_encoder)
-
-    columns_remainder = [
-        feature.split("__")[1]
-        for feature in column_transformer.get_feature_names_out()
-        if feature.split("__")[0] == "remainder"
-    ]
-    columns = columns_ordinal + columns_binary_encoder + columns_remainder
-    decoded_X = pd.DataFrame(
-        retarr,
-        columns=columns,
-        index=index,
-    )
+    def remove_correlated_features(self, corr_threshold: int, vizualize: bool = False):
+        X = self.X
+        features = X.columns
+        # Create correlation matrix, select upper triangle & remove features with correlation greater than threshold
+        corr_matrix = X[features].corr().abs()

-
-
-
-
-
-
-
-
-
-
-
-
-# Filter methods
-# ----------------
-
-
-# Linear correlation (Person's R for regression and ANOVA for classification)
-def select_feature_by_linear_correlation(
-    X, y, target_type, percentile: int = 20, save_dir: Optional[str] = None
-):
-    start = time.time()
-    test_type = "Person’s R" if target_type == "regression" else "ANOVA"
-    logger.debug(f"Running {test_type}...")
-
-    model = f_regression if target_type == "regression" else f_classif
-    feat_selector = SelectPercentile(model, percentile=percentile).fit(X, y)
-    feat_scores = pd.DataFrame()
-    feat_scores["score"] = feat_selector.scores_
-    feat_scores["pvalue"] = feat_selector.pvalues_
-    feat_scores["support"] = feat_selector.get_support()
-    feat_scores["features"] = X.columns
-    feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
-    feat_scores["method"] = test_type
-    feat_scores.sort_values("rank", ascending=True, inplace=True)
-    stop = time.time()
-    training_time = timedelta(seconds=(stop - start)).total_seconds()
-    feat_scores["training_time"] = training_time
-
-    logger.debug(
-        f"{test_type} evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
-    )
+        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
+        features_uncorrelated = [
+            column
+            for column in upper.columns
+            if all(upper[column].dropna() <= corr_threshold / 100)
+        ]
+        features_correlated = [
+            column
+            for column in upper.columns
+            if any(upper[column] > corr_threshold / 100)
+        ]

-
-
-
-
-
-
+        if vizualize:
+            features_selected_visualization = (
+                X[features]
+                .corr()
+                .where(np.triu(np.ones(len(features)), k=1).astype(bool))
+                .fillna(0)
+            )
+            # Plot the heatmap
+            plt.figure(figsize=(10, 8))
+            sns.heatmap(
+                corr_matrix,
+                annot=True,
+                cmap="coolwarm",
+                center=0,
+                linewidths=1,
+                linecolor="black",
+            )
+            plt.title(f"Correlation Matrix")
+            plt.show()
+
+            logger.info(f"\n{features_selected_visualization.describe().to_string()}")
+            logger.info(f"\n{features_selected_visualization.to_string()}")
+        return features_uncorrelated, features_correlated
+
+    # Filter methods
+    # ----------------
+
+    def select_categorical_features(self, percentile, save_dir: Optional[str] = None):
+        X, y = self.X_categorical, self.y
+
+        start = time.time()
+        logger.debug("Running Chi2 for categorical features...")
+        feat_selector = SelectPercentile(chi2, percentile=percentile).fit(X, y)
+        feat_scores = pd.DataFrame()
+        feat_scores["score"] = feat_selector.scores_
+        feat_scores["pvalue"] = feat_selector.pvalues_
+        feat_scores["support"] = feat_selector.get_support()
+        feat_scores["features"] = X.columns
+        feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
+        feat_scores["method"] = "Chi2"
+        feat_scores.sort_values("rank", ascending=True, inplace=True)
+        stop = time.time()
+        training_time = timedelta(seconds=(stop - start)).total_seconds()
+        feat_scores["training_time"] = training_time
+
+        logger.debug(
+            f"Chi2 evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
+        )

-
+        feat_scores.to_csv(
+            f"{save_dir}/Chi2.csv", index=True, header=True, index_label="ID"
+        )

+        return feat_scores
+
+    # Linear correlation (Person's R for regression and ANOVA for classification)
+    def select_feature_by_linear_correlation(
+        self, percentile: int = 20, save_dir: Optional[str] = None
+    ):
+        X, y, target_type = self.X_numerical, self.y, self.target_type
+
+        start = time.time()
+        test_type = "Person's R" if target_type == "regression" else "ANOVA"
+        logger.debug(f"Running {test_type}...")
+
+        model = f_regression if target_type == "regression" else f_classif
+        feat_selector = SelectPercentile(model, percentile=percentile).fit(X, y)
+        feat_scores = pd.DataFrame()
+        feat_scores["score"] = feat_selector.scores_
+        feat_scores["pvalue"] = feat_selector.pvalues_
+        feat_scores["support"] = feat_selector.get_support()
+        feat_scores["features"] = X.columns
+        feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
+        feat_scores["method"] = test_type
+        feat_scores.sort_values("rank", ascending=True, inplace=True)
+        stop = time.time()
+        training_time = timedelta(seconds=(stop - start)).total_seconds()
+        feat_scores["training_time"] = training_time
+
+        logger.debug(
+            f"{test_type} evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
+        )

-
-
-
-
-
+        feat_scores.to_csv(
+            f"{save_dir}/{test_type}.csv",
+            index=True,
+            header=True,
+            index_label="ID",
+        )

-
-        X_model = pd.DataFrame(X_model)
-        y_model = pd.Series(y_model)
+        return feat_scores

-
+    # Non-Linear correlation (Spearsman's R for regression and Kendall's Tau for classification)
+    def select_feature_by_nonlinear_correlation(
+        self, percentile: int = 20, save_dir: Optional[str] = None
+    ):
+        X, y, target_type = self.X_numerical, self.y, self.target_type

-
-        p_values = []
+        start = time.time()

-
-
-
-        else:  # Kendall's Tau for classification
-            corr, pval = kendalltau(X_model[col], y_model)
+        def model(X_model, y_model):
+            X_model = pd.DataFrame(X_model)
+            y_model = pd.Series(y_model)

-
-            p_values.append(pval)
+            method = "spearman" if target_type == "regression" else "kendall"

-
+            corr_scores = []
+            p_values = []

-
-
+            for col in X_model.columns:
+                if method == "spearman":
+                    corr, pval = spearmanr(X_model[col], y_model)
+                else:  # Kendall's Tau for classification
+                    corr, pval = kendalltau(X_model[col], y_model)

-
-
-    feat_scores["score"] = feat_selector.scores_
-    feat_scores["pvalue"] = feat_selector.pvalues_
-    feat_scores["support"] = feat_selector.get_support()
-    feat_scores["features"] = X.columns
-    feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
-    feat_scores["method"] = test_type
-    feat_scores.sort_values("rank", ascending=True, inplace=True)
-    stop = time.time()
-    training_time = timedelta(seconds=(stop - start)).total_seconds()
-    feat_scores["training_time"] = training_time
+                corr_scores.append(abs(corr))  # Keeping absolute correlation
+                p_values.append(pval)

-
-        f"{test_type} evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
-    )
+            return np.array(corr_scores), np.array(p_values)

-
-        f"{
-        index=True,
-        header=True,
-        index_label="ID",
-    )
+        test_type = "Spearman's R" if target_type == "regression" else "Kendall's Tau"
+        logger.debug(f"Running {test_type}...")

-
+        feat_selector = SelectPercentile(model, percentile=percentile).fit(X, y)
+        feat_scores = pd.DataFrame()
+        feat_scores["score"] = feat_selector.scores_
+        feat_scores["pvalue"] = feat_selector.pvalues_
+        feat_scores["support"] = feat_selector.get_support()
+        feat_scores["features"] = X.columns
+        feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
+        feat_scores["method"] = test_type
+        feat_scores.sort_values("rank", ascending=True, inplace=True)
+        stop = time.time()
+        training_time = timedelta(seconds=(stop - start)).total_seconds()
+        feat_scores["training_time"] = training_time

+        logger.debug(
+            f"{test_type} evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
+        )

-
-
-
-
-
-
-    model = (
-        mutual_info_regression if target_type == "regression" else mutual_info_classif
-    )
-    feat_selector = SelectPercentile(model, percentile=percentile).fit(X, y)
-    feat_scores = pd.DataFrame()
-    feat_scores["score"] = feat_selector.scores_
-    feat_scores["support"] = feat_selector.get_support()
-    feat_scores["features"] = X.columns
-    feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
-    feat_scores["method"] = "Mutual Information"
-    feat_scores.sort_values("rank", ascending=True, inplace=True)
-    stop = time.time()
-    training_time = timedelta(seconds=(stop - start)).total_seconds()
-    feat_scores["training_time"] = training_time
-
-    logger.debug(
-        f"MI evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
-    )
+        feat_scores.to_csv(
+            f"{save_dir}/{test_type}.csv",
+            index=True,
+            header=True,
+            index_label="ID",
+        )

-
-
-    return feat_scores
-
-
-def select_categorical_features(X, y, percentile, save_dir: Optional[str] = None):
-    start = time.time()
-    logger.debug("Running Chi2 for categorical features...")
-    feat_selector = SelectPercentile(chi2, percentile=percentile).fit(X, y)
-    feat_scores = pd.DataFrame()
-    feat_scores["score"] = feat_selector.scores_
-    feat_scores["pvalue"] = feat_selector.pvalues_
-    feat_scores["support"] = feat_selector.get_support()
-    feat_scores["features"] = X.columns
-    feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
-    feat_scores["method"] = "Chi2"
-    feat_scores.sort_values("rank", ascending=True, inplace=True)
-    stop = time.time()
-    training_time = timedelta(seconds=(stop - start)).total_seconds()
-    feat_scores["training_time"] = training_time
-
-    logger.debug(
-        f"Chi2 evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
-    )
+        return feat_scores

-
-
-
+    # Mutual Information
+    def select_feature_by_mi(
+        self, percentile: int = 20, save_dir: Optional[str] = None
+    ):
+        X, y, target_type = self.X_numerical, self.y, self.target_type

-
+        start = time.time()
+        logger.debug("Running Mutual Information...")
+        model = (
+            mutual_info_regression
+            if target_type == "regression"
+            else mutual_info_classif
+        )
+        feat_selector = SelectPercentile(model, percentile=percentile).fit(X, y)
+        feat_scores = pd.DataFrame()
+        feat_scores["score"] = feat_selector.scores_
+        feat_scores["support"] = feat_selector.get_support()
+        feat_scores["features"] = X.columns
+        feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
+        feat_scores["method"] = "Mutual Information"
+        feat_scores.sort_values("rank", ascending=True, inplace=True)
+        stop = time.time()
+        training_time = timedelta(seconds=(stop - start)).total_seconds()
+        feat_scores["training_time"] = training_time
+
+        logger.debug(
+            f"MI evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
+        )

+        feat_scores.to_csv(
+            f"{save_dir}/MI.csv", index=True, header=True, index_label="ID"
+        )

-
-# ----------------
+        return feat_scores

+    # Intrisic/embeedded method
+    # ----------------

-# feature importance
-def select_feature_by_feat_imp(
-
-):
-
-    logger.debug("Running Feature importance...")
+    # feature importance
+    def select_feature_by_feat_imp(
+        self, percentile: int = 20, save_dir: Optional[str] = None
+    ):
+        X, y, target_type = self.X_numerical, self.y, self.target_type

-
+        start = time.time()
+        logger.debug("Running Feature importance...")

-
-
-
-
-
+        params = {
+            "n_estimators": 500,
+            "max_depth": 2**3,
+            "random_state": 42,
+            "n_jobs": -1,
+        }

-
-
-
-
-
-
-    feat_scores = pd.DataFrame()
-    feat_scores["score"] = feat_selector.estimator_.feature_importances_
-    feat_scores["support"] = feat_selector.get_support()
-    feat_scores["features"] = X.columns
-    feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
-    feat_scores["method"] = "FI"
-    feat_scores.sort_values("rank", ascending=True, inplace=True)
-
-    stop = time.time()
-    training_time = timedelta(seconds=(stop - start)).total_seconds()
-    feat_scores["training_time"] = training_time
-
-    logger.debug(
-        f"Feat importance evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
-    )
+        estimator = (
+            RandomForestClassifier(**params)
+            if target_type == "classification"
+            else RandomForestRegressor(**params)
+        )

-
+        feat_selector = SelectFromModel(
+            estimator=estimator,
+            threshold=-np.inf,
+            max_features=int(percentile * X.shape[1] / 100),
+        ).fit(X, y)
+
+        feat_scores = pd.DataFrame()
+        feat_scores["score"] = feat_selector.estimator_.feature_importances_
+        feat_scores["support"] = feat_selector.get_support()
+        feat_scores["features"] = X.columns
+        feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
+        feat_scores["method"] = "FI"
+        feat_scores.sort_values("rank", ascending=True, inplace=True)
+
+        stop = time.time()
+        training_time = timedelta(seconds=(stop - start)).total_seconds()
+        feat_scores["training_time"] = training_time
+
+        logger.debug(
+            f"Feat importance evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
+        )

-
+        feat_scores.to_csv(
+            f"{save_dir}/FI.csv", index=True, header=True, index_label="ID"
+        )

+        return feat_scores

-# Wrapper method
-# ----------------
+    # Wrapper method
+    # ----------------

+    # recursive feature elimination
+    def select_feature_by_rfe(
+        self, percentile: int = 20, save_dir: Optional[str] = None
+    ):
+        X, y, target_type = self.X_numerical, self.y, self.target_type

-
-
-    X, y, target_type, percentile: int = 20, save_dir: Optional[str] = None
-):
-    start = time.time()
-    logger.debug("Running Recursive Feature Elimination...")
+        start = time.time()
+        logger.debug("Running Recursive Feature Elimination...")

-
-
-
-    }
-    estimator = (
-        DecisionTreeClassifier(**params)
-        if target_type == "classification"
-        else DecisionTreeRegressor(**params)
-    )
-    rfe = RFE(estimator, n_features_to_select=percentile / 100, step=4, verbose=0)
-    feat_selector = rfe.fit(X, y)
-
-    feat_scores = pd.DataFrame(
-        {
-            "score": 0.0,  # Default feature importance
-            "support": feat_selector.get_support(),
-            "features": X.columns,
-            "rank": 0,
-            "method": "RFE",
+        params = {
+            "max_depth": 2**3,
+            "random_state": 42,
         }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        estimator = (
+            DecisionTreeClassifier(**params)
+            if target_type == "classification"
+            else DecisionTreeRegressor(**params)
+        )
+        rfe = RFE(estimator, n_features_to_select=percentile / 100, step=4, verbose=0)
+        feat_selector = rfe.fit(X, y)
+
+        feat_scores = pd.DataFrame(
+            {
+                "score": 0.0,  # Default feature importance
+                "support": feat_selector.get_support(),
+                "features": X.columns,
+                "rank": 0,
+                "method": "RFE",
+            }
+        )
+        feat_scores.loc[
+            feat_scores["features"].isin(feat_selector.get_feature_names_out()), "score"
+        ] = list(feat_selector.estimator_.feature_importances_)
+        feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
+        feat_scores.sort_values("rank", ascending=True, inplace=True)
+
+        stop = time.time()
+        training_time = timedelta(seconds=(stop - start)).total_seconds()
+        feat_scores["training_time"] = training_time
+
+        logger.debug(
+            f"RFE evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
+        )

+        feat_scores.to_csv(
+            f"{save_dir}/RFE.csv", index=True, header=True, index_label="ID"
+        )

-
-def select_feature_by_sfs(
-    X, y, target_type, percentile: int = 20, save_dir: Optional[str] = None
-):
-    start = time.time()
-    logger.debug("Running Sequential Feature Selection...")
-    warnings.filterwarnings("ignore", category=FutureWarning)
+        return feat_scores

-
-
-
-
-
-        DecisionTreeClassifier(**params)
-        if target_type == "classification"
-        else DecisionTreeRegressor(**params)
-    )
+    # SequentialFeatureSelector (loss based, possibility to do forwards or backwards selection or removal)
+    def select_feature_by_sfs(
+        self, percentile: int = 20, save_dir: Optional[str] = None
+    ):
+        X, y, target_type = self.X_numerical, self.y, self.target_type

-
-
-
-    tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
-
-    score_function = (
-        make_scorer(
-            log_loss, response_method="predict_proba"
-        )  # logloss needs probabilities
-        if target_type == "classification"
-        else make_scorer(root_mean_squared_error)
-    )  # we avoid greater_is_better = False because it make the score negative and mess up ranking
-
-    sfs = SequentialFeatureSelector(
-        estimator,
-        k_features=int(percentile * X.shape[1] / 100),
-        forward=True,
-        floating=True,  # Enables dynamic feature elimination
-        scoring=score_function,
-        cv=tscv,
-        n_jobs=-1,
-        verbose=0,
-    )
+        start = time.time()
+        logger.debug("Running Sequential Feature Selection...")
+        warnings.filterwarnings("ignore", category=FutureWarning)

-
-
-
-    selected_features = set(feat_selector.k_feature_names_)
-    feat_subsets = feat_selector.subsets_
-
-    # Create DataFrame for feature scores
-    feat_scores = pd.DataFrame(
-        {
-            "features": X.columns,
-            "support": X.columns.isin(
-                selected_features
-            ),  # TODO: comprendre pourquoi le support n'est pas correct (les bons scores ne sont pas toujours choisis)
-            "score": 1000,
-            "rank": None,
-            "method": "SFS",
+        params = {
+            "max_depth": 2**3,
+            "random_state": 42,
         }
-
-
-
-
-
-    # Record score per feature (first appearance)
-    feature_score_map = {}
-    for step in sorted_subsets:
-        step = step[1]
-        for feature in step["feature_names"]:
-            if feature not in feature_score_map:
-                feature_score_map[feature] = step["avg_score"]
-
-    # Assign scores
-    for feature, score in feature_score_map.items():
-        feat_scores.loc[feat_scores["features"] == feature, "score"] = score
+        estimator = (
+            DecisionTreeClassifier(**params)
+            if target_type == "classification"
+            else DecisionTreeRegressor(**params)
+        )

-
-
-
-
+        n_splits = 3
+        n_samples = len(X)
+        test_size = int(n_samples / (n_splits + 4))
+        tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
+
+        score_function = (
+            make_scorer(
+                log_loss, response_method="predict_proba"
+            )  # logloss needs probabilities
+            if target_type == "classification"
+            else make_scorer(root_mean_squared_error)
+        )  # we avoid greater_is_better = False because it make the score negative and mess up ranking
+
+        sfs = SequentialFeatureSelector(
+            estimator,
+            k_features=int(percentile * X.shape[1] / 100),
+            forward=True,
+            floating=True,  # Enables dynamic feature elimination
+            scoring=score_function,
+            cv=tscv,
+            n_jobs=-1,
+            verbose=0,
+        )

-
+        feat_selector = sfs.fit(X, y)
+
+        # Extract selected features and their scores
+        selected_features = set(feat_selector.k_feature_names_)
+        feat_subsets = feat_selector.subsets_
+
+        # Create DataFrame for feature scores
+        feat_scores = pd.DataFrame(
+            {
+                "features": X.columns,
+                "support": X.columns.isin(
+                    selected_features
+                ),  # TODO: comprendre pourquoi le support n'est pas correct (les bons scores ne sont pas toujours choisis)
+                "score": 1000,
+                "rank": None,
+                "method": "SFS",
+            }
+        )

-
-
-
+        # Sort subsets by score (lower is better)
+        sorted_subsets = sorted(
+            feat_subsets.items(), key=lambda item: item[1]["avg_score"]
+        )

-
-
-
+        # Record score per feature (first appearance)
+        feature_score_map = {}
+        for step in sorted_subsets:
+            step = step[1]
+            for feature in step["feature_names"]:
+                if feature not in feature_score_map:
+                    feature_score_map[feature] = step["avg_score"]
+
+        # Assign scores
+        for feature, score in feature_score_map.items():
+            feat_scores.loc[feat_scores["features"] == feature, "score"] = score
+
+        # rank by score (lower = better)
+        feat_scores["rank"] = (
+            feat_scores["score"].rank(method="first", ascending=True).astype(int)
+        )

-
+        feat_scores.sort_values("rank", ascending=True, inplace=True)

-
+        stop = time.time()
+        training_time = timedelta(seconds=(stop - start)).total_seconds()
+        feat_scores["training_time"] = training_time

+        logger.debug(
+            f"SFS evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
+        )

-
-
+        feat_scores.to_csv(
+            f"{save_dir}/SFS.csv", index=True, header=True, index_label="ID"
+        )

+        return feat_scores
+
+
+class PreprocessModel:
+
+    def __init__(
+        self,
+        train,
+        val,
+        test,
+        dataset,
+        target_numbers,
+        target_clf,
+        models_idx,
+        time_series,
+        max_timesteps,
+        group_column,
+        date_column,
+        **kwargs,
+    ):
+        self.dataset = dataset
+        self.target_numbers = target_numbers
+        self.target_clf = target_clf
+        self.models_idx = models_idx
+        self.time_series = time_series
+        self.max_timesteps = max_timesteps
+        self.group_column = group_column
+        self.date_column = date_column
+
+        self.dataset_dir = dataset.path
+        self.data_dir = f"{self.dataset_dir}/data"
+        self.preprocessing_dir = f"{self.dataset_dir}/preprocessing"
+
+        self.all_features = dataset.get_all_features(
+            date_column=date_column, group_column=group_column
+        )
+        columns_to_keep = self.all_features + [
+            f"TARGET_{i}" for i in self.target_numbers
+        ]
+        duplicates = [
+            col for col in set(columns_to_keep) if columns_to_keep.count(col) > 1
+        ]
+        if duplicates:
+            raise ValueError(f"Doublons détectés dans columns_to_keep: {duplicates}")
+
+        self.train = train[columns_to_keep]
+        if isinstance(val, pd.DataFrame):
+            self.val = val[columns_to_keep]
+        if isinstance(test, pd.DataFrame):
+            self.test = test[columns_to_keep]
+
+    def run(self):
+        # save data
+        joblib.dump(self.train, f"{self.data_dir}/train.pkl")
+        joblib.dump(self.val, f"{self.data_dir}/val.pkl")
+        joblib.dump(self.test, f"{self.data_dir}/test.pkl")
+
+        # scaling features
+        if any(t not in self.target_clf for t in self.target_numbers) and any(
+            all_models[i].get("need_scaling") for i in self.models_idx
+        ):
+            logger.info("Scaling features...")
+            train_scaled, scaler_x, scalers_y = self.scale_data(self.train)
+            val_scaled, _, _ = self.scale_data(
+                self.val,
+                scaler_x=scaler_x,
+                scalers_y=scalers_y,
+            )
+            test_scaled, _, _ = self.scale_data(
+                self.test,
+                scaler_x=scaler_x,
+                scalers_y=scalers_y,
+            )
+        else:
+            train_scaled = None
+            val_scaled = None
+            test_scaled = None
+
+        # save data
+        joblib.dump(train_scaled, f"{self.data_dir}/train_scaled.pkl")
+        joblib.dump(val_scaled, f"{self.data_dir}/val_scaled.pkl")
+        joblib.dump(test_scaled, f"{self.data_dir}/test_scaled.pkl")
+
+        data = {
+            "train": self.train,
+            "val": self.val,
+            "test": self.test,
+            "train_scaled": train_scaled,
+            "val_scaled": val_scaled,
+            "test_scaled": test_scaled,
+            "scalers_y": scalers_y,
+        }

-
-
-
-
-
+        # reshape data for time series
+        reshaped_data = None
+        if (
+            any(all_models[i].get("recurrent") for i in self.models_idx)
+            and self.time_series
+        ):
+            # reshaping data for recurrent models
+            logger.info("Reshaping data for recurrent models...")
+            reshaped_data = self.reshape_time_series(
+                train_scaled,
+                val_scaled,
+                test_scaled,
+                features=self.all_features,
+                timesteps=self.max_timesteps,
+            )

-
-    features_uncorrelated = [
-        column
-        for column in upper.columns
-        if all(upper[column].dropna() <= corr_threshold / 100)
-    ]
-    features_correlated = [
-        column for column in upper.columns if any(upper[column] > corr_threshold / 100)
-    ]
+        return data, reshaped_data

-
-
-
-
-
-    .
+    def inference(self):
+        # self.train is new data here
+        scaler_x = joblib.load(f"{self.preprocessing_dir}/scaler_x.pkl")
+        scaled_data = scaler_x.transform(self.train)
+        scaled_data = pd.DataFrame(
+            scaled_data, columns=self.train.columns, index=self.train.index
         )
-        # Plot the heatmap
-        plt.figure(figsize=(10, 8))
-        sns.heatmap(
-            corr_matrix,
-            annot=True,
-            cmap="coolwarm",
-            center=0,
-            linewidths=1,
-            linecolor="black",
-        )
-        plt.title(f"Correlation Matrix")
-        plt.show()
-
-        logger.info(f"\n{features_selected_visualization.describe().to_string()}")
-        logger.info(f"\n{features_selected_visualization.to_string()}")
-    return features_uncorrelated, features_correlated
-
-
-# Main feature selection function
-def feature_selection(
-    dataset_id: int,
-    train: pd.DataFrame,
-    target_number: int,
-    single_process: bool = False,
-):
-    """Function to do feature selection with a range of different feature selection technics
-
-    Args:
-        - train (pd.DataFrame): a pandas train set
-        - target_number (in): a target, targets need to be name ``TARGET_{n}```
-        - single_process (bool): if True, run all feature selection methods in a single process. If False, run them in parallel.
-    """
-
-    # Create the feature selection in db
-    target = Target.find_by(name=f"TARGET_{target_number}")
-    dataset = Dataset.get(dataset_id)
-    percentile = dataset.percentile
-    corr_threshold = dataset.corr_threshold
-    max_features = dataset.max_features
-
-    feature_selection = FeatureSelection.upsert(
-        match_fields=["target_id", "dataset_id"],
-        target_id=target.id,
-        dataset_id=dataset.id,
-    )
736
878
|
|
|
737
|
-
|
|
738
|
-
|
|
879
|
+
reshaped_data = None
|
|
880
|
+
if (
|
|
881
|
+
any(all_models[i].get("recurrent") for i in self.models_idx)
|
|
882
|
+
and self.time_series
|
|
883
|
+
):
|
|
884
|
+
# we need to make sur we have max_timesteps of data after grouping by group_column
|
|
885
|
+
if (
|
|
886
|
+
self.group_column
|
|
887
|
+
and scaled_data.groupby(self.group_column).size().min()
|
|
888
|
+
< self.max_timesteps
|
|
889
|
+
) or scaled_data.shape[0] < self.max_timesteps:
|
|
890
|
+
raise ValueError(
|
|
891
|
+
f"Not enough data for group_column {self.group_column} to reshape data for recurrent models"
|
|
892
|
+
)
|
|
739
893
|
|
|
740
|
-
|
|
894
|
+
# reshaping data for recurrent models
|
|
895
|
+
logger.info("Reshaping data for recurrent models...")
|
|
896
|
+
reshaped_data = self.reshape_time_series(
|
|
897
|
+
scaled_data,
|
|
898
|
+
features=self.all_features,
|
|
899
|
+
timesteps=self.max_timesteps,
|
|
900
|
+
)
|
|
741
901
|
|
|
742
|
-
|
|
902
|
+
return self.train, scaled_data, reshaped_data
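
The new `inference` path reuses the feature scaler persisted during training instead of refitting on incoming data, and re-wraps the resulting ndarray in a DataFrame so downstream code keeps column names and the original index. A standalone sketch of that load-and-transform pattern follows; the path and frame are examples, not values from the package.

# Standalone illustration of the pattern used by inference(): load a fitted scaler,
# transform new rows, and restore the DataFrame structure. Paths/columns are examples.
import joblib
import pandas as pd

def scale_new_rows(new_rows: pd.DataFrame, scaler_path: str) -> pd.DataFrame:
    scaler_x = joblib.load(scaler_path)    # StandardScaler fitted at training time
    scaled = scaler_x.transform(new_rows)  # plain ndarray; index/columns are lost here
    return pd.DataFrame(scaled, columns=new_rows.columns, index=new_rows.index)
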
+
+    # scaling
+    def scale_data(
+        self,
+        df: pd.DataFrame,
+        scaler_x=None,
+        scalers_y: Optional[list] = None,
+    ):
+        logger.info("Scale data...")
+        X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
+
+        if scaler_x:
+            X_scaled = pd.DataFrame(
+                scaler_x.transform(X), columns=list(X.columns), index=X.index
+            )
+        else:
+            scaler_x = StandardScaler()  # MinMaxScaler(feature_range=(-1,1))
+            X_scaled = pd.DataFrame(
+                scaler_x.fit_transform(X), columns=list(X.columns), index=X.index
+            )
+            joblib.dump(scaler_x, f"{self.preprocessing_dir}/scaler_x.pkl")

-
-
-
-
+        # Determine which targets need to be scaled
+        targets_numbers_to_scale = [
+            i for i in self.target_numbers if i not in self.target_clf
+        ]

-
-
-    # TODO: we could also remove low variance features
-    features_uncorrelated, features_correlated = remove_correlated_features(
-        X, X.columns, 90, vizualize=False
-    )
-    X = X[features_uncorrelated]
-
-    logger.debug(
-        f"""
-        \nWe first have removed {len(features_correlated)} features with correlation greater than 90%
-        \nWe are looking to capture {percentile}% of {len(X.columns)} features, i.e. {int(len(X.columns)*percentile/100)} features, with different feature selection methods
-        \nWe will then remove above {corr_threshold}% correlated features, keeping the one with the best ranks
-        \nFinally, we will keep only the {max_features} best ranked features
-        """
-    )
+        # Dictionary to store scaled target data
+        scaled_targets = {}

-
+        if scalers_y:
+            for target_number in targets_numbers_to_scale:
+                y = df[[f"TARGET_{target_number}"]]
+                scaled_targets[target_number] = pd.DataFrame(
+                    scalers_y[f"scaler_y_{target_number}"].transform(y.values),
+                    columns=y.columns,
+                    index=y.index,
+                )
+        else:
+            scalers_y = {}
+            for target_number in targets_numbers_to_scale:
+                scaler_y = StandardScaler()
+                y = df[[f"TARGET_{target_number}"]]
+
+                scaled_y = pd.DataFrame(
+                    scaler_y.fit_transform(y.values),
+                    columns=y.columns,
+                    index=y.index,
+                )
+                joblib.dump(
+                    scaler_y, f"{self.preprocessing_dir}/scaler_y_{target_number}.pkl"
+                )

-
-
-    X_categorical = X[categorical_features]
+                scalers_y[f"scaler_y_{target_number}"] = scaler_y
+                scaled_targets[target_number] = scaled_y

-
-
-
+        # Reconstruct y_scaled in the original order
+        y_scaled = pd.concat(
+            [
+                scaled_targets[target_number]
+                for target_number in targets_numbers_to_scale
+            ],
+            axis=1,
        )
-
-    for
-        feature = Feature.find_by(name=row.features, db=db)
-        FeatureSelectionRank.upsert(
-            ["feature_selection_id", "feature_id", "method"],
-            db=db,
-            score=row.score,
-            pvalue=row.pvalue,
-            support=row.support,
-            rank=row.rank,
-            method=row.method,
-            training_time=row.training_time,
-            feature_selection_id=feature_selection.id,
-            feature_id=feature.id,
-        )
-    categorical_features_selected = feat_scores[feat_scores["support"] == True][
-        "features"
-    ].values.tolist()
-
-    # removing categorical features from X
-    numerical_features = list(set(X.columns).difference(set(categorical_features)))
-    X_numerical = X[numerical_features]
-
-    results = []
-    if single_process:
-        results = [
-            select_feature_by_linear_correlation(
-                X_numerical, y, target_type, percentile, save_dir=fs_dir_target
-            ),
-            select_feature_by_nonlinear_correlation(
-                X_numerical, y, target_type, percentile, save_dir=fs_dir_target
-            ),
-            select_feature_by_mi(
-                X_numerical, y, target_type, percentile, save_dir=fs_dir_target
-            ),
-            select_feature_by_feat_imp(
-                X_numerical, y, target_type, percentile, save_dir=fs_dir_target
-            ),
-            select_feature_by_rfe(
-                X_numerical, y, target_type, percentile, save_dir=fs_dir_target
-            ),
-            # select_feature_by_sfs(
-            #     X_numerical, y, target_type, percentile, save_dir=fs_dir_target
-            # ), # TODO: this is taking too long
+        y_not_scaled = df[
+            df.columns.intersection([f"TARGET_{i}" for i in self.target_clf])
        ]
-    else:
-        # Use ProcessPoolExecutor to run tasks in parallel
-        with ProcessPoolExecutor() as executor:
-            # Submit different functions to be executed in parallel
-            futures = [
-                executor.submit(
-                    select_feature_by_linear_correlation,
-                    X_numerical,
-                    y,
-                    target_type,
-                    percentile,
-                    save_dir=fs_dir_target,
-                ),
-                executor.submit(
-                    select_feature_by_nonlinear_correlation,
-                    X_numerical,
-                    y,
-                    target_type,
-                    percentile,
-                    save_dir=fs_dir_target,
-                ),
-                executor.submit(
-                    select_feature_by_mi,
-                    X_numerical,
-                    y,
-                    target_type,
-                    percentile,
-                    save_dir=fs_dir_target,
-                ),
-                executor.submit(
-                    select_feature_by_feat_imp,
-                    X_numerical,
-                    y,
-                    target_type,
-                    percentile,
-                    save_dir=fs_dir_target,
-                ),
-                executor.submit(
-                    select_feature_by_rfe,
-                    X_numerical,
-                    y,
-                    target_type,
-                    percentile,
-                    save_dir=fs_dir_target,
-                ),
-                executor.submit(
-                    select_feature_by_sfs,
-                    X_numerical,
-                    y,
-                    target_type,
-                    percentile,
-                    save_dir=fs_dir_target,
-                ),
-            ]
-
-            # Wait for all futures to complete and gather the results
-            with tqdm(total=len(futures)) as pbar:
-                for future in as_completed(futures):
-                    results.append(future.result())
-                    pbar.update(1)
-    logger.info(f"Finished feature selection for target {target_number}")
-
-    stop = time.time()
-
-    # Once all tasks are completed, start by inserting results to db
-    feat_scores = pd.concat(
-        results,
-        axis=0,
-    )
-
-    logger.info("Inserting feature selection results to db...")
-    rows = []
-
-    with get_db() as db:
-        feature_map = {f.name: f.id for f in Feature.get_all(db=db, limit=20000)}
-        for row in feat_scores.itertuples(index=False):
-            feature_id = feature_map.get(row.features)
-            if not feature_id:
-                continue  # or raise if feature must exist
-
-            rows.append(
-                {
-                    "feature_selection_id": feature_selection.id,
-                    "feature_id": feature_id,
-                    "method": row.method,
-                    "score": row.score,
-                    "pvalue": None if pd.isna(row.pvalue) else row.pvalue,
-                    "support": row.support,
-                    "rank": row.rank,
-                    "training_time": row.training_time,
-                }
-            )
-
-        if len(rows) == 0:
-            raise ValueError(f"No features selected for TARGET_{target_number}")
-
-        FeatureSelectionRank.bulk_upsert(rows=rows, db=db)
-
-    # Merge the results
-    features_selected = feat_scores[feat_scores["support"] == True][
-        ["features", "rank"]
-    ]
-    features_selected.sort_values("rank", inplace=True)
-    features_selected.drop_duplicates("features", inplace=True)

-
+        # Ensure the final DataFrame keeps the original order
+        df_scaled = pd.concat(
+            [X_scaled, y_scaled, y_not_scaled],
+            axis=1,
+        )[
+            df.columns
+        ]  # Reorder columns to match original `df`
+
+        if not df_scaled.columns.equals(df.columns):
+            raise Exception("Columns are not in the same order after scaling.")
+
+        return df_scaled, scaler_x, scalers_y
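
`scale_data` now has a fit-or-reuse contract: when `scaler_x`/`scalers_y` are passed in, the method only transforms; otherwise it fits new StandardScalers, persists them with joblib under `preprocessing_dir`, and returns them so validation, test, and later inference use the same parameters, while classification targets are left unscaled. A toy round trip of that contract follows; the frame and target layout are invented for illustration.

# Toy illustration of the fit-then-reuse scaling contract from the hunk above.
# Only the pattern is taken from the diff; the data and column names are made up.
import pandas as pd
from sklearn.preprocessing import StandardScaler

train = pd.DataFrame({"F_1": [1.0, 2.0, 3.0], "TARGET_1": [10.0, 20.0, 30.0]})
val = pd.DataFrame({"F_1": [4.0, 5.0], "TARGET_1": [40.0, 50.0]})

# First pass (no scalers supplied): fit on train and keep the fitted objects.
scaler_x = StandardScaler().fit(train[["F_1"]])
scalers_y = {"scaler_y_1": StandardScaler().fit(train[["TARGET_1"]])}

# Second pass (scalers supplied): transform only, never refit on val/test.
val_scaled = val.copy()
val_scaled[["F_1"]] = scaler_x.transform(val[["F_1"]])
val_scaled[["TARGET_1"]] = scalers_y["scaler_y_1"].transform(val[["TARGET_1"]])
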
+
+    # Reshape into 3D tensors for recurrent models
+    def reshape_time_series(
+        self,
+        train: pd.DataFrame,
+        val: pd.DataFrame,
+        test: pd.DataFrame,
+        features: list,
+        timesteps: int = 120,
+    ):
+        # always scale for recurrent layers : train should be scaled
+        group_column = self.group_column
+
+        target_columns = train.columns.intersection(
+            [f"TARGET_{i}" for i in self.target_numbers]
+        )

-
-    # features_selected = list(dict.fromkeys(features_selected_by_mi + features_selected_by_nonlinear_correlation + features_selected_by_linear_correlation))
-    features_selected_by_every_methods = set(results[0]["features"].values.tolist())
+        data = pd.concat([train, val, test], axis=0)

-
-
-            df["features"].values.tolist()
-        )  # intersection
+        def reshape_df(df: pd.DataFrame, group_series: pd.Series, timesteps: int):
+            fill_value = [[[0] * len(df.columns)]]

-
+            def shiftsum(x, timesteps: int):
+                tmp = x.copy()
+                for i in range(1, timesteps):
+                    tmp = x.shift(i, fill_value=fill_value) + tmp
+                return tmp

-
-
-
-    logger.debug(features_selected_by_every_methods)
+            logger.info("Grouping each feature in a unique column with list...")
+            df_reshaped = df.apply(list, axis=1).apply(lambda x: [list(x)])
+            df_reshaped = pd.concat([df_reshaped, group_series], axis=1)

-
-
-
-
-
-
-
-
-    )
-    pd.Series(features).to_csv(
-        f"{fs_dir_target}/features_before_max.csv",
-        index=True,
-        header=True,
-        index_label="ID",
-    )
-    features = features[:max_features]
+            logger.info("Grouping method stock and creating timesteps...")
+            df_reshaped = (
+                df_reshaped.groupby(group_column)[0]
+                .apply(lambda x: shiftsum(x, timesteps))
+                .reset_index(group_column, drop=True)
+                .rename("RECURRENT_FEATURES")
+            )
+            df_reshaped = pd.DataFrame(df_reshaped)

-
-    logger.debug(
-        f"Final pre-selection: {len(features)} features below {corr_threshold}% out of {len(features_selected_list)} features, and rejected {len(features_correlated)} features, {100*len(features)/len(features_selected_list):.2f}% features selected"
-    )
+            return df_reshaped

-
-        set(features) & set(features_selected_by_every_methods)
-    )
-    logger.debug(
-        f"In this pre-selection, there is {len(features_selected_by_every_methods_uncorrelated)} features from the {len(features_selected_by_every_methods)} selected unanimously\n"
-    )
+        data_reshaped = reshape_df(data[features], data[group_column], timesteps)

-
-        features_selected[features_selected["features"].isin(features)].to_markdown()
-    )
+        data_reshaped[target_columns] = data[target_columns]

-
-
-
-
-    joblib.dump(features, best_features_path)
+        logger.info("Separating train, val, test data and creating np arrays...")
+        train_reshaped = data_reshaped.loc[train.index]
+        val_reshaped = data_reshaped.loc[val.index]
+        test_reshaped = data_reshaped.loc[test.index]

-
-
-
-
-
-
-
-
-
-
-
-
-
+        x_train_reshaped = np.array(
+            train_reshaped["RECURRENT_FEATURES"].values.tolist()
+        )
+        y_train_reshaped = np.array(train_reshaped[target_columns].reset_index())
+        x_val_reshaped = np.array(val_reshaped["RECURRENT_FEATURES"].values.tolist())
+        y_val_reshaped = np.array(val_reshaped[target_columns].reset_index())
+        x_test_reshaped = np.array(test_reshaped["RECURRENT_FEATURES"].values.tolist())
+        y_test_reshaped = np.array(test_reshaped[target_columns].reset_index())
+
+        reshaped_data = {
+            "x_train_reshaped": x_train_reshaped,
+            "y_train_reshaped": y_train_reshaped,
+            "x_val_reshaped": x_val_reshaped,
+            "y_val_reshaped": y_val_reshaped,
+            "x_test_reshaped": x_test_reshaped,
+            "y_test_reshaped": y_test_reshaped,
+        }

-
+        return reshaped_data
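
The windowing in `reshape_df` relies on an object-dtype trick: each row is wrapped as a one-element list containing its feature vector, and `shiftsum` adds shifted copies of that Series, which concatenates the lists so every cell ends up holding the previous `timesteps` feature vectors (zero-padded at the start of each group). The result converts cleanly to a `(n_samples, timesteps, n_features)` array. A tiny synthetic demonstration of the same idea follows; it fills shifted-in cells explicitly instead of passing `fill_value` to `shift`.

# Synthetic demo of the shift-and-add windowing used by reshape_df/shiftsum above.
# Data is invented; NaNs introduced by shift() are replaced by zero vectors explicitly.
import numpy as np
import pandas as pd

df = pd.DataFrame({"f1": [1, 2, 3], "f2": [10, 20, 30]})
wrapped = df.apply(list, axis=1).apply(lambda x: [list(x)])  # each cell: [[f1, f2]]

timesteps = 2
zero_step = [[0] * len(df.columns)]  # padding for rows with missing history
windows = wrapped.copy()
for i in range(1, timesteps):
    shifted = wrapped.shift(i).apply(lambda v: v if isinstance(v, list) else zero_step)
    windows = shifted + windows  # list + list concatenates, so older steps come first

x = np.array(windows.tolist())
print(x.shape)  # (3, 2, 2): samples x timesteps x features
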


+# utils
# TODO : can we use this to select the ideal number of features ?
def feature_selection_analysis(feature_selection_id: int, n_components: int = 5):

@@ -1072,158 +1129,18 @@ def feature_selection_analysis(feature_selection_id: int, n_components: int = 5)
    plt.show()


-
-
-
-
-
-
-
-
-
-            scaler_x.transform(X), columns=list(X.columns), index=X.index
-        )
-    else:
-        scaler_x = StandardScaler()  # MinMaxScaler(feature_range=(-1,1))
-        X_scaled = pd.DataFrame(
-            scaler_x.fit_transform(X), columns=list(X.columns), index=X.index
-        )
-        if PYTHON_ENV != "Test":
-            joblib.dump(scaler_x, f"{save_dir}/scaler_x.pkl")
-
-    # Determine which targets need to be scaled
-    targets_numbers_to_scale = [i for i in TARGETS_NUMBER if i not in TARGETS_CLF]
-
-    # Dictionary to store scaled target data
-    scaled_targets = {}
-
-    if scalers_y:
-        for target_number in targets_numbers_to_scale:
-            y = df[[f"TARGET_{target_number}"]]
-            scaled_targets[target_number] = pd.DataFrame(
-                scalers_y[f"scaler_y_{target_number}"].transform(y.values),
-                columns=y.columns,
-                index=y.index,
-            )
-    else:
-        scalers_y = {}
-        for target_number in targets_numbers_to_scale:
-            scaler_y = StandardScaler()
-            y = df[[f"TARGET_{target_number}"]]
-
-            scaled_y = pd.DataFrame(
-                scaler_y.fit_transform(y.values),
-                columns=y.columns,
-                index=y.index,
-            )
-            if PYTHON_ENV != "Test":
-                joblib.dump(scaler_y, f"{save_dir}/scaler_y_{target_number}.pkl")
-
-            scalers_y[f"scaler_y_{target_number}"] = scaler_y
-            scaled_targets[target_number] = scaled_y
-
-    # Reconstruct y_scaled in the original order
-    y_scaled = pd.concat(
-        [scaled_targets[target_number] for target_number in targets_numbers_to_scale],
-        axis=1,
-    )
-    y_not_scaled = df[df.columns.intersection([f"TARGET_{i}" for i in TARGETS_CLF])]
-
-    # Ensure the final DataFrame keeps the original order
-    df_scaled = pd.concat(
-        [X_scaled, y_scaled, y_not_scaled],
-        axis=1,
-    )[
-        df.columns
-    ]  # Reorder columns to match original `df`
-
-    if not df_scaled.columns.equals(df.columns):
-        raise Exception("Columns are not in the same order after scaling.")
-
-    return df_scaled, scaler_x, scalers_y
-
-
-# Reshape into 3D tensors for recurrent models
-def reshape_time_series(
-    train: pd.DataFrame,
-    val: pd.DataFrame,
-    test: pd.DataFrame,
-    features: list,
-    timesteps: int = 120,
-):
-    # always scale for recurrent layers : train should be scaled
-
-    target_columns = train.columns.intersection([f"TARGET_{i}" for i in TARGETS_NUMBER])
-
-    data = pd.concat([train, val, test], axis=0)
-
-    data_reshaped = reshape_df(data[features], data[GROUPING_COLUMN], timesteps)
-
-    data_reshaped[target_columns] = data[target_columns]
-
-    logger.info("Separating train, val, test data and creating np arrays...")
-    train_reshaped = data_reshaped.loc[train.index]
-    val_reshaped = data_reshaped.loc[val.index]
-    test_reshaped = data_reshaped.loc[test.index]
-
-    x_train_reshaped = np.array(train_reshaped["RECURRENT_FEATURES"].values.tolist())
-    y_train_reshaped = np.array(train_reshaped[target_columns].reset_index())
-    x_val_reshaped = np.array(val_reshaped["RECURRENT_FEATURES"].values.tolist())
-    y_val_reshaped = np.array(val_reshaped[target_columns].reset_index())
-    x_test_reshaped = np.array(test_reshaped["RECURRENT_FEATURES"].values.tolist())
-    y_test_reshaped = np.array(test_reshaped[target_columns].reset_index())
-
-    reshaped_data = {
-        "x_train_reshaped": x_train_reshaped,
-        "y_train_reshaped": y_train_reshaped,
-        "x_val_reshaped": x_val_reshaped,
-        "y_val_reshaped": y_val_reshaped,
-        "x_test_reshaped": x_test_reshaped,
-        "y_test_reshaped": y_test_reshaped,
-    }
-
-    return reshaped_data
-
-
-def reshape_df(df: pd.DataFrame, stock_column: pd.DataFrame, timesteps: int):
-    fill_value = [[[0] * len(df.columns)]]
-
-    def shiftsum(x, timesteps: int):
-        tmp = x.copy()
-        for i in range(1, timesteps):
-            tmp = x.shift(i, fill_value=fill_value) + tmp
-        return tmp
-
-    logger.info("Grouping each feature in a unique column with list...")
-    df_reshaped = df.apply(list, axis=1).apply(lambda x: [list(x)])
-    df_reshaped = pd.concat([df_reshaped, stock_column], axis=1)
-
-    logger.info("Grouping method stock and creating timesteps...")
-    df_reshaped = (
-        df_reshaped.groupby(GROUPING_COLUMN)[0]
-        .apply(lambda x: shiftsum(x, timesteps))
-        .reset_index(GROUPING_COLUMN, drop=True)
-        .rename("RECURRENT_FEATURES")
-    )
-    df_reshaped = pd.DataFrame(df_reshaped)
-
-    return df_reshaped
-
-
-def load_train_data(dataset_dir, target_number, target_type="regression"):
-    train_data_dir = f"{dataset_dir}/data"
-    preprocessing_dir = f"{dataset_dir}/preprocessing"
-
-    _scaler_y = (
-        joblib.load(f"{preprocessing_dir}/scaler_y_{target_number}.pkl")
-        if target_type == "regression"
-        else None
-    )
+def get_features_by_types(df: pd.DataFrame, sample_categorical_threshold: int = 15):
+    categorical_features = [
+        col
+        for col in df.columns
+        if df[col].nunique() <= sample_categorical_threshold
+        and df[col].dtype in ["int64", "Int64"]
+    ]
+    df_categorical = df[categorical_features]
+    logger.info(f"Number of categorical features: {len(categorical_features)}")

-
-
-
-    train_scaled = joblib.load(f"{train_data_dir}/train_scaled.pkl")
-    val_scaled = joblib.load(f"{train_data_dir}/val_scaled.pkl")
+    numerical_features = list(set(df.columns).difference(set(categorical_features)))
+    df_numerical = df[numerical_features]
+    logger.info(f"Number of numerical features: {len(numerical_features)}")

-    return
+    return df_categorical, df_numerical
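
The new module-level helper `get_features_by_types` uses a simple heuristic: integer-typed columns with at most `sample_categorical_threshold` distinct values are treated as categorical, everything else as numerical. A small usage example on a synthetic frame follows; the data is invented, and the import path follows from this file being lecrapaud/feature_selection.py.

# Example call to the new helper on made-up data; note that an int64 column with
# few distinct values (VOLUME here) is classified as categorical by the heuristic.
import pandas as pd
from lecrapaud.feature_selection import get_features_by_types

df = pd.DataFrame(
    {
        "SECTOR_CODE": [1, 2, 2, 3],       # int64, 3 unique values -> categorical
        "PRICE": [10.5, 11.0, 9.8, 10.1],  # float64 -> numerical
        "VOLUME": [100, 250, 300, 400],    # int64, 4 unique values -> categorical
    }
)
df_categorical, df_numerical = get_features_by_types(df)
print(list(df_categorical.columns))  # ['SECTOR_CODE', 'VOLUME']
print(list(df_numerical.columns))    # ['PRICE'] (order may vary; built from a set)
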