lecrapaud 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lecrapaud might be problematic.

Files changed (42)
  1. lecrapaud/__init__.py +1 -0
  2. lecrapaud/api.py +277 -0
  3. lecrapaud/config.py +10 -0
  4. lecrapaud/db/__init__.py +1 -0
  5. lecrapaud/db/alembic/env.py +2 -2
  6. lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +24 -12
  7. lecrapaud/db/alembic/versions/2025_06_17_1652-c45f5e49fa2c_make_fields_nullable.py +89 -0
  8. lecrapaud/db/alembic.ini +116 -0
  9. lecrapaud/db/models/__init__.py +10 -10
  10. lecrapaud/db/models/base.py +176 -1
  11. lecrapaud/db/models/dataset.py +25 -20
  12. lecrapaud/db/models/feature.py +5 -6
  13. lecrapaud/db/models/feature_selection.py +3 -4
  14. lecrapaud/db/models/feature_selection_rank.py +3 -4
  15. lecrapaud/db/models/model.py +3 -4
  16. lecrapaud/db/models/model_selection.py +15 -8
  17. lecrapaud/db/models/model_training.py +15 -7
  18. lecrapaud/db/models/score.py +9 -6
  19. lecrapaud/db/models/target.py +16 -8
  20. lecrapaud/db/session.py +66 -0
  21. lecrapaud/experiment.py +64 -0
  22. lecrapaud/feature_engineering.py +747 -1022
  23. lecrapaud/feature_selection.py +915 -998
  24. lecrapaud/integrations/openai_integration.py +225 -0
  25. lecrapaud/jobs/__init__.py +2 -2
  26. lecrapaud/jobs/config.py +1 -1
  27. lecrapaud/jobs/scheduler.py +1 -1
  28. lecrapaud/jobs/tasks.py +6 -6
  29. lecrapaud/model_selection.py +1060 -960
  30. lecrapaud/search_space.py +4 -0
  31. lecrapaud/utils.py +2 -2
  32. lecrapaud-0.4.1.dist-info/METADATA +171 -0
  33. {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.1.dist-info}/RECORD +36 -35
  34. {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.1.dist-info}/WHEEL +1 -1
  35. lecrapaud/db/crud.py +0 -179
  36. lecrapaud/db/services.py +0 -0
  37. lecrapaud/db/setup.py +0 -58
  38. lecrapaud/predictions.py +0 -292
  39. lecrapaud/training.py +0 -151
  40. lecrapaud-0.4.0.dist-info/METADATA +0 -103
  41. /lecrapaud/{directory_management.py → directories.py} +0 -0
  42. {lecrapaud-0.4.0.dist-info → lecrapaud-0.4.1.dist-info}/LICENSE +0 -0
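The renames above (the internal src.* modules becoming the lecrapaud package, directory_management.py becoming directories.py, and db/setup.py giving way to db/session.py) are reflected in the import block of the hunk below. A minimal before/after sketch of that import migration, taken from the hunk itself; whether these modules are intended as public entry points is an assumption:

# 0.4.0 (old internal layout, removed in the hunk below)
# from src.directory_management import tmp_dir, clean_directory
# from src.utils import logger
# from src.config import PYTHON_ENV
# from src.db.setup import get_db

# 0.4.1 (new layout, added in the hunk below)
from lecrapaud.directories import tmp_dir, clean_directory
from lecrapaud.utils import logger
from lecrapaud.config import PYTHON_ENV
from lecrapaud.db import Dataset, Target, Feature, FeatureSelection, FeatureSelectionRank
from lecrapaud.db.session import get_db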
@@ -33,970 +33,1027 @@ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
33
33
  from sklearn.model_selection import TimeSeriesSplit
34
34
  from sklearn.metrics import root_mean_squared_error, log_loss, make_scorer
35
35
  from mlxtend.feature_selection import SequentialFeatureSelector
36
- from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
37
- from sklearn.compose import ColumnTransformer
38
- import category_encoders as ce
39
- from scipy.stats import spearmanr, kendalltau
40
-
41
- # Scaling
42
36
  from sklearn.preprocessing import StandardScaler, MinMaxScaler
37
+ from scipy.stats import spearmanr, kendalltau
43
38
 
44
39
  # Internal
45
- from src.directory_management import tmp_dir, clean_directory
46
- from src.utils import logger
47
- from src.config import PYTHON_ENV
48
- from src.db.models import (
40
+ from lecrapaud.directories import tmp_dir, clean_directory
41
+ from lecrapaud.utils import logger
42
+ from lecrapaud.config import PYTHON_ENV
43
+ from lecrapaud.db import (
49
44
  Dataset,
50
45
  Target,
51
46
  Feature,
52
47
  FeatureSelection,
53
48
  FeatureSelectionRank,
54
49
  )
55
- from src.db.setup import get_db
56
-
57
- # Variables for targets handling
58
- TARGETS_NUMBER = range(1, 15)
59
- TARGETS_CLF = [2, 4, 6, 8, 9, 10, 11]
60
- TARGETS_MCLF = [11]
61
- GROUPING_COLUMN = "STOCK"
62
- DATE_COLUMN = "DATE"
50
+ from lecrapaud.db.session import get_db
51
+ from lecrapaud.search_space import all_models
63
52
 
64
53
  # Annoying Warnings
65
54
  warnings.filterwarnings("ignore", category=FutureWarning)
66
55
 
67
56
 
68
- def get_dataset_name(
69
- df, corr_threshold: int = 80, percentile: int = 20, max_features: int = 20
70
- ):
71
- number_of_groups = df[GROUPING_COLUMN].nunique()
72
-
73
- # Try to convert DATE column to datetime safely
74
- if pd.api.types.is_integer_dtype(df[DATE_COLUMN]):
75
- df_date = df[DATE_COLUMN].map(pd.Timestamp.fromordinal)
76
- else:
77
- df_date = pd.to_datetime(
78
- df[DATE_COLUMN], errors="coerce"
79
- ) # convert strings, datetime, etc.
80
-
81
- name = f"data_{number_of_groups}_{corr_threshold}_{percentile}_{max_features}_{df_date.min().date()}_{df_date.max().date()}"
82
- if PYTHON_ENV == "Test":
83
- name = f"test_{name}"
84
- return name
85
-
86
-
87
- def create_sets_from_data(
88
- df: pd.DataFrame,
89
- corr_threshold: int = 80,
90
- percentile: int = 20,
91
- max_features: int = 20,
92
- ):
93
-
94
- df.sort_values([DATE_COLUMN, GROUPING_COLUMN], inplace=True)
95
-
96
- # Drop non-useful column for training
97
- if "ISIN" in df.columns:
98
- df.drop(labels=["ISIN"], axis=1, inplace=True)
99
- if "SECURITY" in df.columns:
100
- df.drop(labels=["SECURITY"], axis=1, inplace=True)
57
+ def load_train_data(dataset_dir, target_number, target_type="regression"):
58
+ data_dir = f"{dataset_dir}/data"
101
59
 
102
- dates = df[DATE_COLUMN].unique()
60
+ logger.info("Loading data...")
61
+ train = joblib.load(f"{data_dir}/train.pkl")
62
+ val = joblib.load(f"{data_dir}/val.pkl")
63
+ test = joblib.load(f"{data_dir}/test.pkl")
64
+ try:
65
+ train_scaled = joblib.load(f"{data_dir}/train_scaled.pkl")
66
+ val_scaled = joblib.load(f"{data_dir}/val_scaled.pkl")
67
+ test_scaled = joblib.load(f"{data_dir}/test_scaled.pkl")
68
+ except FileNotFoundError:
69
+ train_scaled = None
70
+ val_scaled = None
71
+ test_scaled = None
72
+
73
+ return train, val, test, train_scaled, val_scaled, test_scaled
74
+
75
+
76
+ class FeatureSelectionEngine:
77
+ def __init__(self, train, dataset, target_number, target_clf, **kwargs):
78
+ self.dataset = dataset
79
+ self.train = train
80
+ self.target_number = target_number
81
+ self.target_clf = target_clf
82
+
83
+ self.target_type = (
84
+ "classification" if self.target_number in self.target_clf else "regression"
85
+ )
86
+ self.percentile = self.dataset.percentile
87
+ self.corr_threshold = self.dataset.corr_threshold
88
+ self.max_features = self.dataset.max_features
89
+
90
+ self.dataset_dir = self.dataset.path
91
+ self.dataset_id = self.dataset.id
92
+ self.data_dir = f"{self.dataset_dir}/data"
93
+ self.preprocessing_dir = f"{self.dataset_dir}/preprocessing"
94
+ self.fs_dir_target = (
95
+ f"{self.dataset_dir}/{f"TARGET_{self.target_number}"}/feature_selection"
96
+ )
97
+ os.makedirs(self.fs_dir_target, exist_ok=True)
98
+
99
+ # Main feature selection function
100
+ def run(
101
+ self,
102
+ single_process: bool = True,
103
+ ):
104
+ """Function to do feature selection with a range of different feature selection technics
105
+
106
+ Args:
107
+ - train (pd.DataFrame): a pandas train set
108
+ - target_number (in): a target, targets need to be name ``TARGET_{n}```
109
+ - single_process (bool): if True, run all feature selection methods in a single process. If False, run them in parallel.
110
+ """
111
+ target_number = self.target_number
112
+ target_type = self.target_type
113
+ fs_dir_target = self.fs_dir_target
114
+
115
+ # Create the feature selection in db
116
+ target = Target.find_by(name=f"TARGET_{target_number}")
117
+ percentile = self.percentile
118
+ corr_threshold = self.corr_threshold
119
+ max_features = self.max_features
120
+
121
+ feature_selection = FeatureSelection.upsert(
122
+ match_fields=["target_id", "dataset_id"],
123
+ target_id=target.id,
124
+ dataset_id=self.dataset_id,
125
+ )
103
126
 
104
- val_first_id = int(len(dates) * 0.6) + 1
105
- test_first_id = int(len(dates) * 0.8) + 1
127
+ if feature_selection.best_features_path:
128
+ return joblib.load(feature_selection.best_features_path)
106
129
 
107
- train = df[df[DATE_COLUMN].isin(dates[:val_first_id])]
108
- val = df[df[DATE_COLUMN].isin(dates[val_first_id:test_first_id])]
109
- test = df[df[DATE_COLUMN].isin(dates[test_first_id:])]
130
+ self.X = self.train.loc[:, ~self.train.columns.str.contains("^TARGET_")]
131
+ self.y = self.train[f"TARGET_{target_number}"]
110
132
 
111
- dates = {}
112
- dates["start_date"] = pd.to_datetime(df[DATE_COLUMN].iat[0])
113
- dates["end_date"] = pd.to_datetime(df[DATE_COLUMN].iat[-1])
114
- for name, data in zip(["train", "val", "test"], [train, val, test]):
115
- dates[f"{name}_start_date"] = pd.to_datetime(data[DATE_COLUMN].iat[0])
116
- dates[f"{name}_end_date"] = pd.to_datetime(data[DATE_COLUMN].iat[-1])
133
+ logger.info(f"Starting feature selection for TARGET_{target_number}...")
134
+ clean_directory(self.fs_dir_target)
117
135
 
118
- logger.info(
119
- f"{len(data['DATE'])} {name} data from {dates[f"{name}_start_date"].strftime('%d/%m/%Y')} to {dates[f"{name}_end_date"].strftime('%d/%m/%Y')}"
136
+ # Let's start by removing extremly correlated features
137
+ # This is needed to reduce nb of feature but also for methods such as anova or chi2 that requires independent features
138
+ # TODO: we could also remove low variance features
139
+ features_uncorrelated, features_correlated = self.remove_correlated_features(
140
+ 90, vizualize=False
120
141
  )
121
-
122
- datasets = {}
123
-
124
- with get_db() as db:
125
- all_targets = Target.get_all(db=db)
126
- matched_targets = [
127
- target for target in all_targets if target.name in train.columns
128
- ]
129
- dataset_name = get_dataset_name(train, corr_threshold, percentile, max_features)
130
- dataset_dir = f"{tmp_dir}/{dataset_name}"
131
- preprocessing_dir = f"{dataset_dir}/preprocessing"
132
- train_data_dir = f"{dataset_dir}/data"
133
- os.makedirs(dataset_dir, exist_ok=True)
134
- os.makedirs(preprocessing_dir, exist_ok=True)
135
- os.makedirs(train_data_dir, exist_ok=True)
136
-
137
- dataset = datasets[name] = Dataset.upsert(
138
- match_fields=["name"],
139
- db=db,
140
- name=dataset_name,
141
- path=Path(dataset_dir).resolve(),
142
- type="training",
143
- size=df.shape[0],
144
- train_size=train.shape[0],
145
- val_size=val.shape[0],
146
- test_size=test.shape[0],
147
- number_of_groups=data[GROUPING_COLUMN].nunique(),
148
- list_of_groups=data[GROUPING_COLUMN].unique().tolist(),
149
- corr_threshold=corr_threshold,
150
- percentile=percentile,
151
- max_features=max_features,
152
- **dates,
153
- targets=matched_targets,
142
+ self.X = self.X[features_uncorrelated]
143
+
144
+ logger.debug(
145
+ f"""
146
+ \nWe first have removed {len(features_correlated)} features with correlation greater than 90%
147
+ \nWe are looking to capture {percentile}% of {len(self.X.columns)} features, i.e. {int(len(self.X.columns)*percentile/100)} features, with different feature selection methods
148
+ \nWe will then remove above {corr_threshold}% correlated features, keeping the one with the best ranks
149
+ \nFinally, we will keep only the {max_features} best ranked features
150
+ """
154
151
  )
155
152
 
156
- # encode categoricals
157
- train = encode_categorical_features(train, fit=True, save_dir=preprocessing_dir)
158
- val = encode_categorical_features(val, save_dir=preprocessing_dir)
159
- test = encode_categorical_features(test, save_dir=preprocessing_dir)
160
-
161
- # save the full data
162
- if PYTHON_ENV != "Test":
163
- joblib.dump(df, f"{train_data_dir}/full.pkl")
153
+ start = time.time()
164
154
 
165
- return train, val, test, dataset
155
+ # handling categorical features (only if classification)
156
+ self.X_categorical, self.X_numerical = get_features_by_types(self.X)
166
157
 
158
+ if target_type == "classification" and self.X_categorical.shape[1] > 0:
159
+ feat_scores = self.select_categorical_features(
160
+ percentile=percentile, save_dir=fs_dir_target
161
+ )
162
+ with get_db() as db:
163
+ for row in feat_scores.itertuples(index=False):
164
+ feature = Feature.find_by(name=row.features, db=db)
165
+ FeatureSelectionRank.upsert(
166
+ ["feature_selection_id", "feature_id", "method"],
167
+ db=db,
168
+ score=row.score,
169
+ pvalue=row.pvalue,
170
+ support=row.support,
171
+ rank=row.rank,
172
+ method=row.method,
173
+ training_time=row.training_time,
174
+ feature_selection_id=feature_selection.id,
175
+ feature_id=feature.id,
176
+ )
177
+ categorical_features_selected = feat_scores[feat_scores["support"]][
178
+ "features"
179
+ ].values.tolist()
180
+
181
+ results = []
182
+ params = {"percentile": percentile, "save_dir": fs_dir_target}
183
+ if single_process:
184
+ results = [
185
+ self.select_feature_by_linear_correlation(**params),
186
+ self.select_feature_by_nonlinear_correlation(**params),
187
+ self.select_feature_by_mi(**params),
188
+ self.select_feature_by_feat_imp(**params),
189
+ self.select_feature_by_rfe(**params),
190
+ # self.select_feature_by_sfs(
191
+ # **params
192
+ # ), # TODO: this is taking too long
193
+ ]
194
+ else:
195
+ # Use ProcessPoolExecutor to run tasks in parallel
196
+ # TODO: not sure it's efficient from previous tests... especially because rfe and sfs methods are doing parallel processing already, this can create overhead
197
+ with ProcessPoolExecutor() as executor:
198
+ # Submit different functions to be executed in parallel
199
+ futures = [
200
+ executor.submit(
201
+ self.select_feature_by_linear_correlation,
202
+ **params,
203
+ ),
204
+ executor.submit(
205
+ self.select_feature_by_nonlinear_correlation,
206
+ **params,
207
+ ),
208
+ executor.submit(
209
+ self.select_feature_by_mi,
210
+ **params,
211
+ ),
212
+ executor.submit(
213
+ self.select_feature_by_feat_imp,
214
+ **params,
215
+ ),
216
+ executor.submit(
217
+ self.select_feature_by_rfe,
218
+ **params,
219
+ ),
220
+ # executor.submit(
221
+ # self.select_feature_by_sfs,
222
+ # **params,
223
+ # ), # TODO: this is taking too long
224
+ ]
225
+
226
+ # Wait for all futures to complete and gather the results
227
+ with tqdm(total=len(futures)) as pbar:
228
+ for future in as_completed(futures):
229
+ results.append(future.result())
230
+ pbar.update(1)
231
+
232
+ logger.info(f"Finished feature selection for target {target_number}")
233
+
234
+ stop = time.time()
235
+
236
+ # Once all tasks are completed, start by inserting results to db
237
+ feat_scores = pd.concat(
238
+ results,
239
+ axis=0,
240
+ )
167
241
 
168
- def encode_categorical_features(df: pd.DataFrame, save_dir: str, fit: bool = False):
242
+ logger.info("Inserting feature selection results to db...")
243
+ rows = []
244
+ with get_db() as db:
245
+ feature_map = {f.name: f.id for f in Feature.get_all(db=db, limit=20000)}
246
+ for row in feat_scores.itertuples(index=False):
247
+ feature_id = feature_map.get(row.features)
248
+ if not feature_id:
249
+ continue # or raise if feature must exist
250
+
251
+ rows.append(
252
+ {
253
+ "feature_selection_id": feature_selection.id,
254
+ "feature_id": feature_id,
255
+ "method": row.method,
256
+ "score": row.score,
257
+ "pvalue": None if pd.isna(row.pvalue) else row.pvalue,
258
+ "support": row.support,
259
+ "rank": row.rank,
260
+ "training_time": row.training_time,
261
+ }
262
+ )
169
263
 
170
- X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
171
- y = df.loc[:, df.columns.str.contains("^TARGET_")]
264
+ if len(rows) == 0:
265
+ raise ValueError(f"No features selected for TARGET_{target_number}")
172
266
 
173
- # 1. Timestamps for 'DATE'
174
- X.loc[:, DATE_COLUMN] = pd.to_datetime(X[DATE_COLUMN]).map(pd.Timestamp.toordinal)
267
+ FeatureSelectionRank.bulk_upsert(rows=rows, db=db)
175
268
 
176
- if fit:
177
- # Define columns for ordinal and binary encoding (we should have all possible values in training set, unless we accept unknown values processing)
178
- ordinal_encoding_features = ["STOCK"]
269
+ # Merge the results
270
+ logger.info("Merging feature selection methods...")
271
+ features_selected = feat_scores[feat_scores["support"]][["features", "rank"]]
272
+ features_selected.sort_values("rank", inplace=True)
273
+ features_selected.drop_duplicates("features", inplace=True)
179
274
 
180
- binary_encoding_features = ["SECTOR", "SUBINDUSTRY", "LOCATION"]
275
+ features_selected_list = features_selected["features"].values.tolist()
181
276
 
182
- # Fit and save the ColumnTransformer with OrdinalEncoder and OneHotEncoder
183
- column_transformer = ColumnTransformer(
184
- transformers=[
185
- (
186
- "ordinal",
187
- OrdinalEncoder(
188
- handle_unknown="use_encoded_value",
189
- unknown_value=-1, # rows with unseen STOCK values will be encoded as -1
190
- ),
191
- ordinal_encoding_features,
192
- ),
193
- (
194
- "binary_encoder",
195
- ce.BinaryEncoder(
196
- handle_unknown="value",
197
- ), # rows with unseen values will be encoded as all-zeros in the binary columns
198
- binary_encoding_features,
199
- ),
200
- ],
201
- remainder="passthrough", # Keep the non-encoded columns like 'DATE'
277
+ # analysis 1
278
+ features_selected_by_every_methods = set(results[0]["features"].values.tolist())
279
+ for df in results[1:]:
280
+ features_selected_by_every_methods &= set(
281
+ df["features"].values.tolist()
282
+ ) # intersection
283
+ features_selected_by_every_methods = list(features_selected_by_every_methods)
284
+ logger.debug(
285
+ f"We selected {len(features_selected_list)} features and {len(features_selected_by_every_methods)} were selected unanimously:"
286
+ )
287
+ logger.debug(features_selected_by_every_methods)
288
+ pd.Series(features_selected_list).to_csv(
289
+ f"{fs_dir_target}/features_before_corr.csv",
290
+ index=True,
291
+ header=True,
292
+ index_label="ID",
202
293
  )
203
- transformed_data = column_transformer.fit_transform(X)
204
- if PYTHON_ENV != "Test":
205
- joblib.dump(column_transformer, f"{save_dir}/column_transformer.pkl")
206
- else:
207
- # Load the ColumnTransformer and apply it
208
- column_transformer = joblib.load(f"{save_dir}/column_transformer.pkl")
209
-
210
- transformed_data = column_transformer.transform(X)
211
-
212
- # Convert to DataFrame for readability and return
213
- transformed_X = pd.DataFrame(
214
- transformed_data,
215
- columns=[
216
- feature.split("__")[1]
217
- for feature in column_transformer.get_feature_names_out()
218
- ],
219
- index=X.index,
220
- )
221
- transformed_X = transformed_X.apply(pd.to_numeric)
222
- for col in [
223
- feature.split("__")[1]
224
- for feature in column_transformer.get_feature_names_out()
225
- if "remainder" not in feature
226
- ] + [DATE_COLUMN]:
227
- transformed_X[col] = transformed_X[col].astype(int)
228
-
229
- # Insert features in db
230
- if fit:
231
- # TODO: in bulk
232
- for feature in transformed_X.columns:
233
- dtype = transformed_X[feature].dtype
234
- if pd.api.types.is_integer_dtype(dtype):
235
- feature_type = "categorical"
236
- elif pd.api.types.is_float_dtype(dtype):
237
- feature_type = "numerical"
238
- else:
239
- feature_type = "other"
240
- Feature.upsert(match_fields=["name"], name=feature, type=feature_type)
241
- for target in y.columns:
242
- type = (
243
- "classification"
244
- if int(target.split("_")[1]) in TARGETS_CLF
245
- else "regression"
246
- )
247
- # TODO: what about description here ?
248
- Target.upsert(match_fields=["name", "type"], name=target, type=type)
249
-
250
- return pd.concat([transformed_X, y], axis=1)
251
294
 
295
+ # removing correlated features
296
+ self.X = self.X[features_selected_list]
297
+ features, features_correlated = self.remove_correlated_features(corr_threshold)
298
+ pd.Series(features).to_csv(
299
+ f"{fs_dir_target}/features_before_max.csv",
300
+ index=True,
301
+ header=True,
302
+ index_label="ID",
303
+ )
304
+ features = features[:max_features]
252
305
 
253
- # only work with all features from feat eng in the right order (unused for now)
254
- def decode_categorical_features(df: pd.DataFrame, save_dir: str):
255
- X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
256
- y = df.loc[:, df.columns.str.contains("^TARGET_")]
257
- index = X.index
258
- original_dtypes = X.dtypes.to_dict()
306
+ # adding categorical features selected
307
+ features += (
308
+ categorical_features_selected if target_type == "classification" else []
309
+ )
310
+ logger.debug(
311
+ f"Final pre-selection: {len(features)} features below {corr_threshold}% out of {len(features_selected_list)} features, and rejected {len(features_correlated)} features, {100*len(features)/len(features_selected_list):.2f}% features selected"
312
+ )
259
313
 
260
- column_transformer = joblib.load(f"{save_dir}/column_transformer.pkl")
314
+ # analysis 2
315
+ features_selected_by_every_methods_uncorrelated = list(
316
+ set(features) & set(features_selected_by_every_methods)
317
+ )
318
+ logger.debug(
319
+ f"In this pre-selection, there is {len(features_selected_by_every_methods_uncorrelated)} features from the {len(features_selected_by_every_methods)} selected unanimously\n"
320
+ )
321
+ logger.debug(
322
+ features_selected[
323
+ features_selected["features"].isin(features)
324
+ ].to_markdown()
325
+ )
261
326
 
262
- X = X.to_numpy()
263
- arrays = []
264
- for name, indices in column_transformer.output_indices_.items():
265
- transformer = column_transformer.named_transformers_.get(name, None)
266
- arr = X[:, indices.start : indices.stop]
327
+ # save to path
328
+ best_features_path = Path(
329
+ f"{self.preprocessing_dir}/features_{target_number}.pkl"
330
+ ).resolve()
331
+ joblib.dump(features, best_features_path)
267
332
 
268
- if transformer in (None, "passthrough", "drop"):
269
- pass
333
+ # save in db
334
+ db_features = Feature.filter(name__in=features)
335
+ # Order matters: to keep the same order in db as in features, map features by name
336
+ feature_by_name = {f.name: f for f in db_features}
337
+ # Reorder them according to original `features` list
338
+ ordered_db_features = [
339
+ feature_by_name[name] for name in features if name in feature_by_name
340
+ ]
270
341
 
271
- else:
272
- arr = transformer.inverse_transform(arr)
342
+ feature_selection = FeatureSelection.get(feature_selection.id)
343
+ feature_selection = feature_selection.add_features(ordered_db_features)
344
+ feature_selection.training_time = stop - start
345
+ feature_selection.best_features_path = best_features_path
346
+ feature_selection.save()
273
347
 
274
- arrays.append(arr)
348
+ return features
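The run() method above caches its result: when feature_selection.best_features_path is already set it returns the joblib-dumped feature list, otherwise it ranks, persists, and returns the selected features. A hypothetical invocation sketch, assuming this file is lecrapaud/feature_selection.py and that a Dataset row with the relevant attributes (path, percentile, corr_threshold, max_features) already exists; the IDs and target lists are placeholders, not taken from the package docs:

# Hypothetical usage sketch (module path, IDs and target lists are assumptions)
from lecrapaud.db import Dataset
from lecrapaud.feature_selection import FeatureSelectionEngine, load_train_data

dataset = Dataset.get(1)  # existing dataset row with path/percentile/corr_threshold/max_features
train, val, test, *_ = load_train_data(dataset.path, target_number=1)
engine = FeatureSelectionEngine(train=train, dataset=dataset, target_number=1, target_clf=[2, 4])
best_features = engine.run(single_process=True)  # selected feature names, also cached on disk and in db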
275
349
 
276
- retarr = np.concatenate(arrays, axis=1)
350
+ # Remove correlation
351
+ # ------------------
277
352
 
278
- columns_ordinal = [
279
- feature.split("__")[1]
280
- for feature in column_transformer.get_feature_names_out()
281
- if feature.split("__")[0] == "ordinal"
282
- ]
283
- columns_binary_encoder = [
284
- feature.split("__")[1]
285
- for feature in column_transformer.get_feature_names_out()
286
- if feature.split("__")[0] == "binary_encoder"
287
- ]
288
- # Remove trailing "_number" using regex
289
- columns_binary_encoder = {
290
- re.sub(r"_\d+$", "", col) for col in columns_binary_encoder
291
- }
292
- columns_binary_encoder = list(columns_binary_encoder)
293
-
294
- columns_remainder = [
295
- feature.split("__")[1]
296
- for feature in column_transformer.get_feature_names_out()
297
- if feature.split("__")[0] == "remainder"
298
- ]
299
- columns = columns_ordinal + columns_binary_encoder + columns_remainder
300
- decoded_X = pd.DataFrame(
301
- retarr,
302
- columns=columns,
303
- index=index,
304
- )
353
+ def remove_correlated_features(self, corr_threshold: int, vizualize: bool = False):
354
+ X = self.X
355
+ features = X.columns
356
+ # Create correlation matrix, select upper triangle & remove features with correlation greater than threshold
357
+ corr_matrix = X[features].corr().abs()
305
358
 
306
- for col in decoded_X.columns:
307
- if col in columns_ordinal or col in columns_binary_encoder:
308
- decoded_X[col] = decoded_X[col].astype(str)
309
- elif col in original_dtypes:
310
- decoded_X[col] = decoded_X[col].astype(original_dtypes[col])
311
-
312
- # revert timestamps to dates
313
- decoded_X.loc[:, DATE_COLUMN] = decoded_X[DATE_COLUMN].map(pd.Timestamp.fromordinal)
314
-
315
- return pd.concat([decoded_X, y], axis=1)
316
-
317
-
318
- # Filter methods
319
- # ----------------
320
-
321
-
322
- # Linear correlation (Person's R for regression and ANOVA for classification)
323
- def select_feature_by_linear_correlation(
324
- X, y, target_type, percentile: int = 20, save_dir: Optional[str] = None
325
- ):
326
- start = time.time()
327
- test_type = "Person’s R" if target_type == "regression" else "ANOVA"
328
- logger.debug(f"Running {test_type}...")
329
-
330
- model = f_regression if target_type == "regression" else f_classif
331
- feat_selector = SelectPercentile(model, percentile=percentile).fit(X, y)
332
- feat_scores = pd.DataFrame()
333
- feat_scores["score"] = feat_selector.scores_
334
- feat_scores["pvalue"] = feat_selector.pvalues_
335
- feat_scores["support"] = feat_selector.get_support()
336
- feat_scores["features"] = X.columns
337
- feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
338
- feat_scores["method"] = test_type
339
- feat_scores.sort_values("rank", ascending=True, inplace=True)
340
- stop = time.time()
341
- training_time = timedelta(seconds=(stop - start)).total_seconds()
342
- feat_scores["training_time"] = training_time
343
-
344
- logger.debug(
345
- f"{test_type} evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
346
- )
359
+ upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
360
+ features_uncorrelated = [
361
+ column
362
+ for column in upper.columns
363
+ if all(upper[column].dropna() <= corr_threshold / 100)
364
+ ]
365
+ features_correlated = [
366
+ column
367
+ for column in upper.columns
368
+ if any(upper[column] > corr_threshold / 100)
369
+ ]
347
370
 
348
- feat_scores.to_csv(
349
- f"{save_dir}/{test_type}.csv",
350
- index=True,
351
- header=True,
352
- index_label="ID",
353
- )
371
+ if vizualize:
372
+ features_selected_visualization = (
373
+ X[features]
374
+ .corr()
375
+ .where(np.triu(np.ones(len(features)), k=1).astype(bool))
376
+ .fillna(0)
377
+ )
378
+ # Plot the heatmap
379
+ plt.figure(figsize=(10, 8))
380
+ sns.heatmap(
381
+ corr_matrix,
382
+ annot=True,
383
+ cmap="coolwarm",
384
+ center=0,
385
+ linewidths=1,
386
+ linecolor="black",
387
+ )
388
+ plt.title(f"Correlation Matrix")
389
+ plt.show()
390
+
391
+ logger.info(f"\n{features_selected_visualization.describe().to_string()}")
392
+ logger.info(f"\n{features_selected_visualization.to_string()}")
393
+ return features_uncorrelated, features_correlated
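remove_correlated_features above keeps only the upper triangle of the absolute correlation matrix, so for each pair of columns whose correlation exceeds the threshold, the later column is dropped. A standalone sketch of the same filter on toy data (the frame and column names are made up for illustration):

# Minimal sketch of the upper-triangle correlation filter used above (toy data, hypothetical names)
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 4, 6, 8], "c": [4, 1, 3, 2]})  # "b" duplicates "a"
corr = df.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
threshold = 0.9
kept = [c for c in upper.columns if all(upper[c].dropna() <= threshold)]
dropped = [c for c in upper.columns if any(upper[c] > threshold)]
# kept == ["a", "c"], dropped == ["b"]: the later of two highly correlated columns is removed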
394
+
395
+ # Filter methods
396
+ # ----------------
397
+
398
+ def select_categorical_features(self, percentile, save_dir: Optional[str] = None):
399
+ X, y = self.X_categorical, self.y
400
+
401
+ start = time.time()
402
+ logger.debug("Running Chi2 for categorical features...")
403
+ feat_selector = SelectPercentile(chi2, percentile=percentile).fit(X, y)
404
+ feat_scores = pd.DataFrame()
405
+ feat_scores["score"] = feat_selector.scores_
406
+ feat_scores["pvalue"] = feat_selector.pvalues_
407
+ feat_scores["support"] = feat_selector.get_support()
408
+ feat_scores["features"] = X.columns
409
+ feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
410
+ feat_scores["method"] = "Chi2"
411
+ feat_scores.sort_values("rank", ascending=True, inplace=True)
412
+ stop = time.time()
413
+ training_time = timedelta(seconds=(stop - start)).total_seconds()
414
+ feat_scores["training_time"] = training_time
415
+
416
+ logger.debug(
417
+ f"Chi2 evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
418
+ )
354
419
 
355
- return feat_scores
420
+ feat_scores.to_csv(
421
+ f"{save_dir}/Chi2.csv", index=True, header=True, index_label="ID"
422
+ )
356
423
 
424
+ return feat_scores
425
+
426
+ # Linear correlation (Pearson's R for regression and ANOVA for classification)
427
+ def select_feature_by_linear_correlation(
428
+ self, percentile: int = 20, save_dir: Optional[str] = None
429
+ ):
430
+ X, y, target_type = self.X_numerical, self.y, self.target_type
431
+
432
+ start = time.time()
433
+ test_type = "Person's R" if target_type == "regression" else "ANOVA"
434
+ logger.debug(f"Running {test_type}...")
435
+
436
+ model = f_regression if target_type == "regression" else f_classif
437
+ feat_selector = SelectPercentile(model, percentile=percentile).fit(X, y)
438
+ feat_scores = pd.DataFrame()
439
+ feat_scores["score"] = feat_selector.scores_
440
+ feat_scores["pvalue"] = feat_selector.pvalues_
441
+ feat_scores["support"] = feat_selector.get_support()
442
+ feat_scores["features"] = X.columns
443
+ feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
444
+ feat_scores["method"] = test_type
445
+ feat_scores.sort_values("rank", ascending=True, inplace=True)
446
+ stop = time.time()
447
+ training_time = timedelta(seconds=(stop - start)).total_seconds()
448
+ feat_scores["training_time"] = training_time
449
+
450
+ logger.debug(
451
+ f"{test_type} evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
452
+ )
357
453
 
358
- # Non-Linear correlation (Spearsman's R for regression and Kendall’s Tau for classification)
359
- def select_feature_by_nonlinear_correlation(
360
- X, y, target_type, percentile: int = 20, save_dir: Optional[str] = None
361
- ):
362
- start = time.time()
454
+ feat_scores.to_csv(
455
+ f"{save_dir}/{test_type}.csv",
456
+ index=True,
457
+ header=True,
458
+ index_label="ID",
459
+ )
363
460
 
364
- def model(X_model, y_model):
365
- X_model = pd.DataFrame(X_model)
366
- y_model = pd.Series(y_model)
461
+ return feat_scores
367
462
 
368
- method = "spearman" if target_type == "regression" else "kendall"
463
+ # Non-linear correlation (Spearman's R for regression and Kendall's Tau for classification)
464
+ def select_feature_by_nonlinear_correlation(
465
+ self, percentile: int = 20, save_dir: Optional[str] = None
466
+ ):
467
+ X, y, target_type = self.X_numerical, self.y, self.target_type
369
468
 
370
- corr_scores = []
371
- p_values = []
469
+ start = time.time()
372
470
 
373
- for col in X_model.columns:
374
- if method == "spearman":
375
- corr, pval = spearmanr(X_model[col], y_model)
376
- else: # Kendall's Tau for classification
377
- corr, pval = kendalltau(X_model[col], y_model)
471
+ def model(X_model, y_model):
472
+ X_model = pd.DataFrame(X_model)
473
+ y_model = pd.Series(y_model)
378
474
 
379
- corr_scores.append(abs(corr)) # Keeping absolute correlation
380
- p_values.append(pval)
475
+ method = "spearman" if target_type == "regression" else "kendall"
381
476
 
382
- return np.array(corr_scores), np.array(p_values)
477
+ corr_scores = []
478
+ p_values = []
383
479
 
384
- test_type = "Spearman’s R" if target_type == "regression" else "Kendall’s Tau"
385
- logger.debug(f"Running {test_type}...")
480
+ for col in X_model.columns:
481
+ if method == "spearman":
482
+ corr, pval = spearmanr(X_model[col], y_model)
483
+ else: # Kendall's Tau for classification
484
+ corr, pval = kendalltau(X_model[col], y_model)
386
485
 
387
- feat_selector = SelectPercentile(model, percentile=percentile).fit(X, y)
388
- feat_scores = pd.DataFrame()
389
- feat_scores["score"] = feat_selector.scores_
390
- feat_scores["pvalue"] = feat_selector.pvalues_
391
- feat_scores["support"] = feat_selector.get_support()
392
- feat_scores["features"] = X.columns
393
- feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
394
- feat_scores["method"] = test_type
395
- feat_scores.sort_values("rank", ascending=True, inplace=True)
396
- stop = time.time()
397
- training_time = timedelta(seconds=(stop - start)).total_seconds()
398
- feat_scores["training_time"] = training_time
486
+ corr_scores.append(abs(corr)) # Keeping absolute correlation
487
+ p_values.append(pval)
399
488
 
400
- logger.debug(
401
- f"{test_type} evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
402
- )
489
+ return np.array(corr_scores), np.array(p_values)
403
490
 
404
- feat_scores.to_csv(
405
- f"{save_dir}/{test_type}.csv",
406
- index=True,
407
- header=True,
408
- index_label="ID",
409
- )
491
+ test_type = "Spearman's R" if target_type == "regression" else "Kendall's Tau"
492
+ logger.debug(f"Running {test_type}...")
410
493
 
411
- return feat_scores
494
+ feat_selector = SelectPercentile(model, percentile=percentile).fit(X, y)
495
+ feat_scores = pd.DataFrame()
496
+ feat_scores["score"] = feat_selector.scores_
497
+ feat_scores["pvalue"] = feat_selector.pvalues_
498
+ feat_scores["support"] = feat_selector.get_support()
499
+ feat_scores["features"] = X.columns
500
+ feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
501
+ feat_scores["method"] = test_type
502
+ feat_scores.sort_values("rank", ascending=True, inplace=True)
503
+ stop = time.time()
504
+ training_time = timedelta(seconds=(stop - start)).total_seconds()
505
+ feat_scores["training_time"] = training_time
412
506
 
507
+ logger.debug(
508
+ f"{test_type} evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
509
+ )
413
510
 
414
- # Mutual Information
415
- def select_feature_by_mi(
416
- X, y, target_type, percentile: int = 20, save_dir: Optional[str] = None
417
- ):
418
- start = time.time()
419
- logger.debug("Running Mutual Information...")
420
- model = (
421
- mutual_info_regression if target_type == "regression" else mutual_info_classif
422
- )
423
- feat_selector = SelectPercentile(model, percentile=percentile).fit(X, y)
424
- feat_scores = pd.DataFrame()
425
- feat_scores["score"] = feat_selector.scores_
426
- feat_scores["support"] = feat_selector.get_support()
427
- feat_scores["features"] = X.columns
428
- feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
429
- feat_scores["method"] = "Mutual Information"
430
- feat_scores.sort_values("rank", ascending=True, inplace=True)
431
- stop = time.time()
432
- training_time = timedelta(seconds=(stop - start)).total_seconds()
433
- feat_scores["training_time"] = training_time
434
-
435
- logger.debug(
436
- f"MI evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
437
- )
511
+ feat_scores.to_csv(
512
+ f"{save_dir}/{test_type}.csv",
513
+ index=True,
514
+ header=True,
515
+ index_label="ID",
516
+ )
438
517
 
439
- feat_scores.to_csv(f"{save_dir}/MI.csv", index=True, header=True, index_label="ID")
440
-
441
- return feat_scores
442
-
443
-
444
- def select_categorical_features(X, y, percentile, save_dir: Optional[str] = None):
445
- start = time.time()
446
- logger.debug("Running Chi2 for categorical features...")
447
- feat_selector = SelectPercentile(chi2, percentile=percentile).fit(X, y)
448
- feat_scores = pd.DataFrame()
449
- feat_scores["score"] = feat_selector.scores_
450
- feat_scores["pvalue"] = feat_selector.pvalues_
451
- feat_scores["support"] = feat_selector.get_support()
452
- feat_scores["features"] = X.columns
453
- feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
454
- feat_scores["method"] = "Chi2"
455
- feat_scores.sort_values("rank", ascending=True, inplace=True)
456
- stop = time.time()
457
- training_time = timedelta(seconds=(stop - start)).total_seconds()
458
- feat_scores["training_time"] = training_time
459
-
460
- logger.debug(
461
- f"Chi2 evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
462
- )
518
+ return feat_scores
463
519
 
464
- feat_scores.to_csv(
465
- f"{save_dir}/Chi2.csv", index=True, header=True, index_label="ID"
466
- )
520
+ # Mutual Information
521
+ def select_feature_by_mi(
522
+ self, percentile: int = 20, save_dir: Optional[str] = None
523
+ ):
524
+ X, y, target_type = self.X_numerical, self.y, self.target_type
467
525
 
468
- return feat_scores
526
+ start = time.time()
527
+ logger.debug("Running Mutual Information...")
528
+ model = (
529
+ mutual_info_regression
530
+ if target_type == "regression"
531
+ else mutual_info_classif
532
+ )
533
+ feat_selector = SelectPercentile(model, percentile=percentile).fit(X, y)
534
+ feat_scores = pd.DataFrame()
535
+ feat_scores["score"] = feat_selector.scores_
536
+ feat_scores["support"] = feat_selector.get_support()
537
+ feat_scores["features"] = X.columns
538
+ feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
539
+ feat_scores["method"] = "Mutual Information"
540
+ feat_scores.sort_values("rank", ascending=True, inplace=True)
541
+ stop = time.time()
542
+ training_time = timedelta(seconds=(stop - start)).total_seconds()
543
+ feat_scores["training_time"] = training_time
544
+
545
+ logger.debug(
546
+ f"MI evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
547
+ )
469
548
 
549
+ feat_scores.to_csv(
550
+ f"{save_dir}/MI.csv", index=True, header=True, index_label="ID"
551
+ )
470
552
 
471
- # Intrisic/embeedded method
472
- # ----------------
553
+ return feat_scores
473
554
 
555
+ # Intrinsic/embedded method
556
+ # ----------------
474
557
 
475
- # feature importance
476
- def select_feature_by_feat_imp(
477
- X, y, target_type, percentile: int = 20, save_dir: Optional[str] = None
478
- ):
479
- start = time.time()
480
- logger.debug("Running Feature importance...")
558
+ # feature importance
559
+ def select_feature_by_feat_imp(
560
+ self, percentile: int = 20, save_dir: Optional[str] = None
561
+ ):
562
+ X, y, target_type = self.X_numerical, self.y, self.target_type
481
563
 
482
- params = {"n_estimators": 500, "max_depth": 2**3, "random_state": 42, "n_jobs": -1}
564
+ start = time.time()
565
+ logger.debug("Running Feature importance...")
483
566
 
484
- estimator = (
485
- RandomForestClassifier(**params)
486
- if target_type == "classification"
487
- else RandomForestRegressor(**params)
488
- )
567
+ params = {
568
+ "n_estimators": 500,
569
+ "max_depth": 2**3,
570
+ "random_state": 42,
571
+ "n_jobs": -1,
572
+ }
489
573
 
490
- feat_selector = SelectFromModel(
491
- estimator=estimator,
492
- threshold=-np.inf,
493
- max_features=int(percentile * X.shape[1] / 100),
494
- ).fit(X, y)
495
-
496
- feat_scores = pd.DataFrame()
497
- feat_scores["score"] = feat_selector.estimator_.feature_importances_
498
- feat_scores["support"] = feat_selector.get_support()
499
- feat_scores["features"] = X.columns
500
- feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
501
- feat_scores["method"] = "FI"
502
- feat_scores.sort_values("rank", ascending=True, inplace=True)
503
-
504
- stop = time.time()
505
- training_time = timedelta(seconds=(stop - start)).total_seconds()
506
- feat_scores["training_time"] = training_time
507
-
508
- logger.debug(
509
- f"Feat importance evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
510
- )
574
+ estimator = (
575
+ RandomForestClassifier(**params)
576
+ if target_type == "classification"
577
+ else RandomForestRegressor(**params)
578
+ )
511
579
 
512
- feat_scores.to_csv(f"{save_dir}/FI.csv", index=True, header=True, index_label="ID")
580
+ feat_selector = SelectFromModel(
581
+ estimator=estimator,
582
+ threshold=-np.inf,
583
+ max_features=int(percentile * X.shape[1] / 100),
584
+ ).fit(X, y)
585
+
586
+ feat_scores = pd.DataFrame()
587
+ feat_scores["score"] = feat_selector.estimator_.feature_importances_
588
+ feat_scores["support"] = feat_selector.get_support()
589
+ feat_scores["features"] = X.columns
590
+ feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
591
+ feat_scores["method"] = "FI"
592
+ feat_scores.sort_values("rank", ascending=True, inplace=True)
593
+
594
+ stop = time.time()
595
+ training_time = timedelta(seconds=(stop - start)).total_seconds()
596
+ feat_scores["training_time"] = training_time
597
+
598
+ logger.debug(
599
+ f"Feat importance evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
600
+ )
513
601
 
514
- return feat_scores
602
+ feat_scores.to_csv(
603
+ f"{save_dir}/FI.csv", index=True, header=True, index_label="ID"
604
+ )
515
605
 
606
+ return feat_scores
516
607
 
517
- # Wrapper method
518
- # ----------------
608
+ # Wrapper method
609
+ # ----------------
519
610
 
611
+ # recursive feature elimination
612
+ def select_feature_by_rfe(
613
+ self, percentile: int = 20, save_dir: Optional[str] = None
614
+ ):
615
+ X, y, target_type = self.X_numerical, self.y, self.target_type
520
616
 
521
- # recursive feature elimination
522
- def select_feature_by_rfe(
523
- X, y, target_type, percentile: int = 20, save_dir: Optional[str] = None
524
- ):
525
- start = time.time()
526
- logger.debug("Running Recursive Feature Elimination...")
617
+ start = time.time()
618
+ logger.debug("Running Recursive Feature Elimination...")
527
619
 
528
- params = {
529
- "max_depth": 2**3,
530
- "random_state": 42,
531
- }
532
- estimator = (
533
- DecisionTreeClassifier(**params)
534
- if target_type == "classification"
535
- else DecisionTreeRegressor(**params)
536
- )
537
- rfe = RFE(estimator, n_features_to_select=percentile / 100, step=4, verbose=0)
538
- feat_selector = rfe.fit(X, y)
539
-
540
- feat_scores = pd.DataFrame(
541
- {
542
- "score": 0.0, # Default feature importance
543
- "support": feat_selector.get_support(),
544
- "features": X.columns,
545
- "rank": 0,
546
- "method": "RFE",
620
+ params = {
621
+ "max_depth": 2**3,
622
+ "random_state": 42,
547
623
  }
548
- )
549
- feat_scores.loc[
550
- feat_scores["features"].isin(feat_selector.get_feature_names_out()), "score"
551
- ] = list(feat_selector.estimator_.feature_importances_)
552
- feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
553
- feat_scores.sort_values("rank", ascending=True, inplace=True)
554
-
555
- stop = time.time()
556
- training_time = timedelta(seconds=(stop - start)).total_seconds()
557
- feat_scores["training_time"] = training_time
558
-
559
- logger.debug(
560
- f"RFE evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
561
- )
562
-
563
- feat_scores.to_csv(f"{save_dir}/RFE.csv", index=True, header=True, index_label="ID")
564
-
565
- return feat_scores
624
+ estimator = (
625
+ DecisionTreeClassifier(**params)
626
+ if target_type == "classification"
627
+ else DecisionTreeRegressor(**params)
628
+ )
629
+ rfe = RFE(estimator, n_features_to_select=percentile / 100, step=4, verbose=0)
630
+ feat_selector = rfe.fit(X, y)
631
+
632
+ feat_scores = pd.DataFrame(
633
+ {
634
+ "score": 0.0, # Default feature importance
635
+ "support": feat_selector.get_support(),
636
+ "features": X.columns,
637
+ "rank": 0,
638
+ "method": "RFE",
639
+ }
640
+ )
641
+ feat_scores.loc[
642
+ feat_scores["features"].isin(feat_selector.get_feature_names_out()), "score"
643
+ ] = list(feat_selector.estimator_.feature_importances_)
644
+ feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
645
+ feat_scores.sort_values("rank", ascending=True, inplace=True)
646
+
647
+ stop = time.time()
648
+ training_time = timedelta(seconds=(stop - start)).total_seconds()
649
+ feat_scores["training_time"] = training_time
650
+
651
+ logger.debug(
652
+ f"RFE evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
653
+ )
566
654
 
655
+ feat_scores.to_csv(
656
+ f"{save_dir}/RFE.csv", index=True, header=True, index_label="ID"
657
+ )
567
658
 
568
- # SequentialFeatureSelector (loss based, possibility to do forwards or backwards selection or removal)
569
- def select_feature_by_sfs(
570
- X, y, target_type, percentile: int = 20, save_dir: Optional[str] = None
571
- ):
572
- start = time.time()
573
- logger.debug("Running Sequential Feature Selection...")
574
- warnings.filterwarnings("ignore", category=FutureWarning)
659
+ return feat_scores
575
660
 
576
- params = {
577
- "max_depth": 2**3,
578
- "random_state": 42,
579
- }
580
- estimator = (
581
- DecisionTreeClassifier(**params)
582
- if target_type == "classification"
583
- else DecisionTreeRegressor(**params)
584
- )
661
+ # SequentialFeatureSelector (loss-based; supports forward or backward selection/elimination)
662
+ def select_feature_by_sfs(
663
+ self, percentile: int = 20, save_dir: Optional[str] = None
664
+ ):
665
+ X, y, target_type = self.X_numerical, self.y, self.target_type
585
666
 
586
- n_splits = 3
587
- n_samples = len(X)
588
- test_size = int(n_samples / (n_splits + 4))
589
- tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
590
-
591
- score_function = (
592
- make_scorer(
593
- log_loss, response_method="predict_proba"
594
- ) # logloss needs probabilities
595
- if target_type == "classification"
596
- else make_scorer(root_mean_squared_error)
597
- ) # we avoid greater_is_better = False because it make the score negative and mess up ranking
598
-
599
- sfs = SequentialFeatureSelector(
600
- estimator,
601
- k_features=int(percentile * X.shape[1] / 100),
602
- forward=True,
603
- floating=True, # Enables dynamic feature elimination
604
- scoring=score_function,
605
- cv=tscv,
606
- n_jobs=-1,
607
- verbose=0,
608
- )
667
+ start = time.time()
668
+ logger.debug("Running Sequential Feature Selection...")
669
+ warnings.filterwarnings("ignore", category=FutureWarning)
609
670
 
610
- feat_selector = sfs.fit(X, y)
611
-
612
- # Extract selected features and their scores
613
- selected_features = set(feat_selector.k_feature_names_)
614
- feat_subsets = feat_selector.subsets_
615
-
616
- # Create DataFrame for feature scores
617
- feat_scores = pd.DataFrame(
618
- {
619
- "features": X.columns,
620
- "support": X.columns.isin(
621
- selected_features
622
- ), # TODO: comprendre pourquoi le support n'est pas correct (les bons scores ne sont pas toujours choisis)
623
- "score": 1000,
624
- "rank": None,
625
- "method": "SFS",
671
+ params = {
672
+ "max_depth": 2**3,
673
+ "random_state": 42,
626
674
  }
627
- )
628
-
629
- # Sort subsets by score (lower is better)
630
- sorted_subsets = sorted(feat_subsets.items(), key=lambda item: item[1]["avg_score"])
631
-
632
- # Record score per feature (first appearance)
633
- feature_score_map = {}
634
- for step in sorted_subsets:
635
- step = step[1]
636
- for feature in step["feature_names"]:
637
- if feature not in feature_score_map:
638
- feature_score_map[feature] = step["avg_score"]
639
-
640
- # Assign scores
641
- for feature, score in feature_score_map.items():
642
- feat_scores.loc[feat_scores["features"] == feature, "score"] = score
675
+ estimator = (
676
+ DecisionTreeClassifier(**params)
677
+ if target_type == "classification"
678
+ else DecisionTreeRegressor(**params)
679
+ )
643
680
 
644
- # rank by score (lower = better)
645
- feat_scores["rank"] = (
646
- feat_scores["score"].rank(method="first", ascending=True).astype(int)
647
- )
681
+ n_splits = 3
682
+ n_samples = len(X)
683
+ test_size = int(n_samples / (n_splits + 4))
684
+ tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
685
+
686
+ score_function = (
687
+ make_scorer(
688
+ log_loss, response_method="predict_proba"
689
+ ) # logloss needs probabilities
690
+ if target_type == "classification"
691
+ else make_scorer(root_mean_squared_error)
692
+ ) # we avoid greater_is_better=False because it makes the score negative and messes up ranking
693
+
694
+ sfs = SequentialFeatureSelector(
695
+ estimator,
696
+ k_features=int(percentile * X.shape[1] / 100),
697
+ forward=True,
698
+ floating=True, # Enables dynamic feature elimination
699
+ scoring=score_function,
700
+ cv=tscv,
701
+ n_jobs=-1,
702
+ verbose=0,
703
+ )
648
704
 
649
- feat_scores.sort_values("rank", ascending=True, inplace=True)
705
+ feat_selector = sfs.fit(X, y)
706
+
707
+ # Extract selected features and their scores
708
+ selected_features = set(feat_selector.k_feature_names_)
709
+ feat_subsets = feat_selector.subsets_
710
+
711
+ # Create DataFrame for feature scores
712
+ feat_scores = pd.DataFrame(
713
+ {
714
+ "features": X.columns,
715
+ "support": X.columns.isin(
716
+ selected_features
717
+ ), # TODO: understand why the support is not correct (the best-scoring features are not always selected)
718
+ "score": 1000,
719
+ "rank": None,
720
+ "method": "SFS",
721
+ }
722
+ )
650
723
 
651
- stop = time.time()
652
- training_time = timedelta(seconds=(stop - start)).total_seconds()
653
- feat_scores["training_time"] = training_time
724
+ # Sort subsets by score (lower is better)
725
+ sorted_subsets = sorted(
726
+ feat_subsets.items(), key=lambda item: item[1]["avg_score"]
727
+ )
654
728
 
655
- logger.debug(
656
- f"SFS evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
657
- )
729
+ # Record score per feature (first appearance)
730
+ feature_score_map = {}
731
+ for step in sorted_subsets:
732
+ step = step[1]
733
+ for feature in step["feature_names"]:
734
+ if feature not in feature_score_map:
735
+ feature_score_map[feature] = step["avg_score"]
736
+
737
+ # Assign scores
738
+ for feature, score in feature_score_map.items():
739
+ feat_scores.loc[feat_scores["features"] == feature, "score"] = score
740
+
741
+ # rank by score (lower = better)
742
+ feat_scores["rank"] = (
743
+ feat_scores["score"].rank(method="first", ascending=True).astype(int)
744
+ )
658
745
 
659
- feat_scores.to_csv(f"{save_dir}/SFS.csv", index=True, header=True, index_label="ID")
746
+ feat_scores.sort_values("rank", ascending=True, inplace=True)
660
747
 
661
- return feat_scores
748
+ stop = time.time()
749
+ training_time = timedelta(seconds=(stop - start)).total_seconds()
750
+ feat_scores["training_time"] = training_time
662
751
 
752
+ logger.debug(
753
+ f"SFS evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
754
+ )
663
755
 
664
- # Remove correlation
665
- # ------------------
756
+ feat_scores.to_csv(
757
+ f"{save_dir}/SFS.csv", index=True, header=True, index_label="ID"
758
+ )
666
759
 
760
+ return feat_scores
761
+
762
+
763
+ class PreprocessModel:
764
+
765
+ def __init__(
766
+ self,
767
+ train,
768
+ val,
769
+ test,
770
+ dataset,
771
+ target_numbers,
772
+ target_clf,
773
+ models_idx,
774
+ time_series,
775
+ max_timesteps,
776
+ group_column,
777
+ date_column,
778
+ **kwargs,
779
+ ):
780
+ self.dataset = dataset
781
+ self.target_numbers = target_numbers
782
+ self.target_clf = target_clf
783
+ self.models_idx = models_idx
784
+ self.time_series = time_series
785
+ self.max_timesteps = max_timesteps
786
+ self.group_column = group_column
787
+ self.date_column = date_column
788
+
789
+ self.dataset_dir = dataset.path
790
+ self.data_dir = f"{self.dataset_dir}/data"
791
+ self.preprocessing_dir = f"{self.dataset_dir}/preprocessing"
792
+
793
+ self.all_features = dataset.get_all_features(
794
+ date_column=date_column, group_column=group_column
795
+ )
796
+ columns_to_keep = self.all_features + [
797
+ f"TARGET_{i}" for i in self.target_numbers
798
+ ]
799
+ duplicates = [
800
+ col for col in set(columns_to_keep) if columns_to_keep.count(col) > 1
801
+ ]
802
+ if duplicates:
803
+ raise ValueError(f"Doublons détectés dans columns_to_keep: {duplicates}")
804
+
805
+ self.train = train[columns_to_keep]
806
+ if isinstance(val, pd.DataFrame):
807
+ self.val = val[columns_to_keep]
808
+ if isinstance(test, pd.DataFrame):
809
+ self.test = test[columns_to_keep]
810
+
811
+ def run(self):
812
+ # save data
813
+ joblib.dump(self.train, f"{self.data_dir}/train.pkl")
814
+ joblib.dump(self.val, f"{self.data_dir}/val.pkl")
815
+ joblib.dump(self.test, f"{self.data_dir}/test.pkl")
816
+
817
+ # scaling features
818
+ if any(t not in self.target_clf for t in self.target_numbers) and any(
819
+ all_models[i].get("need_scaling") for i in self.models_idx
820
+ ):
821
+ logger.info("Scaling features...")
822
+ train_scaled, scaler_x, scalers_y = self.scale_data(self.train)
823
+ val_scaled, _, _ = self.scale_data(
824
+ self.val,
825
+ scaler_x=scaler_x,
826
+ scalers_y=scalers_y,
827
+ )
828
+ test_scaled, _, _ = self.scale_data(
829
+ self.test,
830
+ scaler_x=scaler_x,
831
+ scalers_y=scalers_y,
832
+ )
833
+ else:
834
+ train_scaled = None
835
+ val_scaled = None
836
+ test_scaled = None
837
+
838
+ # save data
839
+ joblib.dump(train_scaled, f"{self.data_dir}/train_scaled.pkl")
840
+ joblib.dump(val_scaled, f"{self.data_dir}/val_scaled.pkl")
841
+ joblib.dump(test_scaled, f"{self.data_dir}/test_scaled.pkl")
842
+
843
+ data = {
844
+ "train": self.train,
845
+ "val": self.val,
846
+ "test": self.test,
847
+ "train_scaled": train_scaled,
848
+ "val_scaled": val_scaled,
849
+ "test_scaled": test_scaled,
850
+ "scalers_y": scalers_y,
851
+ }
667
852
 
668
- def remove_correlated_features(
669
- X: pd.DataFrame, features: list, corr_threshold: int, vizualize: bool = False
670
- ):
671
- # Create correlation matrix, select upper triangle & remove features with correlation greater than threshold
672
- corr_matrix = X[features].corr().abs()
853
+ # reshape data for time series
854
+ reshaped_data = None
855
+ if (
856
+ any(all_models[i].get("recurrent") for i in self.models_idx)
857
+ and self.time_series
858
+ ):
859
+ # reshaping data for recurrent models
860
+ logger.info("Reshaping data for recurrent models...")
861
+ reshaped_data = self.reshape_time_series(
862
+ train_scaled,
863
+ val_scaled,
864
+ test_scaled,
865
+ features=self.all_features,
866
+ timesteps=self.max_timesteps,
867
+ )
673
868
 
674
- upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
675
- features_uncorrelated = [
676
- column
677
- for column in upper.columns
678
- if all(upper[column].dropna() <= corr_threshold / 100)
679
- ]
680
- features_correlated = [
681
- column for column in upper.columns if any(upper[column] > corr_threshold / 100)
682
- ]
869
+ return data, reshaped_data
683
870
 
684
- if vizualize:
685
- features_selected_visualization = (
686
- X[features]
687
- .corr()
688
- .where(np.triu(np.ones(len(features)), k=1).astype(bool))
689
- .fillna(0)
871
+ def inference(self):
872
+ # self.train is new data here
873
+ scaler_x = joblib.load(f"{self.preprocessing_dir}/scaler_x.pkl")
874
+ scaled_data = scaler_x.transform(self.train)
875
+ scaled_data = pd.DataFrame(
876
+ scaled_data, columns=self.train.columns, index=self.train.index
690
877
  )
691
- # Plot the heatmap
692
- plt.figure(figsize=(10, 8))
693
- sns.heatmap(
694
- corr_matrix,
695
- annot=True,
696
- cmap="coolwarm",
697
- center=0,
698
- linewidths=1,
699
- linecolor="black",
700
- )
701
- plt.title(f"Correlation Matrix")
702
- plt.show()
703
-
704
- logger.info(f"\n{features_selected_visualization.describe().to_string()}")
705
- logger.info(f"\n{features_selected_visualization.to_string()}")
706
- return features_uncorrelated, features_correlated
707
-
708
-
709
- # Main feature selection function
710
- def feature_selection(
711
- dataset_id: int,
712
- train: pd.DataFrame,
713
- target_number: int,
714
- single_process: bool = False,
715
- ):
716
- """Function to do feature selection with a range of different feature selection technics
717
-
718
- Args:
719
- - train (pd.DataFrame): a pandas train set
720
- - target_number (in): a target, targets need to be name ``TARGET_{n}```
721
- - single_process (bool): if True, run all feature selection methods in a single process. If False, run them in parallel.
722
- """
723
-
724
- # Create the feature selection in db
725
- target = Target.find_by(name=f"TARGET_{target_number}")
726
- dataset = Dataset.get(dataset_id)
727
- percentile = dataset.percentile
728
- corr_threshold = dataset.corr_threshold
729
- max_features = dataset.max_features
730
-
731
- feature_selection = FeatureSelection.upsert(
732
- match_fields=["target_id", "dataset_id"],
733
- target_id=target.id,
734
- dataset_id=dataset.id,
735
- )
736
878
 
737
- X = train.loc[:, ~train.columns.str.contains("^TARGET_")]
738
- y = train[f"TARGET_{target_number}"]
879
+ reshaped_data = None
880
+ if (
881
+ any(all_models[i].get("recurrent") for i in self.models_idx)
882
+ and self.time_series
883
+ ):
884
+ # we need to make sure we have max_timesteps of data after grouping by group_column
885
+ if (
886
+ self.group_column
887
+ and scaled_data.groupby(self.group_column).size().min()
888
+ < self.max_timesteps
889
+ ) or scaled_data.shape[0] < self.max_timesteps:
890
+ raise ValueError(
891
+ f"Not enough data for group_column {self.group_column} to reshape data for recurrent models"
892
+ )
739
893
 
740
- logger.info(f"Starting feature selection for TARGET_{target_number}...")
894
+ # reshaping data for recurrent models
895
+ logger.info("Reshaping data for recurrent models...")
896
+ reshaped_data = self.reshape_time_series(
897
+ scaled_data,
898
+ features=self.all_features,
899
+ timesteps=self.max_timesteps,
900
+ )
741
901
 
742
- target_type = "classification" if target_number in TARGETS_CLF else "regression"
902
+ return self.train, scaled_data, reshaped_data
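The inference path depends on the scaler fitted during training being persisted with joblib and reloaded unchanged, so new rows are transformed exactly like the training data. A small self-contained sketch of that pattern (paths and values are illustrative):

    import joblib
    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    # Training time: fit on the training features and persist the fitted scaler.
    train_X = pd.DataFrame({"f1": [1.0, 2.0, 3.0], "f2": [10.0, 20.0, 30.0]})
    scaler_x = StandardScaler().fit(train_X)
    joblib.dump(scaler_x, "scaler_x.pkl")

    # Inference time: reload and apply the same transformation to unseen rows.
    new_X = pd.DataFrame({"f1": [2.5], "f2": [25.0]})
    scaler_x = joblib.load("scaler_x.pkl")
    new_scaled = pd.DataFrame(
        scaler_x.transform(new_X), columns=new_X.columns, index=new_X.index
    )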
903
+
904
+ # scaling
905
+ def scale_data(
906
+ self,
907
+ df: pd.DataFrame,
908
+ scaler_x=None,
909
+ scalers_y: Optional[list] = None,
910
+ ):
911
+ logger.info("Scale data...")
912
+ X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
913
+
914
+ if scaler_x:
915
+ X_scaled = pd.DataFrame(
916
+ scaler_x.transform(X), columns=list(X.columns), index=X.index
917
+ )
918
+ else:
919
+ scaler_x = StandardScaler() # MinMaxScaler(feature_range=(-1,1))
920
+ X_scaled = pd.DataFrame(
921
+ scaler_x.fit_transform(X), columns=list(X.columns), index=X.index
922
+ )
923
+ joblib.dump(scaler_x, f"{self.preprocessing_dir}/scaler_x.pkl")
743
924
 
744
- fs_dir_target = f"{dataset.path}/{y.name}/feature_selection"
745
- preprocessing_dir = f"{dataset.path}/preprocessing"
746
- os.makedirs(fs_dir_target, exist_ok=True)
747
- clean_directory(fs_dir_target)
925
+ # Determine which targets need to be scaled
926
+ targets_numbers_to_scale = [
927
+ i for i in self.target_numbers if i not in self.target_clf
928
+ ]
748
929
 
749
- # Let's start by removing extremely correlated features
750
- # This is needed to reduce the number of features, but also for methods such as ANOVA or chi2 that require independent features
751
- # TODO: we could also remove low variance features
752
- features_uncorrelated, features_correlated = remove_correlated_features(
753
- X, X.columns, 90, vizualize=False
754
- )
755
- X = X[features_uncorrelated]
756
-
757
- logger.debug(
758
- f"""
759
- \nWe first have removed {len(features_correlated)} features with correlation greater than 90%
760
- \nWe are looking to capture {percentile}% of {len(X.columns)} features, i.e. {int(len(X.columns)*percentile/100)} features, with different feature selection methods
761
- \nWe will then remove features correlated above {corr_threshold}%, keeping the ones with the best ranks
762
- \nFinally, we will keep only the {max_features} best ranked features
763
- """
764
- )
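As a concrete reading of the numbers logged here (illustrative values): with percentile=20 and 300 features surviving the initial 90% correlation cut, each selection method targets int(300 * 20 / 100) = 60 features; the deduplicated, rank-sorted union of the per-method picks is then pruned again at corr_threshold and finally truncated to max_features.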
930
+ # Dictionary to store scaled target data
931
+ scaled_targets = {}
765
932
 
766
- start = time.time()
933
+ if scalers_y:
934
+ for target_number in targets_numbers_to_scale:
935
+ y = df[[f"TARGET_{target_number}"]]
936
+ scaled_targets[target_number] = pd.DataFrame(
937
+ scalers_y[f"scaler_y_{target_number}"].transform(y.values),
938
+ columns=y.columns,
939
+ index=y.index,
940
+ )
941
+ else:
942
+ scalers_y = {}
943
+ for target_number in targets_numbers_to_scale:
944
+ scaler_y = StandardScaler()
945
+ y = df[[f"TARGET_{target_number}"]]
946
+
947
+ scaled_y = pd.DataFrame(
948
+ scaler_y.fit_transform(y.values),
949
+ columns=y.columns,
950
+ index=y.index,
951
+ )
952
+ joblib.dump(
953
+ scaler_y, f"{self.preprocessing_dir}/scaler_y_{target_number}.pkl"
954
+ )
767
955
 
768
- # handling categorical features (only if classification)
769
- categorical_features = X.select_dtypes(include=["int64", "Int64"]).columns.tolist()
770
- X_categorical = X[categorical_features]
956
+ scalers_y[f"scaler_y_{target_number}"] = scaler_y
957
+ scaled_targets[target_number] = scaled_y
771
958
 
772
- if target_type == "classification":
773
- feat_scores = select_categorical_features(
774
- X_categorical, y, percentile, save_dir=fs_dir_target
959
+ # Reconstruct y_scaled in the original order
960
+ y_scaled = pd.concat(
961
+ [
962
+ scaled_targets[target_number]
963
+ for target_number in targets_numbers_to_scale
964
+ ],
965
+ axis=1,
775
966
  )
776
- with get_db() as db:
777
- for row in feat_scores.itertuples(index=False):
778
- feature = Feature.find_by(name=row.features, db=db)
779
- FeatureSelectionRank.upsert(
780
- ["feature_selection_id", "feature_id", "method"],
781
- db=db,
782
- score=row.score,
783
- pvalue=row.pvalue,
784
- support=row.support,
785
- rank=row.rank,
786
- method=row.method,
787
- training_time=row.training_time,
788
- feature_selection_id=feature_selection.id,
789
- feature_id=feature.id,
790
- )
791
- categorical_features_selected = feat_scores[feat_scores["support"] == True][
792
- "features"
793
- ].values.tolist()
794
-
795
- # removing categorical features from X
796
- numerical_features = list(set(X.columns).difference(set(categorical_features)))
797
- X_numerical = X[numerical_features]
798
-
799
- results = []
800
- if single_process:
801
- results = [
802
- select_feature_by_linear_correlation(
803
- X_numerical, y, target_type, percentile, save_dir=fs_dir_target
804
- ),
805
- select_feature_by_nonlinear_correlation(
806
- X_numerical, y, target_type, percentile, save_dir=fs_dir_target
807
- ),
808
- select_feature_by_mi(
809
- X_numerical, y, target_type, percentile, save_dir=fs_dir_target
810
- ),
811
- select_feature_by_feat_imp(
812
- X_numerical, y, target_type, percentile, save_dir=fs_dir_target
813
- ),
814
- select_feature_by_rfe(
815
- X_numerical, y, target_type, percentile, save_dir=fs_dir_target
816
- ),
817
- # select_feature_by_sfs(
818
- # X_numerical, y, target_type, percentile, save_dir=fs_dir_target
819
- # ), # TODO: this is taking too long
967
+ y_not_scaled = df[
968
+ df.columns.intersection([f"TARGET_{i}" for i in self.target_clf])
820
969
  ]
821
- else:
822
- # Use ProcessPoolExecutor to run tasks in parallel
823
- with ProcessPoolExecutor() as executor:
824
- # Submit different functions to be executed in parallel
825
- futures = [
826
- executor.submit(
827
- select_feature_by_linear_correlation,
828
- X_numerical,
829
- y,
830
- target_type,
831
- percentile,
832
- save_dir=fs_dir_target,
833
- ),
834
- executor.submit(
835
- select_feature_by_nonlinear_correlation,
836
- X_numerical,
837
- y,
838
- target_type,
839
- percentile,
840
- save_dir=fs_dir_target,
841
- ),
842
- executor.submit(
843
- select_feature_by_mi,
844
- X_numerical,
845
- y,
846
- target_type,
847
- percentile,
848
- save_dir=fs_dir_target,
849
- ),
850
- executor.submit(
851
- select_feature_by_feat_imp,
852
- X_numerical,
853
- y,
854
- target_type,
855
- percentile,
856
- save_dir=fs_dir_target,
857
- ),
858
- executor.submit(
859
- select_feature_by_rfe,
860
- X_numerical,
861
- y,
862
- target_type,
863
- percentile,
864
- save_dir=fs_dir_target,
865
- ),
866
- executor.submit(
867
- select_feature_by_sfs,
868
- X_numerical,
869
- y,
870
- target_type,
871
- percentile,
872
- save_dir=fs_dir_target,
873
- ),
874
- ]
875
-
876
- # Wait for all futures to complete and gather the results
877
- with tqdm(total=len(futures)) as pbar:
878
- for future in as_completed(futures):
879
- results.append(future.result())
880
- pbar.update(1)
881
- logger.info(f"Finished feature selection for target {target_number}")
882
-
883
- stop = time.time()
884
-
885
- # Once all tasks are completed, start by inserting results to db
886
- feat_scores = pd.concat(
887
- results,
888
- axis=0,
889
- )
890
-
891
- logger.info("Inserting feature selection results to db...")
892
- rows = []
893
-
894
- with get_db() as db:
895
- feature_map = {f.name: f.id for f in Feature.get_all(db=db, limit=20000)}
896
- for row in feat_scores.itertuples(index=False):
897
- feature_id = feature_map.get(row.features)
898
- if not feature_id:
899
- continue # or raise if feature must exist
900
-
901
- rows.append(
902
- {
903
- "feature_selection_id": feature_selection.id,
904
- "feature_id": feature_id,
905
- "method": row.method,
906
- "score": row.score,
907
- "pvalue": None if pd.isna(row.pvalue) else row.pvalue,
908
- "support": row.support,
909
- "rank": row.rank,
910
- "training_time": row.training_time,
911
- }
912
- )
913
-
914
- if len(rows) == 0:
915
- raise ValueError(f"No features selected for TARGET_{target_number}")
916
-
917
- FeatureSelectionRank.bulk_upsert(rows=rows, db=db)
918
-
919
- # Merge the results
920
- features_selected = feat_scores[feat_scores["support"] == True][
921
- ["features", "rank"]
922
- ]
923
- features_selected.sort_values("rank", inplace=True)
924
- features_selected.drop_duplicates("features", inplace=True)
925
970
 
926
- features_selected_list = features_selected["features"].values.tolist()
971
+ # Ensure the final DataFrame keeps the original order
972
+ df_scaled = pd.concat(
973
+ [X_scaled, y_scaled, y_not_scaled],
974
+ axis=1,
975
+ )[
976
+ df.columns
977
+ ] # Reorder columns to match original `df`
978
+
979
+ if not df_scaled.columns.equals(df.columns):
980
+ raise Exception("Columns are not in the same order after scaling.")
981
+
982
+ return df_scaled, scaler_x, scalers_y
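A compact sketch of the target-scaling convention implemented above: each regression target gets its own StandardScaler stored under a scaler_y_{n} key, while classification targets are passed through unscaled. Column names and values below are illustrative:

    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    df = pd.DataFrame({
        "FEAT_A": [0.1, 0.4, 0.9],
        "TARGET_1": [10.0, 12.0, 9.0],  # regression target -> scaled
        "TARGET_2": [0, 1, 1],          # classification target -> left untouched
    })
    regression_targets = [1]

    scalers_y, scaled_targets = {}, {}
    for n in regression_targets:
        y = df[[f"TARGET_{n}"]]
        scaler = StandardScaler()
        scaled_targets[n] = pd.DataFrame(
            scaler.fit_transform(y.values), columns=y.columns, index=y.index
        )
        scalers_y[f"scaler_y_{n}"] = scaler

Reusing the stored scalers_y on validation and test frames keeps the target scale consistent across splits.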
983
+
984
+ # Reshape into 3D tensors for recurrent models
985
+ def reshape_time_series(
986
+ self,
987
+ train: pd.DataFrame,
988
+ val: pd.DataFrame,
989
+ test: pd.DataFrame,
990
+ features: list,
991
+ timesteps: int = 120,
992
+ ):
993
+ # always scale for recurrent layers: train should be scaled
994
+ group_column = self.group_column
995
+
996
+ target_columns = train.columns.intersection(
997
+ [f"TARGET_{i}" for i in self.target_numbers]
998
+ )
927
999
 
928
- logger.info("Merging feature selection methods...")
929
- # features_selected = list(dict.fromkeys(features_selected_by_mi + features_selected_by_nonlinear_correlation + features_selected_by_linear_correlation))
930
- features_selected_by_every_methods = set(results[0]["features"].values.tolist())
1000
+ data = pd.concat([train, val, test], axis=0)
931
1001
 
932
- for df in results[1:]:
933
- features_selected_by_every_methods &= set(
934
- df["features"].values.tolist()
935
- ) # intersection
1002
+ def reshape_df(df: pd.DataFrame, group_series: pd.Series, timesteps: int):
1003
+ fill_value = [[[0] * len(df.columns)]]
936
1004
 
937
- features_selected_by_every_methods = list(features_selected_by_every_methods)
1005
+ def shiftsum(x, timesteps: int):
1006
+ tmp = x.copy()
1007
+ for i in range(1, timesteps):
1008
+ tmp = x.shift(i, fill_value=fill_value) + tmp
1009
+ return tmp
938
1010
 
939
- logger.debug(
940
- f"We selected {len(features_selected_list)} features and {len(features_selected_by_every_methods)} were selected unanimously:"
941
- )
942
- logger.debug(features_selected_by_every_methods)
1011
+ logger.info("Grouping each feature in a unique column with list...")
1012
+ df_reshaped = df.apply(list, axis=1).apply(lambda x: [list(x)])
1013
+ df_reshaped = pd.concat([df_reshaped, group_series], axis=1)
943
1014
 
944
- pd.Series(features_selected_list).to_csv(
945
- f"{fs_dir_target}/features_before_corr.csv",
946
- index=True,
947
- header=True,
948
- index_label="ID",
949
- )
950
- features, features_correlated = remove_correlated_features(
951
- X, features_selected_list, corr_threshold
952
- )
953
- pd.Series(features).to_csv(
954
- f"{fs_dir_target}/features_before_max.csv",
955
- index=True,
956
- header=True,
957
- index_label="ID",
958
- )
959
- features = features[:max_features]
1015
+ logger.info("Grouping method stock and creating timesteps...")
1016
+ df_reshaped = (
1017
+ df_reshaped.groupby(group_column)[0]
1018
+ .apply(lambda x: shiftsum(x, timesteps))
1019
+ .reset_index(group_column, drop=True)
1020
+ .rename("RECURRENT_FEATURES")
1021
+ )
1022
+ df_reshaped = pd.DataFrame(df_reshaped)
960
1023
 
961
- features += categorical_features_selected if target_type == "classification" else []
962
- logger.debug(
963
- f"Final pre-selection: {len(features)} features below {corr_threshold}% out of {len(features_selected_list)} features, and rejected {len(features_correlated)} features, {100*len(features)/len(features_selected_list):.2f}% features selected"
964
- )
1024
+ return df_reshaped
965
1025
 
966
- features_selected_by_every_methods_uncorrelated = list(
967
- set(features) & set(features_selected_by_every_methods)
968
- )
969
- logger.debug(
970
- f"In this pre-selection, there is {len(features_selected_by_every_methods_uncorrelated)} features from the {len(features_selected_by_every_methods)} selected unanimously\n"
971
- )
1026
+ data_reshaped = reshape_df(data[features], data[group_column], timesteps)
972
1027
 
973
- logger.debug(
974
- features_selected[features_selected["features"].isin(features)].to_markdown()
975
- )
1028
+ data_reshaped[target_columns] = data[target_columns]
976
1029
 
977
- best_features_path = Path(
978
- f"{preprocessing_dir}/features_{target_number}.pkl"
979
- ).resolve()
980
- if PYTHON_ENV != "Test":
981
- joblib.dump(features, best_features_path)
1030
+ logger.info("Separating train, val, test data and creating np arrays...")
1031
+ train_reshaped = data_reshaped.loc[train.index]
1032
+ val_reshaped = data_reshaped.loc[val.index]
1033
+ test_reshaped = data_reshaped.loc[test.index]
982
1034
 
983
- db_features = Feature.filter(name__in=features)
984
- # Order matters, to keep the same order in db as in features, we need: map features by name
985
- feature_by_name = {f.name: f for f in db_features}
986
- # Reorder them according to original `features` list
987
- ordered_db_features = [
988
- feature_by_name[name] for name in features if name in feature_by_name
989
- ]
990
-
991
- feature_selection = FeatureSelection.get(feature_selection.id)
992
- feature_selection = feature_selection.add_features(ordered_db_features)
993
- feature_selection.training_time = stop - start
994
- feature_selection.best_features_path = best_features_path
995
- feature_selection.save()
1035
+ x_train_reshaped = np.array(
1036
+ train_reshaped["RECURRENT_FEATURES"].values.tolist()
1037
+ )
1038
+ y_train_reshaped = np.array(train_reshaped[target_columns].reset_index())
1039
+ x_val_reshaped = np.array(val_reshaped["RECURRENT_FEATURES"].values.tolist())
1040
+ y_val_reshaped = np.array(val_reshaped[target_columns].reset_index())
1041
+ x_test_reshaped = np.array(test_reshaped["RECURRENT_FEATURES"].values.tolist())
1042
+ y_test_reshaped = np.array(test_reshaped[target_columns].reset_index())
1043
+
1044
+ reshaped_data = {
1045
+ "x_train_reshaped": x_train_reshaped,
1046
+ "y_train_reshaped": y_train_reshaped,
1047
+ "x_val_reshaped": x_val_reshaped,
1048
+ "y_val_reshaped": y_val_reshaped,
1049
+ "x_test_reshaped": x_test_reshaped,
1050
+ "y_test_reshaped": y_test_reshaped,
1051
+ }
996
1052
 
997
- return features
1053
+ return reshaped_data
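The shift-and-concatenate step above turns each row into a list of timesteps feature vectors, so the stacked x arrays end up with shape (n_samples, timesteps, n_features). A minimal numpy sketch of the same windowing idea, using explicit slicing instead of the pandas shift and zero-padding the first rows as the fill_value does above:

    import numpy as np

    n_samples, timesteps, n_features = 8, 3, 4
    values = np.arange(n_samples * n_features, dtype=float).reshape(n_samples, n_features)

    # Each window holds the current row plus the (timesteps - 1) rows before it.
    padded = np.vstack([np.zeros((timesteps - 1, n_features)), values])
    windows = np.stack([padded[i : i + timesteps] for i in range(n_samples)])

    assert windows.shape == (n_samples, timesteps, n_features)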
998
1054
 
999
1055
 
1056
+ # utils
1000
1057
  # TODO : can we use this to select the ideal number of features ?
1001
1058
  def feature_selection_analysis(feature_selection_id: int, n_components: int = 5):
1002
1059
 
@@ -1072,158 +1129,18 @@ def feature_selection_analysis(feature_selection_id: int, n_components: int = 5)
1072
1129
  plt.show()
1073
1130
 
1074
1131
 
1075
- # scaling
1076
- def scale_data(
1077
- df: pd.DataFrame, save_dir: str, scaler_x=None, scalers_y: Optional[list] = None
1078
- ):
1079
- logger.info("Scale data...")
1080
- X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
1081
-
1082
- if scaler_x:
1083
- X_scaled = pd.DataFrame(
1084
- scaler_x.transform(X), columns=list(X.columns), index=X.index
1085
- )
1086
- else:
1087
- scaler_x = StandardScaler() # MinMaxScaler(feature_range=(-1,1))
1088
- X_scaled = pd.DataFrame(
1089
- scaler_x.fit_transform(X), columns=list(X.columns), index=X.index
1090
- )
1091
- if PYTHON_ENV != "Test":
1092
- joblib.dump(scaler_x, f"{save_dir}/scaler_x.pkl")
1093
-
1094
- # Determine which targets need to be scaled
1095
- targets_numbers_to_scale = [i for i in TARGETS_NUMBER if i not in TARGETS_CLF]
1096
-
1097
- # Dictionary to store scaled target data
1098
- scaled_targets = {}
1099
-
1100
- if scalers_y:
1101
- for target_number in targets_numbers_to_scale:
1102
- y = df[[f"TARGET_{target_number}"]]
1103
- scaled_targets[target_number] = pd.DataFrame(
1104
- scalers_y[f"scaler_y_{target_number}"].transform(y.values),
1105
- columns=y.columns,
1106
- index=y.index,
1107
- )
1108
- else:
1109
- scalers_y = {}
1110
- for target_number in targets_numbers_to_scale:
1111
- scaler_y = StandardScaler()
1112
- y = df[[f"TARGET_{target_number}"]]
1113
-
1114
- scaled_y = pd.DataFrame(
1115
- scaler_y.fit_transform(y.values),
1116
- columns=y.columns,
1117
- index=y.index,
1118
- )
1119
- if PYTHON_ENV != "Test":
1120
- joblib.dump(scaler_y, f"{save_dir}/scaler_y_{target_number}.pkl")
1121
-
1122
- scalers_y[f"scaler_y_{target_number}"] = scaler_y
1123
- scaled_targets[target_number] = scaled_y
1124
-
1125
- # Reconstruct y_scaled in the original order
1126
- y_scaled = pd.concat(
1127
- [scaled_targets[target_number] for target_number in targets_numbers_to_scale],
1128
- axis=1,
1129
- )
1130
- y_not_scaled = df[df.columns.intersection([f"TARGET_{i}" for i in TARGETS_CLF])]
1131
-
1132
- # Ensure the final DataFrame keeps the original order
1133
- df_scaled = pd.concat(
1134
- [X_scaled, y_scaled, y_not_scaled],
1135
- axis=1,
1136
- )[
1137
- df.columns
1138
- ] # Reorder columns to match original `df`
1139
-
1140
- if not df_scaled.columns.equals(df.columns):
1141
- raise Exception("Columns are not in the same order after scaling.")
1142
-
1143
- return df_scaled, scaler_x, scalers_y
1144
-
1145
-
1146
- # Reshape into 3D tensors for recurrent models
1147
- def reshape_time_series(
1148
- train: pd.DataFrame,
1149
- val: pd.DataFrame,
1150
- test: pd.DataFrame,
1151
- features: list,
1152
- timesteps: int = 120,
1153
- ):
1154
- # always scale for recurrent layers : train should be scaled
1155
-
1156
- target_columns = train.columns.intersection([f"TARGET_{i}" for i in TARGETS_NUMBER])
1157
-
1158
- data = pd.concat([train, val, test], axis=0)
1159
-
1160
- data_reshaped = reshape_df(data[features], data[GROUPING_COLUMN], timesteps)
1161
-
1162
- data_reshaped[target_columns] = data[target_columns]
1163
-
1164
- logger.info("Separating train, val, test data and creating np arrays...")
1165
- train_reshaped = data_reshaped.loc[train.index]
1166
- val_reshaped = data_reshaped.loc[val.index]
1167
- test_reshaped = data_reshaped.loc[test.index]
1168
-
1169
- x_train_reshaped = np.array(train_reshaped["RECURRENT_FEATURES"].values.tolist())
1170
- y_train_reshaped = np.array(train_reshaped[target_columns].reset_index())
1171
- x_val_reshaped = np.array(val_reshaped["RECURRENT_FEATURES"].values.tolist())
1172
- y_val_reshaped = np.array(val_reshaped[target_columns].reset_index())
1173
- x_test_reshaped = np.array(test_reshaped["RECURRENT_FEATURES"].values.tolist())
1174
- y_test_reshaped = np.array(test_reshaped[target_columns].reset_index())
1175
-
1176
- reshaped_data = {
1177
- "x_train_reshaped": x_train_reshaped,
1178
- "y_train_reshaped": y_train_reshaped,
1179
- "x_val_reshaped": x_val_reshaped,
1180
- "y_val_reshaped": y_val_reshaped,
1181
- "x_test_reshaped": x_test_reshaped,
1182
- "y_test_reshaped": y_test_reshaped,
1183
- }
1184
-
1185
- return reshaped_data
1186
-
1187
-
1188
- def reshape_df(df: pd.DataFrame, stock_column: pd.DataFrame, timesteps: int):
1189
- fill_value = [[[0] * len(df.columns)]]
1190
-
1191
- def shiftsum(x, timesteps: int):
1192
- tmp = x.copy()
1193
- for i in range(1, timesteps):
1194
- tmp = x.shift(i, fill_value=fill_value) + tmp
1195
- return tmp
1196
-
1197
- logger.info("Grouping each feature in a unique column with list...")
1198
- df_reshaped = df.apply(list, axis=1).apply(lambda x: [list(x)])
1199
- df_reshaped = pd.concat([df_reshaped, stock_column], axis=1)
1200
-
1201
- logger.info("Grouping method stock and creating timesteps...")
1202
- df_reshaped = (
1203
- df_reshaped.groupby(GROUPING_COLUMN)[0]
1204
- .apply(lambda x: shiftsum(x, timesteps))
1205
- .reset_index(GROUPING_COLUMN, drop=True)
1206
- .rename("RECURRENT_FEATURES")
1207
- )
1208
- df_reshaped = pd.DataFrame(df_reshaped)
1209
-
1210
- return df_reshaped
1211
-
1212
-
1213
- def load_train_data(dataset_dir, target_number, target_type="regression"):
1214
- train_data_dir = f"{dataset_dir}/data"
1215
- preprocessing_dir = f"{dataset_dir}/preprocessing"
1216
-
1217
- _scaler_y = (
1218
- joblib.load(f"{preprocessing_dir}/scaler_y_{target_number}.pkl")
1219
- if target_type == "regression"
1220
- else None
1221
- )
1132
+ def get_features_by_types(df: pd.DataFrame, sample_categorical_threshold: int = 15):
1133
+ categorical_features = [
1134
+ col
1135
+ for col in df.columns
1136
+ if df[col].nunique() <= sample_categorical_threshold
1137
+ and df[col].dtype in ["int64", "Int64"]
1138
+ ]
1139
+ df_categorical = df[categorical_features]
1140
+ logger.info(f"Number of categorical features: {len(categorical_features)}")
1222
1141
 
1223
- logger.info("Loading data...")
1224
- train = joblib.load(f"{train_data_dir}/train.pkl")
1225
- val = joblib.load(f"{train_data_dir}/val.pkl")
1226
- train_scaled = joblib.load(f"{train_data_dir}/train_scaled.pkl")
1227
- val_scaled = joblib.load(f"{train_data_dir}/val_scaled.pkl")
1142
+ numerical_features = list(set(df.columns).difference(set(categorical_features)))
1143
+ df_numerical = df[numerical_features]
1144
+ logger.info(f"Number of numerical features: {len(numerical_features)}")
1228
1145
 
1229
- return train, val, train_scaled, val_scaled, _scaler_y
1146
+ return df_categorical, df_numerical
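A small illustration of the split rule implemented by get_features_by_types: an integer-typed column with at most sample_categorical_threshold distinct values counts as categorical, everything else as numerical. The toy columns below are hypothetical:

    import pandas as pd

    threshold = 15
    df = pd.DataFrame({
        "sector_code": pd.Series([1, 2, 2, 3], dtype="int64"),  # few integer levels -> categorical
        "ret_1d": [0.01, -0.02, 0.005, 0.0],                    # continuous float -> numerical
    })

    categorical = [
        col for col in df.columns
        if df[col].nunique() <= threshold and df[col].dtype in ["int64", "Int64"]
    ]
    numerical = [col for col in df.columns if col not in categorical]
    print(categorical, numerical)  # ['sector_code'] ['ret_1d']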