lecrapaud 0.19.0__py3-none-any.whl → 0.22.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lecrapaud/__init__.py +22 -1
- lecrapaud/{api.py → base.py} +331 -241
- lecrapaud/config.py +15 -3
- lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +39 -0
- lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +264 -0
- lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +75 -0
- lecrapaud/db/models/__init__.py +2 -4
- lecrapaud/db/models/base.py +116 -65
- lecrapaud/db/models/experiment.py +195 -182
- lecrapaud/db/models/feature_selection.py +0 -3
- lecrapaud/db/models/feature_selection_rank.py +0 -18
- lecrapaud/db/models/model_selection.py +2 -2
- lecrapaud/db/models/{score.py → model_selection_score.py} +29 -12
- lecrapaud/db/session.py +4 -0
- lecrapaud/experiment.py +44 -17
- lecrapaud/feature_engineering.py +45 -674
- lecrapaud/feature_preprocessing.py +1202 -0
- lecrapaud/feature_selection.py +145 -332
- lecrapaud/integrations/sentry_integration.py +46 -0
- lecrapaud/misc/tabpfn_tests.ipynb +2 -2
- lecrapaud/mixins.py +247 -0
- lecrapaud/model_preprocessing.py +295 -0
- lecrapaud/model_selection.py +612 -242
- lecrapaud/pipeline.py +548 -0
- lecrapaud/search_space.py +2 -1
- lecrapaud/utils.py +36 -3
- lecrapaud-0.22.6.dist-info/METADATA +423 -0
- lecrapaud-0.22.6.dist-info/RECORD +51 -0
- {lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info}/WHEEL +1 -1
- {lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info/licenses}/LICENSE +1 -1
- lecrapaud/db/models/model_training.py +0 -64
- lecrapaud/jobs/__init__.py +0 -13
- lecrapaud/jobs/config.py +0 -17
- lecrapaud/jobs/scheduler.py +0 -30
- lecrapaud/jobs/tasks.py +0 -17
- lecrapaud-0.19.0.dist-info/METADATA +0 -249
- lecrapaud-0.19.0.dist-info/RECORD +0 -48
lecrapaud/feature_selection.py
CHANGED
@@ -5,16 +5,12 @@ import matplotlib.pyplot as plt
 import seaborn as sns
 import os
 import time
-from typing import Optional
 from tqdm import tqdm
 import warnings
 from concurrent.futures import ProcessPoolExecutor, as_completed
 import joblib
-import re
 from pathlib import Path

-os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
-
 # feature selection
 from sklearn.feature_selection import (
     f_classif,
@@ -33,7 +29,6 @@ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 from sklearn.model_selection import TimeSeriesSplit
 from sklearn.metrics import root_mean_squared_error, log_loss, make_scorer
 from mlxtend.feature_selection import SequentialFeatureSelector
-from sklearn.preprocessing import StandardScaler, MinMaxScaler
 from scipy.stats import spearmanr, kendalltau

 # Internal
@@ -48,6 +43,7 @@ from lecrapaud.db import (
     FeatureSelectionRank,
 )
 from lecrapaud.search_space import all_models
+from lecrapaud.mixins import LeCrapaudEstimatorMixin

 # Annoying Warnings
 warnings.filterwarnings("ignore", category=FutureWarning)
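LeCrapaudEstimatorMixin comes from the new lecrapaud/mixins.py (+247 lines, not part of this diff). Later hunks call _validate_data, _set_fitted, and _check_is_fitted on it; the fitted-state half of such a mixin could look like the minimal sketch below. This is illustrative only, not the package's actual code.

from sklearn.exceptions import NotFittedError

# Sketch of a fitted-state guard implied by _set_fitted()/_check_is_fitted();
# the real LeCrapaudEstimatorMixin in lecrapaud/mixins.py is not shown here.
class FittedStateMixin:
    _is_fitted = False

    def _set_fitted(self):
        # Called at the end of a successful fit()
        self._is_fitted = True

    def _check_is_fitted(self):
        # Called by any accessor that requires a completed fit()
        if not self._is_fitted:
            raise NotFittedError(
                f"This {type(self).__name__} instance is not fitted yet. "
                "Call 'fit' before using this method."
            )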
@@ -72,39 +68,66 @@ def load_train_data(experiment_dir):
     return train, val, test, train_scaled, val_scaled, test_scaled


-class FeatureSelectionEngine:
-    def __init__(self, …
-        …
+class FeatureSelector(LeCrapaudEstimatorMixin):
+    def __init__(self, experiment=None, target_number=None, **kwargs):
+        # The mixin will automatically set all experiment.context parameters as attributes
+        super().__init__(experiment=experiment, target_number=target_number, **kwargs)
+
+        # Set defaults for required parameters if not provided
+        if not hasattr(self, "target_clf"):
+            self.target_clf = []
+        if not hasattr(self, "max_p_value_categorical"):
+            self.max_p_value_categorical = 0.05
+        if not hasattr(self, "percentile"):
+            self.percentile = 20
+        if not hasattr(self, "corr_threshold"):
+            self.corr_threshold = 80
+        if not hasattr(self, "max_features"):
+            self.max_features = 50
+
         self.target_number = target_number
-        self.target_clf = target_clf

-        …
+        # Derived attributes
+        if self.target_number is not None and hasattr(self, "target_clf"):
+            self.target_type = (
+                "classification"
+                if self.target_number in self.target_clf
+                else "regression"
+            )

-        …
+        # Set paths if experiment is available
+        if self.experiment:
+            self.experiment_dir = self.experiment.path
+            self.experiment_id = self.experiment.id
+            self.data_dir = f"{self.experiment_dir}/data"
+            if self.target_number is not None:
+                self.target_dir = f"{self.experiment_dir}/TARGET_{self.target_number}"
+                self.feature_selection_dir = f"{self.target_dir}/feature_selection"
+                os.makedirs(self.feature_selection_dir, exist_ok=True)

     # Main feature selection function
-    def …
-        …
-    ):
-        """Function to do feature selection with a range of different feature selection technics
+    def fit(self, X, y=None, single_process=True):
+        """
+        Fit the feature selector.

         Args:
-            …
+            X (pd.DataFrame): Input features
+            y: Target values (ignored, uses TARGET columns in X)
+            single_process (bool): if True, run all feature selection methods in a single process
+
+        Returns:
+            self: Returns self for chaining (sklearn convention)
         """
+        # Validate data
+        X, y = self._validate_data(X, y)
+
+        # Store train data
+        self.train = X
+
+        # Check that target_number is set
+        if self.target_number is None:
+            raise ValueError("target_number must be set before fitting")
+
         target_number = self.target_number
         target_type = self.target_type

@@ -115,7 +138,6 @@ class FeatureSelectionEngine:
         max_features = self.max_features

         feature_selection = FeatureSelection.upsert(
-            match_fields=["target_id", "experiment_id"],
             target_id=target.id,
             experiment_id=self.experiment_id,
         )
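The explicit match_fields argument is dropped here. Given that this release also reworks lecrapaud/db/models/base.py (+116 -65) and adds unique constraints via the Alembic migrations listed above, a plausible reading is that upsert now infers its match key from the model's declared unique constraints. A sketch of that pattern follows; the names and session handling are assumptions, not the package's actual implementation.

from sqlalchemy import UniqueConstraint

class UpsertMixin:
    """Sketch: upsert keyed on the table's unique constraints."""

    @classmethod
    def upsert(cls, session, **values):
        # Collect the columns covered by declared unique constraints
        # instead of requiring an explicit match_fields argument.
        keys = [
            col.name
            for constraint in cls.__table__.constraints
            if isinstance(constraint, UniqueConstraint)
            for col in constraint.columns
        ]
        filters = {k: values[k] for k in keys if k in values}
        instance = (
            session.query(cls).filter_by(**filters).one_or_none() if filters else None
        )
        if instance is None:
            instance = cls(**values)
            session.add(instance)
        else:
            for key, value in values.items():
                setattr(instance, key, value)
        session.commit()
        return instance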
@@ -276,6 +298,58 @@ class FeatureSelectionEngine:

         features_selected_list = features_selected["features"].values.tolist()

+        # Save ensemble features for all numerical features with global ranking
+        logger.info(
+            "Saving ensemble features with global ranking for all numerical features..."
+        )
+        numerical_features_in_data = self.X_numerical.columns.tolist()
+        ensemble_rows = []
+
+        # Create global ranking for ALL numerical features (1 to n, no null values)
+        all_numerical_scores = pd.concat(results, axis=0)
+        all_numerical_scores = (
+            all_numerical_scores.groupby("features")
+            .agg({"rank": "mean"})  # Average rank across all methods
+            .reset_index()
+        )
+        all_numerical_scores.sort_values("rank", inplace=True)
+        all_numerical_scores["global_rank"] = range(1, len(all_numerical_scores) + 1)
+
+        for feature in numerical_features_in_data:
+            feature_id = feature_map.get(feature)
+            if feature_id:
+                is_selected = feature in features_selected_list
+
+                # Get global rank (no null values - all features get a rank)
+                if feature in all_numerical_scores["features"].values:
+                    global_rank = all_numerical_scores[
+                        all_numerical_scores["features"] == feature
+                    ]["global_rank"].values[0]
+                else:
+                    # Fallback: assign last rank + position for features not in results
+                    global_rank = (
+                        len(all_numerical_scores)
+                        + numerical_features_in_data.index(feature)
+                        + 1
+                    )
+
+                ensemble_rows.append(
+                    {
+                        "feature_selection_id": feature_selection.id,
+                        "feature_id": feature_id,
+                        "method": "ensemble",
+                        "score": None,
+                        "pvalue": None,
+                        "support": (
+                            2 if is_selected else 0
+                        ),  # 2 = in aggregated features
+                        "rank": global_rank,
+                        "training_time": 0,
+                    }
+                )
+
+        FeatureSelectionRank.bulk_upsert(rows=ensemble_rows)
+
         # analysis 1
         features_selected_by_every_methods = set(results[0]["features"].values.tolist())
         for df in results[1:]:
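The new block derives a single global ranking by averaging each feature's rank across all selection methods, then densely re-ranking from 1 to n. A standalone pandas sketch of that aggregation on toy data; the method names and values here are made up for illustration.

import pandas as pd

# Toy per-method rank tables shaped like the `results` DataFrames above
# (columns "features" and "rank").
f_classif_ranks = pd.DataFrame({"features": ["a", "b", "c"], "rank": [1, 2, 3]})
mutual_info_ranks = pd.DataFrame({"features": ["a", "b", "c"], "rank": [2, 1, 3]})
results = [f_classif_ranks, mutual_info_ranks]

# Same aggregation as the diff: mean rank per feature across methods,
# then a dense 1..n global rank with no nulls.
scores = (
    pd.concat(results, axis=0)
    .groupby("features")
    .agg({"rank": "mean"})
    .reset_index()
    .sort_values("rank")
)
scores["global_rank"] = range(1, len(scores) + 1)
print(scores)  # a and b tie at mean rank 1.5; the stable sort breaks the tie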
@@ -303,12 +377,30 @@ class FeatureSelectionEngine:
             header=True,
             index_label="ID",
         )
+
+        # Update support for features after correlation removal (before max)
+        logger.info("Updating ensemble features after correlation removal...")
+        for row in ensemble_rows:
+            feature = Feature.get(row["feature_id"]).name
+            if feature in features:
+                row["support"] = 1  # 1 = survived correlation removal
+
         features = features[:max_features]

         # adding categorical features selected
         features += (
             categorical_features_selected if target_type == "classification" else []
         )
+
+        # Final update for features after max limitation (final selection)
+        logger.info("Finalizing ensemble features...")
+        for row in ensemble_rows:
+            feature = Feature.get(row["feature_id"]).name
+            if feature in features and row["support"] == 1:
+                row["support"] = 2  # 2 = in final selection
+
+        # Re-save all ensemble data with updated support values
+        FeatureSelectionRank.bulk_upsert(rows=ensemble_rows)
         logger.debug(
             f"Final pre-selection: {len(features)} features below {corr_threshold}% out of {len(features_selected_list)} features, and rejected {len(features_correlated)} features, {100*len(features)/len(features_selected_list):.2f}% features selected"
         )
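Taken together, the three writes give the ensemble rows a staged support encoding, per the inline comments: 0 = not in the aggregated per-method selection, 1 = survived correlation removal but fell outside the final cut, 2 = in the final selection (rows cut by correlation removal keep their first-pass value, since only survivors are rewritten). A toy trace of one surviving row, purely illustrative:

# Toy trace of one ensemble row through the three passes above
row = {"support": 0}
row["support"] = 2      # pass 1: in the aggregated per-method selection
row["support"] = 1      # pass 2: survived correlation removal
if row["support"] == 1:
    row["support"] = 2  # pass 3: also within the max_features cap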
@@ -352,7 +444,20 @@ class FeatureSelectionEngine:
         feature_selection.best_features_path = best_features_path
         feature_selection.save()

-        …
+        # Store selected features for later access
+        self.selected_features_ = features
+        self._set_fitted()
+        return self
+
+    def get_selected_features(self):
+        """
+        Get the list of selected features after fitting.
+
+        Returns:
+            list: Selected feature names
+        """
+        self._check_is_fitted()
+        return self.selected_features_

     # Remove correlation
     # ------------------
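With fit returning self and get_selected_features guarded by the fitted check, the class now follows the sklearn estimator protocol. A hypothetical usage sketch inferred from this diff; `experiment` stands for a lecrapaud Experiment with .path and .id, and X is a DataFrame carrying the TARGET_n columns, per the fit docstring.

from lecrapaud.feature_selection import FeatureSelector

# experiment and X are placeholders for objects created elsewhere
selector = FeatureSelector(experiment=experiment, target_number=1)
selector.fit(X)                           # returns self, sklearn-style
print(selector.get_selected_features())  # available only after fit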
@@ -441,13 +546,20 @@ class FeatureSelectionEngine:
         feat_scores["features"] = X.columns
         feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
         feat_scores["method"] = "Chi2"
+
+        # Apply both percentile and p-value filtering
+        # Keep features that satisfy BOTH conditions: within percentile AND p-value < threshold
+        feat_scores["support"] = feat_scores["support"] & (
+            feat_scores["pvalue"] <= self.max_p_value_categorical
+        )
+
         feat_scores.sort_values("rank", ascending=True, inplace=True)
         stop = time.time()
         training_time = timedelta(seconds=(stop - start)).total_seconds()
         feat_scores["training_time"] = training_time

         logger.debug(
-            f"Chi2 evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
+            f"Chi2 evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds (percentile={percentile}%, p-value<={self.max_p_value_categorical})"
         )

         feat_scores.to_csv(
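The new gate requires a categorical feature to clear both filters at once: it must sit inside the score percentile and its chi-squared p-value must not exceed the threshold. A self-contained sketch of the same AND-logic with scikit-learn's chi2 scorer; the data and names below are made up, and whether the engine uses SelectPercentile internally is an assumption (its percentile-based support is computed upstream of this hunk).

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, chi2

# Toy non-negative data, as chi2 requires.
rng = np.random.default_rng(0)
X = pd.DataFrame(
    rng.integers(0, 5, size=(200, 10)),
    columns=[f"cat_{i}" for i in range(10)],
)
y = rng.integers(0, 2, size=200)

percentile = 20
max_p_value = 0.05

selector = SelectPercentile(chi2, percentile=percentile).fit(X, y)
pvalues = selector.pvalues_

# Same idea as the diff: a feature must be in the top percentile
# AND have a p-value at or below the threshold.
support = selector.get_support() & (pvalues <= max_p_value)
print(f"kept {support.sum()} of {X.shape[1]} features")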
@@ -796,305 +908,6 @@ class FeatureSelectionEngine:
         return feat_scores


-class PreprocessModel:
-
-    def __init__(
-        self,
-        train,
-        val,
-        test,
-        experiment,
-        target_numbers,
-        target_clf,
-        models_idx,
-        time_series,
-        max_timesteps,
-        group_column,
-        date_column,
-        **kwargs,
-    ):
-        self.train = train
-        self.val = val
-        self.test = test
-        self.experiment = experiment
-        self.target_numbers = target_numbers
-        self.target_clf = target_clf
-        self.models_idx = models_idx
-        self.time_series = time_series
-        self.max_timesteps = max_timesteps
-        self.group_column = group_column
-        self.date_column = date_column
-
-        self.experiment_dir = experiment.path
-        self.data_dir = f"{self.experiment_dir}/data"
-        self.preprocessing_dir = f"{self.experiment_dir}/preprocessing"
-
-        self.all_features = experiment.get_all_features(
-            date_column=date_column, group_column=group_column
-        )
-
-    def run(self):
-        # save data
-        columns_to_keep = self.all_features + [
-            f"TARGET_{i}" for i in self.target_numbers
-        ]
-        duplicates = [
-            col for col in set(columns_to_keep) if columns_to_keep.count(col) > 1
-        ]
-        if duplicates:
-            raise ValueError(f"Duplicates detected in columns_to_keep: {duplicates}")
-        self.train = self.train[columns_to_keep]
-        self.val = self.val[columns_to_keep]
-        self.test = self.test[columns_to_keep]
-        joblib.dump(self.train, f"{self.data_dir}/train.pkl")
-        joblib.dump(self.val, f"{self.data_dir}/val.pkl")
-        joblib.dump(self.test, f"{self.data_dir}/test.pkl")
-
-        # scaling features
-        if any(t not in self.target_clf for t in self.target_numbers) and any(
-            all_models[i].get("need_scaling") for i in self.models_idx
-        ):
-            logger.info("Scaling features...")
-            train_scaled, scaler_x, scalers_y = self.scale_data(self.train)
-            val_scaled, _, _ = self.scale_data(
-                self.val,
-                scaler_x=scaler_x,
-                scalers_y=scalers_y,
-            )
-            test_scaled, _, _ = self.scale_data(
-                self.test,
-                scaler_x=scaler_x,
-                scalers_y=scalers_y,
-            )
-        else:
-            train_scaled = None
-            val_scaled = None
-            test_scaled = None
-            scaler_x = None
-
-        # save data
-        joblib.dump(train_scaled, f"{self.data_dir}/train_scaled.pkl")
-        joblib.dump(val_scaled, f"{self.data_dir}/val_scaled.pkl")
-        joblib.dump(test_scaled, f"{self.data_dir}/test_scaled.pkl")
-        joblib.dump(scaler_x, f"{self.preprocessing_dir}/scaler_x.pkl")
-
-        data = {
-            "train": self.train,
-            "val": self.val,
-            "test": self.test,
-            "train_scaled": train_scaled,
-            "val_scaled": val_scaled,
-            "test_scaled": test_scaled,
-        }
-
-        # reshape data for time series
-        reshaped_data = None
-        if (
-            any(all_models[i].get("recurrent") for i in self.models_idx)
-            and self.time_series
-        ):
-            # reshaping data for recurrent models
-            logger.info("Reshaping data for recurrent models...")
-            reshaped_data = self.reshape_time_series(
-                train_scaled,
-                val_scaled,
-                test_scaled,
-                features=self.all_features,
-                timesteps=self.max_timesteps,
-            )
-
-        return data, reshaped_data
-
-    def inference(self):
-        # self.train is new data here
-        columns_to_keep = self.all_features
-        self.train = self.train[columns_to_keep]
-
-        scaler_x = joblib.load(f"{self.preprocessing_dir}/scaler_x.pkl")
-
-        if scaler_x:
-            scaled_data = scaler_x.transform(self.train)
-            scaled_data = pd.DataFrame(
-                scaled_data, columns=self.train.columns, index=self.train.index
-            )
-        else:
-            scaled_data = self.train
-
-        reshaped_data = None
-        if (
-            any(all_models[i].get("recurrent") for i in self.models_idx)
-            and self.time_series
-        ):
-            # we need to make sure we have max_timesteps of data after grouping by group_column
-            if (
-                self.group_column
-                and scaled_data.groupby(self.group_column).size().min()
-                < self.max_timesteps
-            ) or scaled_data.shape[0] < self.max_timesteps:
-                raise ValueError(
-                    f"Not enough data for group_column {self.group_column} to reshape data for recurrent models"
-                )
-
-            # reshaping data for recurrent models
-            logger.info("Reshaping data for recurrent models...")
-            reshaped_data = self.reshape_time_series(
-                scaled_data,
-                features=self.all_features,
-                timesteps=self.max_timesteps,
-            )
-
-        return self.train, scaled_data, reshaped_data
-
-    # scaling
-    def scale_data(
-        self,
-        df: pd.DataFrame,
-        scaler_x=None,
-        scalers_y: Optional[list] = None,
-    ):
-        logger.info("Scale data...")
-        X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
-
-        if scaler_x:
-            X_scaled = pd.DataFrame(
-                scaler_x.transform(X), columns=list(X.columns), index=X.index
-            )
-        else:
-            scaler_x = StandardScaler()  # MinMaxScaler(feature_range=(-1,1))
-            X_scaled = pd.DataFrame(
-                scaler_x.fit_transform(X), columns=list(X.columns), index=X.index
-            )
-
-        # Determine which targets need to be scaled
-        targets_numbers_to_scale = [
-            i for i in self.target_numbers if i not in self.target_clf
-        ]
-
-        # Dictionary to store scaled target data
-        scaled_targets = {}
-
-        if scalers_y:
-            for target_number in targets_numbers_to_scale:
-                y = df[[f"TARGET_{target_number}"]]
-                scaled_targets[target_number] = pd.DataFrame(
-                    scalers_y[f"scaler_y_{target_number}"].transform(y.values),
-                    columns=y.columns,
-                    index=y.index,
-                )
-        else:
-            scalers_y = {}
-            for target_number in targets_numbers_to_scale:
-                scaler_y = StandardScaler()
-                y = df[[f"TARGET_{target_number}"]]
-
-                scaled_y = pd.DataFrame(
-                    scaler_y.fit_transform(y.values),
-                    columns=y.columns,
-                    index=y.index,
-                )
-                target_dir = f"{self.experiment_dir}/TARGET_{target_number}"
-                joblib.dump(scaler_y, f"{target_dir}/scaler_y.pkl")
-
-                scalers_y[f"scaler_y_{target_number}"] = scaler_y
-                scaled_targets[target_number] = scaled_y
-
-        # Reconstruct y_scaled in the original order
-        y_scaled = pd.concat(
-            [
-                scaled_targets[target_number]
-                for target_number in targets_numbers_to_scale
-            ],
-            axis=1,
-        )
-        y_not_scaled = df[
-            df.columns.intersection([f"TARGET_{i}" for i in self.target_clf])
-        ]
-
-        # Ensure the final DataFrame keeps the original order
-        df_scaled = pd.concat(
-            [X_scaled, y_scaled, y_not_scaled],
-            axis=1,
-        )[
-            df.columns
-        ]  # Reorder columns to match original `df`
-
-        if not df_scaled.columns.equals(df.columns):
-            raise Exception("Columns are not in the same order after scaling.")
-
-        return df_scaled, scaler_x, scalers_y
-
-    # Reshape into 3D tensors for recurrent models
-    def reshape_time_series(
-        self,
-        train: pd.DataFrame,
-        val: pd.DataFrame,
-        test: pd.DataFrame,
-        features: list,
-        timesteps: int = 120,
-    ):
-        # always scale for recurrent layers : train should be scaled
-        group_column = self.group_column
-
-        target_columns = train.columns.intersection(
-            [f"TARGET_{i}" for i in self.target_numbers]
-        )
-
-        data = pd.concat([train, val, test], axis=0)
-
-        def reshape_df(df: pd.DataFrame, group_series: pd.Series, timesteps: int):
-            fill_value = [[[0] * len(df.columns)]]
-
-            def shiftsum(x, timesteps: int):
-                tmp = x.copy()
-                for i in range(1, timesteps):
-                    tmp = x.shift(i, fill_value=fill_value) + tmp
-                return tmp
-
-            logger.info("Grouping each feature in a unique column with list...")
-            df_reshaped = df.apply(list, axis=1).apply(lambda x: [list(x)])
-            df_reshaped = pd.concat([df_reshaped, group_series], axis=1)
-
-            logger.info("Grouping features and creating timesteps...")
-            df_reshaped = (
-                df_reshaped.groupby(group_column)[0]
-                .apply(lambda x: shiftsum(x, timesteps))
-                .reset_index(group_column, drop=True)
-                .rename("RECURRENT_FEATURES")
-            )
-            df_reshaped = pd.DataFrame(df_reshaped)
-
-            return df_reshaped
-
-        data_reshaped = reshape_df(data[features], data[group_column], timesteps)
-
-        data_reshaped[target_columns] = data[target_columns]
-
-        logger.info("Separating train, val, test data and creating np arrays...")
-        train_reshaped = data_reshaped.loc[train.index]
-        val_reshaped = data_reshaped.loc[val.index]
-        test_reshaped = data_reshaped.loc[test.index]
-
-        x_train_reshaped = np.array(
-            train_reshaped["RECURRENT_FEATURES"].values.tolist()
-        )
-        y_train_reshaped = np.array(train_reshaped[target_columns].reset_index())
-        x_val_reshaped = np.array(val_reshaped["RECURRENT_FEATURES"].values.tolist())
-        y_val_reshaped = np.array(val_reshaped[target_columns].reset_index())
-        x_test_reshaped = np.array(test_reshaped["RECURRENT_FEATURES"].values.tolist())
-        y_test_reshaped = np.array(test_reshaped[target_columns].reset_index())
-
-        reshaped_data = {
-            "x_train_reshaped": x_train_reshaped,
-            "y_train_reshaped": y_train_reshaped,
-            "x_val_reshaped": x_val_reshaped,
-            "y_val_reshaped": y_val_reshaped,
-            "x_test_reshaped": x_test_reshaped,
-            "y_test_reshaped": y_test_reshaped,
-        }
-
-        return reshaped_data
-
-
 # utils
 # TODO : can we use this to select the ideal number of features ?
 def feature_selection_analysis(feature_selection_id: int, n_components: int = 5):
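The deleted PreprocessModel class appears to find a new home per the file list: this release adds lecrapaud/model_preprocessing.py (+295 lines, not shown in this diff). Its core scaling pattern, one shared scaler for X plus one persisted StandardScaler per regression target, looks roughly like the sketch below; the function name and paths are illustrative, not the new module's actual API.

import joblib
import pandas as pd
from sklearn.preprocessing import StandardScaler

def scale_train(df: pd.DataFrame, target_numbers, target_clf, experiment_dir):
    # One shared scaler for all feature columns
    X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
    scaler_x = StandardScaler()
    X_scaled = pd.DataFrame(
        scaler_x.fit_transform(X), columns=X.columns, index=X.index
    )

    # One scaler per regression target, persisted next to that target's artifacts
    scalers_y = {}
    for n in (i for i in target_numbers if i not in target_clf):
        scaler_y = StandardScaler()
        y = df[[f"TARGET_{n}"]]
        scaled_y = pd.DataFrame(
            scaler_y.fit_transform(y.values), columns=y.columns, index=y.index
        )
        joblib.dump(scaler_y, f"{experiment_dir}/TARGET_{n}/scaler_y.pkl")
        scalers_y[f"scaler_y_{n}"] = scaler_y
        X_scaled[f"TARGET_{n}"] = scaled_y[f"TARGET_{n}"]
    return X_scaled, scaler_x, scalers_y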
lecrapaud/integrations/sentry_integration.py
ADDED
@@ -0,0 +1,46 @@
+import logging
+from importlib.metadata import version
+
+import sentry_sdk
+from sentry_sdk.integrations.logging import LoggingIntegration
+
+from lecrapaud.config import (
+    LOGGING_LEVEL,
+    PYTHON_ENV,
+    SENTRY_DSN,
+    SENTRY_PROFILES_SAMPLE_RATE,
+    SENTRY_TRACES_SAMPLE_RATE,
+)
+
+
+def _release_version():
+    try:
+        return f"lecrapaud@{version('lecrapaud')}"
+    except Exception:
+        return None
+
+
+def init_sentry():
+    """
+    Initialize Sentry if a DSN is configured.
+    Returns True when enabled, False otherwise.
+    """
+    if not SENTRY_DSN:
+        return False
+
+    sentry_logging = LoggingIntegration(
+        level=getattr(logging, LOGGING_LEVEL.upper(), logging.INFO),
+        event_level=logging.ERROR,
+    )
+
+    sentry_sdk.init(
+        dsn=SENTRY_DSN,
+        environment=PYTHON_ENV,
+        release=_release_version(),
+        integrations=[sentry_logging],
+        traces_sample_rate=SENTRY_TRACES_SAMPLE_RATE,
+        profiles_sample_rate=SENTRY_PROFILES_SAMPLE_RATE,
+        send_default_pii=False,
+    )
+
+    return True
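For callers, enabling this is a one-liner; a usage sketch follows. SENTRY_DSN and the sample rates are read from lecrapaud.config, so presumably they are set via the environment; that wiring is an assumption here.

# Opt-in at application startup; a no-op unless SENTRY_DSN is configured.
from lecrapaud.integrations.sentry_integration import init_sentry

if init_sentry():
    print("Sentry error reporting enabled")
else:
    print("SENTRY_DSN not set; running without Sentry")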
lecrapaud/misc/tabpfn_tests.ipynb
CHANGED
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": …,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -64,7 +64,7 @@
     "from sklearn.metrics import accuracy_score, roc_auc_score\n",
     "from sklearn.model_selection import train_test_split\n",
     "\n",
-    "from tabpfn import TabPFNClassifier\n",
+    "# from tabpfn import TabPFNClassifier\n",
     "\n",
     "# Load data\n",
     "X, y = load_breast_cancer(return_X_y=True)\n",