lecrapaud 0.19.0__py3-none-any.whl → 0.22.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. lecrapaud/__init__.py +22 -1
  2. lecrapaud/{api.py → base.py} +331 -241
  3. lecrapaud/config.py +15 -3
  4. lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +39 -0
  5. lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +264 -0
  6. lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +75 -0
  7. lecrapaud/db/models/__init__.py +2 -4
  8. lecrapaud/db/models/base.py +116 -65
  9. lecrapaud/db/models/experiment.py +195 -182
  10. lecrapaud/db/models/feature_selection.py +0 -3
  11. lecrapaud/db/models/feature_selection_rank.py +0 -18
  12. lecrapaud/db/models/model_selection.py +2 -2
  13. lecrapaud/db/models/{score.py → model_selection_score.py} +29 -12
  14. lecrapaud/db/session.py +4 -0
  15. lecrapaud/experiment.py +44 -17
  16. lecrapaud/feature_engineering.py +45 -674
  17. lecrapaud/feature_preprocessing.py +1202 -0
  18. lecrapaud/feature_selection.py +145 -332
  19. lecrapaud/integrations/sentry_integration.py +46 -0
  20. lecrapaud/misc/tabpfn_tests.ipynb +2 -2
  21. lecrapaud/mixins.py +247 -0
  22. lecrapaud/model_preprocessing.py +295 -0
  23. lecrapaud/model_selection.py +612 -242
  24. lecrapaud/pipeline.py +548 -0
  25. lecrapaud/search_space.py +2 -1
  26. lecrapaud/utils.py +36 -3
  27. lecrapaud-0.22.6.dist-info/METADATA +423 -0
  28. lecrapaud-0.22.6.dist-info/RECORD +51 -0
  29. {lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info}/WHEEL +1 -1
  30. {lecrapaud-0.19.0.dist-info → lecrapaud-0.22.6.dist-info/licenses}/LICENSE +1 -1
  31. lecrapaud/db/models/model_training.py +0 -64
  32. lecrapaud/jobs/__init__.py +0 -13
  33. lecrapaud/jobs/config.py +0 -17
  34. lecrapaud/jobs/scheduler.py +0 -30
  35. lecrapaud/jobs/tasks.py +0 -17
  36. lecrapaud-0.19.0.dist-info/METADATA +0 -249
  37. lecrapaud-0.19.0.dist-info/RECORD +0 -48
@@ -5,16 +5,12 @@ import matplotlib.pyplot as plt
  import seaborn as sns
  import os
  import time
- from typing import Optional
  from tqdm import tqdm
  import warnings
  from concurrent.futures import ProcessPoolExecutor, as_completed
  import joblib
- import re
  from pathlib import Path

- os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
-
  # feature selection
  from sklearn.feature_selection import (
      f_classif,
@@ -33,7 +29,6 @@ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
  from sklearn.model_selection import TimeSeriesSplit
  from sklearn.metrics import root_mean_squared_error, log_loss, make_scorer
  from mlxtend.feature_selection import SequentialFeatureSelector
- from sklearn.preprocessing import StandardScaler, MinMaxScaler
  from scipy.stats import spearmanr, kendalltau

  # Internal
@@ -48,6 +43,7 @@ from lecrapaud.db import (
      FeatureSelectionRank,
  )
  from lecrapaud.search_space import all_models
+ from lecrapaud.mixins import LeCrapaudEstimatorMixin

  # Annoying Warnings
  warnings.filterwarnings("ignore", category=FutureWarning)
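
The new import brings in LeCrapaudEstimatorMixin from the new lecrapaud/mixins.py (+247 lines, not part of this excerpt). Judging only from how FeatureSelector uses it below (the super().__init__ call that copies experiment.context parameters onto the instance, plus _validate_data, _set_fitted and _check_is_fitted), the contract looks roughly like the following sketch; this is an inference for orientation, not the actual mixins.py code:

    # Inferred sketch only -- the real implementation lives in lecrapaud/mixins.py and will differ
    class LeCrapaudEstimatorMixin:
        def __init__(self, experiment=None, **kwargs):
            self.experiment = experiment
            # copy experiment.context parameters onto the instance; explicit kwargs win
            params = {**(getattr(experiment, "context", None) or {}), **kwargs}
            for key, value in params.items():
                setattr(self, key, value)
            self._fitted = False

        def _validate_data(self, X, y=None):
            # placeholder: the real mixin presumably validates types/columns here
            return X, y

        def _set_fitted(self):
            self._fitted = True

        def _check_is_fitted(self):
            if not getattr(self, "_fitted", False):
                raise RuntimeError("Estimator is not fitted yet; call fit() first.")
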
@@ -72,39 +68,66 @@ def load_train_data(experiment_dir):
      return train, val, test, train_scaled, val_scaled, test_scaled


- class FeatureSelectionEngine:
-     def __init__(self, train, experiment, target_number, target_clf, **kwargs):
-         self.experiment = experiment
-         self.train = train
+ class FeatureSelector(LeCrapaudEstimatorMixin):
+     def __init__(self, experiment=None, target_number=None, **kwargs):
+         # The mixin will automatically set all experiment.context parameters as attributes
+         super().__init__(experiment=experiment, target_number=target_number, **kwargs)
+
+         # Set defaults for required parameters if not provided
+         if not hasattr(self, "target_clf"):
+             self.target_clf = []
+         if not hasattr(self, "max_p_value_categorical"):
+             self.max_p_value_categorical = 0.05
+         if not hasattr(self, "percentile"):
+             self.percentile = 20
+         if not hasattr(self, "corr_threshold"):
+             self.corr_threshold = 80
+         if not hasattr(self, "max_features"):
+             self.max_features = 50
+
          self.target_number = target_number
-         self.target_clf = target_clf

-         self.target_type = (
-             "classification" if self.target_number in self.target_clf else "regression"
-         )
-         self.percentile = self.experiment.percentile
-         self.corr_threshold = self.experiment.corr_threshold
-         self.max_features = self.experiment.max_features
+         # Derived attributes
+         if self.target_number is not None and hasattr(self, "target_clf"):
+             self.target_type = (
+                 "classification"
+                 if self.target_number in self.target_clf
+                 else "regression"
+             )

-         self.experiment_dir = self.experiment.path
-         self.experiment_id = self.experiment.id
-         self.data_dir = f"{self.experiment_dir}/data"
-         self.target_dir = f"{self.experiment_dir}/TARGET_{self.target_number}"
-         self.feature_selection_dir = f"{self.target_dir}/feature_selection"
-         os.makedirs(self.feature_selection_dir, exist_ok=True)
+         # Set paths if experiment is available
+         if self.experiment:
+             self.experiment_dir = self.experiment.path
+             self.experiment_id = self.experiment.id
+             self.data_dir = f"{self.experiment_dir}/data"
+             if self.target_number is not None:
+                 self.target_dir = f"{self.experiment_dir}/TARGET_{self.target_number}"
+                 self.feature_selection_dir = f"{self.target_dir}/feature_selection"
+                 os.makedirs(self.feature_selection_dir, exist_ok=True)

      # Main feature selection function
-     def run(
-         self,
-         single_process: bool = True,
-     ):
-         """Function to do feature selection with a range of different feature selection technics
+     def fit(self, X, y=None, single_process=True):
+         """
+         Fit the feature selector.

          Args:
-         - train (pd.DataFrame): a pandas train set
-         - target_number (in): a target, targets need to be name ``TARGET_{n}```
-         - single_process (bool): if True, run all feature selection methods in a single process. If False, run them in parallel.
+             X (pd.DataFrame): Input features
+             y: Target values (ignored, uses TARGET columns in X)
+             single_process (bool): if True, run all feature selection methods in a single process
+
+         Returns:
+             self: Returns self for chaining (sklearn convention)
          """
+         # Validate data
+         X, y = self._validate_data(X, y)
+
+         # Store train data
+         self.train = X
+
+         # Check that target_number is set
+         if self.target_number is None:
+             raise ValueError("target_number must be set before fitting")
+
          target_number = self.target_number
          target_type = self.target_type

@@ -115,7 +138,6 @@ class FeatureSelectionEngine:
          max_features = self.max_features

          feature_selection = FeatureSelection.upsert(
-             match_fields=["target_id", "experiment_id"],
              target_id=target.id,
              experiment_id=self.experiment_id,
          )
@@ -276,6 +298,58 @@ class FeatureSelectionEngine:

          features_selected_list = features_selected["features"].values.tolist()

+         # Save ensemble features for all numerical features with global ranking
+         logger.info(
+             "Saving ensemble features with global ranking for all numerical features..."
+         )
+         numerical_features_in_data = self.X_numerical.columns.tolist()
+         ensemble_rows = []
+
+         # Create global ranking for ALL numerical features (1 to n, no null values)
+         all_numerical_scores = pd.concat(results, axis=0)
+         all_numerical_scores = (
+             all_numerical_scores.groupby("features")
+             .agg({"rank": "mean"})  # Average rank across all methods
+             .reset_index()
+         )
+         all_numerical_scores.sort_values("rank", inplace=True)
+         all_numerical_scores["global_rank"] = range(1, len(all_numerical_scores) + 1)
+
+         for feature in numerical_features_in_data:
+             feature_id = feature_map.get(feature)
+             if feature_id:
+                 is_selected = feature in features_selected_list
+
+                 # Get global rank (no null values - all features get a rank)
+                 if feature in all_numerical_scores["features"].values:
+                     global_rank = all_numerical_scores[
+                         all_numerical_scores["features"] == feature
+                     ]["global_rank"].values[0]
+                 else:
+                     # Fallback: assign last rank + position for features not in results
+                     global_rank = (
+                         len(all_numerical_scores)
+                         + numerical_features_in_data.index(feature)
+                         + 1
+                     )
+
+                 ensemble_rows.append(
+                     {
+                         "feature_selection_id": feature_selection.id,
+                         "feature_id": feature_id,
+                         "method": "ensemble",
+                         "score": None,
+                         "pvalue": None,
+                         "support": (
+                             2 if is_selected else 0
+                         ),  # 2 = in aggregated features
+                         "rank": global_rank,
+                         "training_time": 0,
+                     }
+                 )
+
+         FeatureSelectionRank.bulk_upsert(rows=ensemble_rows)
+
          # analysis 1
          features_selected_by_every_methods = set(results[0]["features"].values.tolist())
          for df in results[1:]:
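
The new ensemble entries average each numerical feature's rank across all selection methods and then re-rank the averages into a dense 1..n global_rank. A self-contained toy illustration of that aggregation (plain pandas, not lecrapaud objects):

    import pandas as pd

    # toy per-method rankings for three features (method names are illustrative)
    results = [
        pd.DataFrame({"features": ["a", "b", "c"], "rank": [1, 2, 3]}),  # e.g. F-test
        pd.DataFrame({"features": ["a", "b", "c"], "rank": [2, 1, 3]}),  # e.g. mutual information
    ]

    scores = pd.concat(results, axis=0).groupby("features").agg({"rank": "mean"}).reset_index()
    scores.sort_values("rank", inplace=True)
    scores["global_rank"] = range(1, len(scores) + 1)
    print(scores)  # a and b tie at mean rank 1.5 and take global ranks 1 and 2; c gets 3
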
@@ -303,12 +377,30 @@ class FeatureSelectionEngine:
              header=True,
              index_label="ID",
          )
+
+         # Update support for features after correlation removal (before max)
+         logger.info("Updating ensemble features after correlation removal...")
+         for row in ensemble_rows:
+             feature = Feature.get(row["feature_id"]).name
+             if feature in features:
+                 row["support"] = 1  # 1 = survived correlation removal
+
          features = features[:max_features]

          # adding categorical features selected
          features += (
              categorical_features_selected if target_type == "classification" else []
          )
+
+         # Final update for features after max limitation (final selection)
+         logger.info("Finalizing ensemble features...")
+         for row in ensemble_rows:
+             feature = Feature.get(row["feature_id"]).name
+             if feature in features and row["support"] == 1:
+                 row["support"] = 2  # 2 = in final selection
+
+         # Re-save all ensemble data with updated support values
+         FeatureSelectionRank.bulk_upsert(rows=ensemble_rows)
          logger.debug(
              f"Final pre-selection: {len(features)} features below {corr_threshold}% out of {len(features_selected_list)} features, and rejected {len(features_correlated)} features, {100*len(features)/len(features_selected_list):.2f}% features selected"
          )
@@ -352,7 +444,20 @@ class FeatureSelectionEngine:
          feature_selection.best_features_path = best_features_path
          feature_selection.save()

-         return features
+         # Store selected features for later access
+         self.selected_features_ = features
+         self._set_fitted()
+         return self
+
+     def get_selected_features(self):
+         """
+         Get the list of selected features after fitting.
+
+         Returns:
+             list: Selected feature names
+         """
+         self._check_is_fitted()
+         return self.selected_features_

      # Remove correlation
      # ------------------
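
Together with the fit() rewrite above, this turns the old FeatureSelectionEngine.run() call into a scikit-learn-style fit/accessor pair. A hedged usage sketch; how the experiment object and the train frame are built depends on lecrapaud/experiment.py and lecrapaud/pipeline.py, which are not shown here:

    # hypothetical usage -- experiment and train_df construction are assumed, not shown in this diff
    from lecrapaud.feature_selection import FeatureSelector

    selector = FeatureSelector(experiment=experiment, target_number=1)
    selector.fit(train_df)  # train_df holds the features plus the TARGET_* columns
    selected = selector.get_selected_features()  # list of selected feature names
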
@@ -441,13 +546,20 @@ class FeatureSelectionEngine:
          feat_scores["features"] = X.columns
          feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
          feat_scores["method"] = "Chi2"
+
+         # Apply both percentile and p-value filtering
+         # Keep features that satisfy BOTH conditions: within percentile AND p-value < threshold
+         feat_scores["support"] = feat_scores["support"] & (
+             feat_scores["pvalue"] <= self.max_p_value_categorical
+         )
+
          feat_scores.sort_values("rank", ascending=True, inplace=True)
          stop = time.time()
          training_time = timedelta(seconds=(stop - start)).total_seconds()
          feat_scores["training_time"] = training_time

          logger.debug(
-             f"Chi2 evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
+             f"Chi2 evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds (percentile={percentile}%, p-value<={self.max_p_value_categorical})"
          )

          feat_scores.to_csv(
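
The Chi2 change keeps a categorical feature only if it both falls inside the score percentile and has a p-value at or below max_p_value_categorical (default 0.05 per the constructor above). A standalone sketch of that two-condition filter with scikit-learn primitives, on toy data (column names and thresholds are illustrative only):

    import numpy as np
    import pandas as pd
    from sklearn.feature_selection import SelectPercentile, chi2

    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.integers(0, 3, size=(200, 5)), columns=[f"cat_{i}" for i in range(5)])
    y = rng.integers(0, 2, size=200)

    selector = SelectPercentile(chi2, percentile=20).fit(X, y)
    feat_scores = pd.DataFrame(
        {
            "features": X.columns,
            "score": selector.scores_,
            "pvalue": selector.pvalues_,
            "support": selector.get_support(),
        }
    )
    # keep a feature only if it is in the top percentile AND statistically significant
    feat_scores["support"] = feat_scores["support"] & (feat_scores["pvalue"] <= 0.05)
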
@@ -796,305 +908,6 @@ class FeatureSelectionEngine:
          return feat_scores


- class PreprocessModel:
-
-     def __init__(
-         self,
-         train,
-         val,
-         test,
-         experiment,
-         target_numbers,
-         target_clf,
-         models_idx,
-         time_series,
-         max_timesteps,
-         group_column,
-         date_column,
-         **kwargs,
-     ):
-         self.train = train
-         self.val = val
-         self.test = test
-         self.experiment = experiment
-         self.target_numbers = target_numbers
-         self.target_clf = target_clf
-         self.models_idx = models_idx
-         self.time_series = time_series
-         self.max_timesteps = max_timesteps
-         self.group_column = group_column
-         self.date_column = date_column
-
-         self.experiment_dir = experiment.path
-         self.data_dir = f"{self.experiment_dir}/data"
-         self.preprocessing_dir = f"{self.experiment_dir}/preprocessing"
-
-         self.all_features = experiment.get_all_features(
-             date_column=date_column, group_column=group_column
-         )
-
-     def run(self):
-         # save data
-         columns_to_keep = self.all_features + [
-             f"TARGET_{i}" for i in self.target_numbers
-         ]
-         duplicates = [
-             col for col in set(columns_to_keep) if columns_to_keep.count(col) > 1
-         ]
-         if duplicates:
-             raise ValueError(f"Doublons détectés dans columns_to_keep: {duplicates}")
-         self.train = self.train[columns_to_keep]
-         self.val = self.val[columns_to_keep]
-         self.test = self.test[columns_to_keep]
-         joblib.dump(self.train, f"{self.data_dir}/train.pkl")
-         joblib.dump(self.val, f"{self.data_dir}/val.pkl")
-         joblib.dump(self.test, f"{self.data_dir}/test.pkl")
-
-         # scaling features
-         if any(t not in self.target_clf for t in self.target_numbers) and any(
-             all_models[i].get("need_scaling") for i in self.models_idx
-         ):
-             logger.info("Scaling features...")
-             train_scaled, scaler_x, scalers_y = self.scale_data(self.train)
-             val_scaled, _, _ = self.scale_data(
-                 self.val,
-                 scaler_x=scaler_x,
-                 scalers_y=scalers_y,
-             )
-             test_scaled, _, _ = self.scale_data(
-                 self.test,
-                 scaler_x=scaler_x,
-                 scalers_y=scalers_y,
-             )
-         else:
-             train_scaled = None
-             val_scaled = None
-             test_scaled = None
-             scaler_x = None
-
-         # save data
-         joblib.dump(train_scaled, f"{self.data_dir}/train_scaled.pkl")
-         joblib.dump(val_scaled, f"{self.data_dir}/val_scaled.pkl")
-         joblib.dump(test_scaled, f"{self.data_dir}/test_scaled.pkl")
-         joblib.dump(scaler_x, f"{self.preprocessing_dir}/scaler_x.pkl")
-
-         data = {
-             "train": self.train,
-             "val": self.val,
-             "test": self.test,
-             "train_scaled": train_scaled,
-             "val_scaled": val_scaled,
-             "test_scaled": test_scaled,
-         }
-
-         # reshape data for time series
-         reshaped_data = None
-         if (
-             any(all_models[i].get("recurrent") for i in self.models_idx)
-             and self.time_series
-         ):
-             # reshaping data for recurrent models
-             logger.info("Reshaping data for recurrent models...")
-             reshaped_data = self.reshape_time_series(
-                 train_scaled,
-                 val_scaled,
-                 test_scaled,
-                 features=self.all_features,
-                 timesteps=self.max_timesteps,
-             )
-
-         return data, reshaped_data
-
-     def inference(self):
-         # self.train is new data here
-         columns_to_keep = self.all_features
-         self.train = self.train[columns_to_keep]
-
-         scaler_x = joblib.load(f"{self.preprocessing_dir}/scaler_x.pkl")
-
-         if scaler_x:
-             scaled_data = scaler_x.transform(self.train)
-             scaled_data = pd.DataFrame(
-                 scaled_data, columns=self.train.columns, index=self.train.index
-             )
-         else:
-             scaled_data = self.train
-
-         reshaped_data = None
-         if (
-             any(all_models[i].get("recurrent") for i in self.models_idx)
-             and self.time_series
-         ):
-             # we need to make sur we have max_timesteps of data after grouping by group_column
-             if (
-                 self.group_column
-                 and scaled_data.groupby(self.group_column).size().min()
-                 < self.max_timesteps
-             ) or scaled_data.shape[0] < self.max_timesteps:
-                 raise ValueError(
-                     f"Not enough data for group_column {self.group_column} to reshape data for recurrent models"
-                 )
-
-             # reshaping data for recurrent models
-             logger.info("Reshaping data for recurrent models...")
-             reshaped_data = self.reshape_time_series(
-                 scaled_data,
-                 features=self.all_features,
-                 timesteps=self.max_timesteps,
-             )
-
-         return self.train, scaled_data, reshaped_data
-
-     # scaling
-     def scale_data(
-         self,
-         df: pd.DataFrame,
-         scaler_x=None,
-         scalers_y: Optional[list] = None,
-     ):
-         logger.info("Scale data...")
-         X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
-
-         if scaler_x:
-             X_scaled = pd.DataFrame(
-                 scaler_x.transform(X), columns=list(X.columns), index=X.index
-             )
-         else:
-             scaler_x = StandardScaler()  # MinMaxScaler(feature_range=(-1,1))
-             X_scaled = pd.DataFrame(
-                 scaler_x.fit_transform(X), columns=list(X.columns), index=X.index
-             )
-
-         # Determine which targets need to be scaled
-         targets_numbers_to_scale = [
-             i for i in self.target_numbers if i not in self.target_clf
-         ]
-
-         # Dictionary to store scaled target data
-         scaled_targets = {}
-
-         if scalers_y:
-             for target_number in targets_numbers_to_scale:
-                 y = df[[f"TARGET_{target_number}"]]
-                 scaled_targets[target_number] = pd.DataFrame(
-                     scalers_y[f"scaler_y_{target_number}"].transform(y.values),
-                     columns=y.columns,
-                     index=y.index,
-                 )
-         else:
-             scalers_y = {}
-             for target_number in targets_numbers_to_scale:
-                 scaler_y = StandardScaler()
-                 y = df[[f"TARGET_{target_number}"]]
-
-                 scaled_y = pd.DataFrame(
-                     scaler_y.fit_transform(y.values),
-                     columns=y.columns,
-                     index=y.index,
-                 )
-                 target_dir = f"{self.experiment_dir}/TARGET_{target_number}"
-                 joblib.dump(scaler_y, f"{target_dir}/scaler_y.pkl")
-
-                 scalers_y[f"scaler_y_{target_number}"] = scaler_y
-                 scaled_targets[target_number] = scaled_y
-
-         # Reconstruct y_scaled in the original order
-         y_scaled = pd.concat(
-             [
-                 scaled_targets[target_number]
-                 for target_number in targets_numbers_to_scale
-             ],
-             axis=1,
-         )
-         y_not_scaled = df[
-             df.columns.intersection([f"TARGET_{i}" for i in self.target_clf])
-         ]
-
-         # Ensure the final DataFrame keeps the original order
-         df_scaled = pd.concat(
-             [X_scaled, y_scaled, y_not_scaled],
-             axis=1,
-         )[
-             df.columns
-         ]  # Reorder columns to match original `df`
-
-         if not df_scaled.columns.equals(df.columns):
-             raise Exception("Columns are not in the same order after scaling.")
-
-         return df_scaled, scaler_x, scalers_y
-
-     # Reshape into 3D tensors for recurrent models
-     def reshape_time_series(
-         self,
-         train: pd.DataFrame,
-         val: pd.DataFrame,
-         test: pd.DataFrame,
-         features: list,
-         timesteps: int = 120,
-     ):
-         # always scale for recurrent layers : train should be scaled
-         group_column = self.group_column
-
-         target_columns = train.columns.intersection(
-             [f"TARGET_{i}" for i in self.target_numbers]
-         )
-
-         data = pd.concat([train, val, test], axis=0)
-
-         def reshape_df(df: pd.DataFrame, group_series: pd.Series, timesteps: int):
-             fill_value = [[[0] * len(df.columns)]]
-
-             def shiftsum(x, timesteps: int):
-                 tmp = x.copy()
-                 for i in range(1, timesteps):
-                     tmp = x.shift(i, fill_value=fill_value) + tmp
-                 return tmp
-
-             logger.info("Grouping each feature in a unique column with list...")
-             df_reshaped = df.apply(list, axis=1).apply(lambda x: [list(x)])
-             df_reshaped = pd.concat([df_reshaped, group_series], axis=1)
-
-             logger.info("Grouping features and creating timesteps...")
-             df_reshaped = (
-                 df_reshaped.groupby(group_column)[0]
-                 .apply(lambda x: shiftsum(x, timesteps))
-                 .reset_index(group_column, drop=True)
-                 .rename("RECURRENT_FEATURES")
-             )
-             df_reshaped = pd.DataFrame(df_reshaped)
-
-             return df_reshaped
-
-         data_reshaped = reshape_df(data[features], data[group_column], timesteps)
-
-         data_reshaped[target_columns] = data[target_columns]
-
-         logger.info("Separating train, val, test data and creating np arrays...")
-         train_reshaped = data_reshaped.loc[train.index]
-         val_reshaped = data_reshaped.loc[val.index]
-         test_reshaped = data_reshaped.loc[test.index]
-
-         x_train_reshaped = np.array(
-             train_reshaped["RECURRENT_FEATURES"].values.tolist()
-         )
-         y_train_reshaped = np.array(train_reshaped[target_columns].reset_index())
-         x_val_reshaped = np.array(val_reshaped["RECURRENT_FEATURES"].values.tolist())
-         y_val_reshaped = np.array(val_reshaped[target_columns].reset_index())
-         x_test_reshaped = np.array(test_reshaped["RECURRENT_FEATURES"].values.tolist())
-         y_test_reshaped = np.array(test_reshaped[target_columns].reset_index())
-
-         reshaped_data = {
-             "x_train_reshaped": x_train_reshaped,
-             "y_train_reshaped": y_train_reshaped,
-             "x_val_reshaped": x_val_reshaped,
-             "y_val_reshaped": y_val_reshaped,
-             "x_test_reshaped": x_test_reshaped,
-             "y_test_reshaped": y_test_reshaped,
-         }
-
-         return reshaped_data
-
-
  # utils
  # TODO : can we use this to select the ideal number of features ?
  def feature_selection_analysis(feature_selection_id: int, n_components: int = 5):
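
The removed PreprocessModel (apparently superseded by the new lecrapaud/model_preprocessing.py, +295 lines, not shown here) built recurrent-model inputs by repeatedly shifting list-wrapped rows (shiftsum) so that every row carries its previous timesteps feature vectors. For readers following the change, a minimal standalone sketch of the same windowing idea with NumPy; unlike shiftsum it drops the first timesteps - 1 rows instead of zero-padding them, and it is not lecrapaud code:

    import numpy as np
    from numpy.lib.stride_tricks import sliding_window_view

    def build_windows(features_2d: np.ndarray, timesteps: int) -> np.ndarray:
        # sliding_window_view slides along axis 0 and appends the window axis last,
        # i.e. (n_windows, n_features, timesteps); move timesteps in front of features
        windows = sliding_window_view(features_2d, timesteps, axis=0)
        return np.moveaxis(windows, -1, 1)

    x = np.arange(18, dtype=float).reshape(6, 3)  # 6 rows, 3 features
    print(build_windows(x, timesteps=4).shape)  # (3, 4, 3): windows, timesteps, features
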
@@ -0,0 +1,46 @@
+ import logging
+ from importlib.metadata import version
+
+ import sentry_sdk
+ from sentry_sdk.integrations.logging import LoggingIntegration
+
+ from lecrapaud.config import (
+     LOGGING_LEVEL,
+     PYTHON_ENV,
+     SENTRY_DSN,
+     SENTRY_PROFILES_SAMPLE_RATE,
+     SENTRY_TRACES_SAMPLE_RATE,
+ )
+
+
+ def _release_version():
+     try:
+         return f"lecrapaud@{version('lecrapaud')}"
+     except Exception:
+         return None
+
+
+ def init_sentry():
+     """
+     Initialize Sentry if a DSN is configured.
+     Returns True when enabled, False otherwise.
+     """
+     if not SENTRY_DSN:
+         return False
+
+     sentry_logging = LoggingIntegration(
+         level=getattr(logging, LOGGING_LEVEL.upper(), logging.INFO),
+         event_level=logging.ERROR,
+     )
+
+     sentry_sdk.init(
+         dsn=SENTRY_DSN,
+         environment=PYTHON_ENV,
+         release=_release_version(),
+         integrations=[sentry_logging],
+         traces_sample_rate=SENTRY_TRACES_SAMPLE_RATE,
+         profiles_sample_rate=SENTRY_PROFILES_SAMPLE_RATE,
+         send_default_pii=False,
+     )
+
+     return True
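
init_sentry() is a no-op unless SENTRY_DSN is configured, so error reporting stays opt-in. A minimal usage sketch, assuming the SENTRY_* values are supplied through whatever environment lecrapaud.config reads:

    # hypothetical application startup; configuration comes from lecrapaud.config
    from lecrapaud.integrations.sentry_integration import init_sentry

    if init_sentry():
        print("Sentry error reporting enabled")
    else:
        print("SENTRY_DSN not configured; Sentry disabled")
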
@@ -2,7 +2,7 @@
  "cells": [
  {
  "cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
  "metadata": {},
  "outputs": [
  {
@@ -64,7 +64,7 @@
  "from sklearn.metrics import accuracy_score, roc_auc_score\n",
  "from sklearn.model_selection import train_test_split\n",
  "\n",
- "from tabpfn import TabPFNClassifier\n",
+ "# from tabpfn import TabPFNClassifier\n",
  "\n",
  "# Load data\n",
  "X, y = load_breast_cancer(return_X_y=True)\n",