lecrapaud 0.21.0__tar.gz → 0.21.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lecrapaud might be problematic.

Files changed (50)
  1. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/PKG-INFO +18 -5
  2. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/README.md +17 -4
  3. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/config.py +1 -1
  4. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/feature_engineering.py +189 -3
  5. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/feature_selection.py +20 -28
  6. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/model_selection.py +11 -8
  7. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/pyproject.toml +1 -1
  8. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/LICENSE +0 -0
  9. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/__init__.py +0 -0
  10. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/api.py +0 -0
  11. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/db/__init__.py +0 -0
  12. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/db/alembic/README +0 -0
  13. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/db/alembic/env.py +0 -0
  14. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/db/alembic/script.py.mako +0 -0
  15. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/db/alembic/versions/2025_06_23_1748-f089dfb7e3ba_.py +0 -0
  16. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/db/alembic/versions/2025_06_24_1216-c62251b129ed_.py +0 -0
  17. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/db/alembic/versions/2025_06_24_1711-86457e2f333f_.py +0 -0
  18. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/db/alembic/versions/2025_06_25_1759-72aa496ca65b_.py +0 -0
  19. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/db/alembic/versions/2025_08_25_1434-7ed9963e732f_add_best_score_to_model_selection.py +0 -0
  20. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/db/alembic/versions/2025_08_28_1516-c36e9fee22b9_add_avg_precision_to_score.py +0 -0
  21. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/db/alembic/versions/2025_08_28_1622-8b11c1ba982e_change_name_column.py +0 -0
  22. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +0 -0
  23. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +0 -0
  24. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +0 -0
  25. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/db/alembic.ini +0 -0
  26. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/db/models/__init__.py +0 -0
  27. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/db/models/base.py +0 -0
  28. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/db/models/experiment.py +0 -0
  29. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/db/models/feature.py +0 -0
  30. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/db/models/feature_selection.py +0 -0
  31. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/db/models/feature_selection_rank.py +0 -0
  32. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/db/models/model.py +0 -0
  33. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/db/models/model_selection.py +0 -0
  34. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/db/models/model_selection_score.py +0 -0
  35. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/db/models/target.py +0 -0
  36. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/db/models/utils.py +0 -0
  37. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/db/session.py +0 -0
  38. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/directories.py +0 -0
  39. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/experiment.py +0 -0
  40. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/integrations/openai_integration.py +0 -0
  41. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/jobs/__init__.py +0 -0
  42. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/jobs/config.py +0 -0
  43. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/jobs/scheduler.py +0 -0
  44. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/jobs/tasks.py +0 -0
  45. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/misc/tabpfn_tests.ipynb +0 -0
  46. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/misc/test-gpu-bilstm.ipynb +0 -0
  47. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/misc/test-gpu-resnet.ipynb +0 -0
  48. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/misc/test-gpu-transformers.ipynb +0 -0
  49. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/search_space.py +0 -0
  50. {lecrapaud-0.21.0 → lecrapaud-0.21.2}/lecrapaud/utils.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lecrapaud
-Version: 0.21.0
+Version: 0.21.2
 Summary: Framework for machine and deep learning, with regression, classification and time series analysis
 License: Apache License
 License-File: LICENSE
@@ -218,7 +218,11 @@ context = {
     "val_size": 0.2,
     "test_size": 0.2,
     "pca_temporal": [
-        {"name": "LAST_20_RET", "columns": [f"RET_-{i}" for i in range(1, 21)]},
+        # Old format (still supported)
+        # {"name": "LAST_20_RET", "columns": [f"RET_-{i}" for i in range(1, 21)]},
+        # New simplified format - automatically creates lag columns
+        {"name": "LAST_20_RET", "column": "RET", "lags": 20},
+        {"name": "LAST_10_VOL", "column": "VOLUME", "lags": 10},
     ],
     "pca_cross_sectional": [
         {
@@ -255,11 +259,20 @@ experiment = app.create_experiment(data=your_dataframe, **context)
 
 2. **Parameter Precedence**: When loading an existing experiment, the stored context takes precedence over any parameters passed to the constructor.
 
-3. **PCA Time Series**: For time series data with `pca_cross_sectional` where index equals `date_column`, the system automatically uses an expanding window approach to prevent data leakage.
+3. **PCA Time Series**:
+   - For time series data, both `pca_cross_sectional` and `pca_temporal` automatically use an expanding window approach with periodic refresh (default: every 90 days) to prevent data leakage.
+   - The system fits PCA only on historical data (lookback window of 365 days by default) and avoids look-ahead bias.
+   - For panel data (e.g., multiple stocks), lag features are created per group when using the simplified `pca_temporal` format.
+   - Missing PCA values are handled with forward-fill followed by zero-fill to ensure compatibility with downstream models.
 
-4. **OpenAI Embeddings**: If using `columns_pca` with text columns, ensure `OPENAI_API_KEY` is set as an environment variable.
+4. **PCA Temporal Simplified Format**:
+   - Instead of manually listing lag columns: `{"name": "LAST_20_RET", "columns": ["RET_-1", "RET_-2", ..., "RET_-20"]}`
+   - Use the simplified format: `{"name": "LAST_20_RET", "column": "RET", "lags": 20}`
+   - The system automatically creates the lag columns, handling panel data correctly with `group_column`.
 
-5. **Model Indices**: The `models_idx` parameter accepts both integer indices and string names (e.g., `'xgb'`, `'lgb'`, `'catboost'`).
+5. **OpenAI Embeddings**: If using `columns_pca` with text columns, ensure `OPENAI_API_KEY` is set as an environment variable.
+
+6. **Model Indices**: The `models_idx` parameter accepts both integer indices and string names (e.g., `'xgb'`, `'lgb'`, `'catboost'`).
 
 
@@ -179,7 +179,11 @@ context = {
     "val_size": 0.2,
     "test_size": 0.2,
     "pca_temporal": [
-        {"name": "LAST_20_RET", "columns": [f"RET_-{i}" for i in range(1, 21)]},
+        # Old format (still supported)
+        # {"name": "LAST_20_RET", "columns": [f"RET_-{i}" for i in range(1, 21)]},
+        # New simplified format - automatically creates lag columns
+        {"name": "LAST_20_RET", "column": "RET", "lags": 20},
+        {"name": "LAST_10_VOL", "column": "VOLUME", "lags": 10},
     ],
     "pca_cross_sectional": [
         {
@@ -216,11 +220,20 @@ experiment = app.create_experiment(data=your_dataframe, **context)
 
 2. **Parameter Precedence**: When loading an existing experiment, the stored context takes precedence over any parameters passed to the constructor.
 
-3. **PCA Time Series**: For time series data with `pca_cross_sectional` where index equals `date_column`, the system automatically uses an expanding window approach to prevent data leakage.
+3. **PCA Time Series**:
+   - For time series data, both `pca_cross_sectional` and `pca_temporal` automatically use an expanding window approach with periodic refresh (default: every 90 days) to prevent data leakage.
+   - The system fits PCA only on historical data (lookback window of 365 days by default) and avoids look-ahead bias.
+   - For panel data (e.g., multiple stocks), lag features are created per group when using the simplified `pca_temporal` format.
+   - Missing PCA values are handled with forward-fill followed by zero-fill to ensure compatibility with downstream models.
 
-4. **OpenAI Embeddings**: If using `columns_pca` with text columns, ensure `OPENAI_API_KEY` is set as an environment variable.
+4. **PCA Temporal Simplified Format**:
+   - Instead of manually listing lag columns: `{"name": "LAST_20_RET", "columns": ["RET_-1", "RET_-2", ..., "RET_-20"]}`
+   - Use the simplified format: `{"name": "LAST_20_RET", "column": "RET", "lags": 20}`
+   - The system automatically creates the lag columns, handling panel data correctly with `group_column`.
 
-5. **Model Indices**: The `models_idx` parameter accepts both integer indices and string names (e.g., `'xgb'`, `'lgb'`, `'catboost'`).
+5. **OpenAI Embeddings**: If using `columns_pca` with text columns, ensure `OPENAI_API_KEY` is set as an environment variable.
+
+6. **Model Indices**: The `models_idx` parameter accepts both integer indices and string names (e.g., `'xgb'`, `'lgb'`, `'catboost'`).
 
 
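The two formats are equivalent; as a quick illustration, a minimal sketch in plain Python (the expansion mirrors the lag-column construction added to feature_engineering.py further down in this diff):

    # New simplified entry
    simplified = {"name": "LAST_20_RET", "column": "RET", "lags": 20}

    # ...expands to the old explicit form
    explicit = {
        "name": "LAST_20_RET",
        "columns": [f"RET_-{i}" for i in range(1, 21)],  # RET_-1 ... RET_-20
    }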
@@ -34,5 +34,5 @@ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 LECRAPAUD_LOGFILE = os.getenv("LECRAPAUD_LOGFILE")
 LECRAPAUD_TABLE_PREFIX = os.getenv("LECRAPAUD_TABLE_PREFIX", "lecrapaud")
 LECRAPAUD_OPTIMIZATION_BACKEND = os.getenv(
-    "LECRAPAUD_OPTIMIZATION_BACKEND", "ray"
+    "LECRAPAUD_OPTIMIZATION_BACKEND", "hyperopt"
 ).lower()
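The default optimization backend thus changes from Ray Tune to HyperOpt. A small sketch of opting back into Ray, assuming (as the `os.getenv` call above implies) the value is read once at import time:

    import os

    # Must be set before lecrapaud is imported, since config.py reads it at import time
    os.environ["LECRAPAUD_OPTIMIZATION_BACKEND"] = "ray"

    import lecrapaud  # any value other than "ray" or "hyperopt" now raises ValueError
                      # (see the model_selection.py hunk below)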
@@ -605,7 +605,7 @@ class PreprocessFeature:
 
         return df, pcas_dict
 
-    def add_pca_feature_cross_sectional(
+    def add_pca_feature_cross_sectional_old(
         self,
         df: pd.DataFrame,
         *,
@@ -657,7 +657,7 @@ class PreprocessFeature:
 
         return df, pcas_dict
 
-    def add_pca_feature_cross_sectional_time_series(
+    def add_pca_feature_cross_sectional(
         self,
         df: pd.DataFrame,
         *,
@@ -840,6 +840,11 @@ class PreprocessFeature:
                 # Merge the scores
                 df = df.merge(scores_df, on=index_col, how="left")
                 df.index = index_saved
+
+                # Forward-fill then 0 to avoid NaN
+                pca_cols = [col for col in df.columns if col.startswith(prefix)]
+                df[pca_cols] = df[pca_cols].fillna(method='ffill').fillna(0)
+
                 pcas_dict.update({name: pipe})
 
             else:
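The fill chain behaves as in this standalone sketch; note that `fillna(method='ffill')` is deprecated in pandas >= 2.1, so `.ffill()` is the forward-compatible spelling of the same operation:

    import pandas as pd

    s = pd.Series([float("nan"), 1.0, float("nan"), 2.0, float("nan")])

    # Same result as s.fillna(method='ffill').fillna(0)
    filled = s.ffill().fillna(0)
    print(filled.tolist())  # [0.0, 1.0, 1.0, 2.0, 2.0]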
@@ -873,7 +878,7 @@ class PreprocessFeature:
 
         return df, pcas_dict
 
     # ----------------- 2) TEMPORAL PCA (list of lag columns) ----------------
-    def add_pca_feature_temporal(
+    def add_pca_feature_temporal_old(
         self,
         df: pd.DataFrame,
         *,
@@ -936,6 +941,187 @@ class PreprocessFeature:
 
         return df, pcas_dict
 
+    def add_pca_feature_temporal(
+        self,
+        df: pd.DataFrame,
+        *,
+        n_components: int = 5,
+        pcas: dict[str, Pipeline] | None = None,
+        impute_strategy: str = "median",
+        standardize: bool = True,
+        lookback_days: int = 365,
+        refresh_frequency: int = 90,
+    ) -> tuple[pd.DataFrame, dict[str, Pipeline]]:
+        """
+        Temporal PCA for time series, with panel data support.
+        Automatically creates the lag columns and avoids look-ahead bias.
+
+        Simplified pca_temporal format:
+        [{"name": "LAST_20_RET", "column": "RET", "lags": 20}]
+        """
+        pcas_dict = {}
+
+        for pca_config in self.pca_temporal:
+            # Support both old and new format
+            if "columns" in pca_config:
+                # Old format: use existing columns
+                name = pca_config["name"]
+                lag_columns = pca_config["columns"]
+                base_column = None
+                num_lags = len(lag_columns)
+            else:
+                # New format: create lag columns
+                name = pca_config["name"]
+                base_column = pca_config["column"].upper()
+                num_lags = pca_config.get("lags", 20)
+
+                # Create lag columns if they don't exist
+                if self.group_column:
+                    # Panel data: create lags by group
+                    for lag in range(1, num_lags + 1):
+                        lag_col = f"{base_column}_-{lag}"
+                        if lag_col not in df.columns:
+                            df[lag_col] = df.groupby(self.group_column)[base_column].shift(lag)
+                else:
+                    # Simple time series
+                    for lag in range(1, num_lags + 1):
+                        lag_col = f"{base_column}_-{lag}"
+                        if lag_col not in df.columns:
+                            df[lag_col] = df[base_column].shift(lag)
+
+                lag_columns = [f"{base_column}_-{i}" for i in range(1, num_lags + 1)]
+
+            prefix = f"TMP_PC_{name}"
+
+            # For time series: avoid look-ahead bias
+            if self.time_series and self.date_column:
+                all_scores = []
+                unique_dates = sorted(df[self.date_column].unique())
+
+                if pcas is not None:
+                    # Inference: use provided PCA
+                    pipe = pcas[name]
+
+                    # Apply to all data at once
+                    mask = df[lag_columns].notna().all(axis=1)
+                    if mask.any():
+                        X_transform = df.loc[mask, lag_columns]
+                        scores = pipe.transform(X_transform)
+
+                        for i in range(n_components):
+                            df.loc[mask, f"{prefix}_{i}"] = scores[:, i]
+
+                    # Fill NaN with forward fill then 0
+                    pca_cols = [f"{prefix}_{i}" for i in range(n_components)]
+                    df[pca_cols] = df[pca_cols].fillna(method='ffill').fillna(0)
+
+                else:
+                    # Training: expanding window with periodic refresh
+                    pipe = None
+                    last_fit_date = None
+
+                    for current_date_ordinal in unique_dates:
+                        current_date = pd.Timestamp.fromordinal(int(current_date_ordinal))
+
+                        # Determine if we should refit
+                        should_refit = pipe is None or (
+                            last_fit_date is not None
+                            and (current_date - last_fit_date).days >= refresh_frequency
+                        )
+
+                        if should_refit and len(df[df[self.date_column] < current_date_ordinal]) > num_lags * 2:
+                            # Get historical data for fitting
+                            lookback_start = current_date - pd.Timedelta(days=lookback_days)
+                            lookback_start_ordinal = pd.Timestamp.toordinal(lookback_start)
+
+                            mask_fit = (
+                                (df[self.date_column] >= lookback_start_ordinal) &
+                                (df[self.date_column] < current_date_ordinal) &
+                                df[lag_columns].notna().all(axis=1)
+                            )
+
+                            if mask_fit.sum() >= n_components:
+                                X_fit = df.loc[mask_fit, lag_columns]
+
+                                # Create pipeline
+                                steps = []
+                                if impute_strategy is not None:
+                                    steps.append(("imputer", SimpleImputer(strategy=impute_strategy)))
+                                if standardize:
+                                    steps.append(("scaler", StandardScaler()))
+                                steps.append(("pca", PCA(n_components=n_components, random_state=0)))
+
+                                pipe = Pipeline(steps)
+                                pipe.fit(X_fit)
+                                last_fit_date = current_date
+
+                                logger.debug(
+                                    f"Temporal PCA {name} refitted at {current_date.strftime('%Y-%m-%d')} "
+                                    f"using {len(X_fit)} samples"
+                                )
+
+                        # Transform current date data
+                        if pipe is not None:
+                            mask_current = (
+                                (df[self.date_column] == current_date_ordinal) &
+                                df[lag_columns].notna().all(axis=1)
+                            )
+
+                            if mask_current.any():
+                                X_current = df.loc[mask_current, lag_columns]
+                                scores = pipe.transform(X_current)
+
+                                for i in range(n_components):
+                                    df.loc[mask_current, f"{prefix}_{i}"] = scores[:, i]
+
+                    # Fill NaN with forward fill then 0
+                    pca_cols = [f"{prefix}_{i}" for i in range(n_components)]
+                    for col in pca_cols:
+                        if col not in df.columns:
+                            df[col] = 0
+                    df[pca_cols] = df[pca_cols].fillna(method='ffill').fillna(0)
+
+                pcas_dict[name] = pipe
+
+            else:
+                # Non time-series: use original approach
+                mask = df[lag_columns].notna().all(axis=1)
+
+                if pcas is None and mask.any():
+                    X_fit = df.loc[mask, lag_columns]
+
+                    steps = []
+                    if impute_strategy is not None:
+                        steps.append(("imputer", SimpleImputer(strategy=impute_strategy)))
+                    if standardize:
+                        steps.append(("scaler", StandardScaler()))
+                    steps.append(("pca", PCA(n_components=n_components, random_state=0)))
+
+                    pipe = Pipeline(steps)
+                    pipe.fit(X_fit)
+                    pcas_dict[name] = pipe
+                elif pcas is not None:
+                    pipe = pcas[name]
+                    pcas_dict[name] = pipe
+                else:
+                    continue
+
+                if mask.any():
+                    X_transform = df.loc[mask, lag_columns]
+                    scores = pipe.transform(X_transform)
+
+                    for i in range(n_components):
+                        df.loc[mask, f"{prefix}_{i}"] = scores[:, i]
+
+                # Fill missing values
+                pca_cols = [f"{prefix}_{i}" for i in range(n_components)]
+                for col in pca_cols:
+                    if col not in df.columns:
+                        df[col] = 0
+                df[pca_cols] = df[pca_cols].fillna(0)
+
+        return df, pcas_dict
+
     # encoding categorical features
     def encode_categorical_features(
         self,
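The per-group lag creation in the panel branch reduces to a groupby/shift; a minimal standalone sketch with hypothetical TICKER and RET columns:

    import pandas as pd

    df = pd.DataFrame({
        "TICKER": ["A", "A", "A", "B", "B", "B"],
        "RET": [0.10, -0.20, 0.05, 0.30, -0.10, 0.20],
    })

    # Lags never cross group boundaries: each ticker's first rows stay NaN
    for lag in range(1, 3):
        df[f"RET_-{lag}"] = df.groupby("TICKER")["RET"].shift(lag)

    print(df)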
@@ -278,24 +278,32 @@ class FeatureSelectionEngine:
 
         features_selected_list = features_selected["features"].values.tolist()
 
-        # Save ensemble features before correlation (aggregated features)
-        logger.info("Saving ensemble features before correlation...")
-        all_features_in_data = self.X.columns.tolist()
+        # Save ensemble features for all numerical features with global ranking
+        logger.info("Saving ensemble features with global ranking for all numerical features...")
+        numerical_features_in_data = self.X_numerical.columns.tolist()
         ensemble_rows = []
 
-        # Add global rank for selected features
-        features_selected_with_global_rank = features_selected.copy()
-        features_selected_with_global_rank["global_rank"] = range(1, len(features_selected_with_global_rank) + 1)
+        # Create global ranking for ALL numerical features (1 to n, no null values)
+        all_numerical_scores = pd.concat(results, axis=0)
+        all_numerical_scores = all_numerical_scores.groupby("features").agg({
+            "rank": "mean"  # Average rank across all methods
+        }).reset_index()
+        all_numerical_scores.sort_values("rank", inplace=True)
+        all_numerical_scores["global_rank"] = range(1, len(all_numerical_scores) + 1)
 
-        for feature in all_features_in_data:
+        for feature in numerical_features_in_data:
             feature_id = feature_map.get(feature)
             if feature_id:
                 is_selected = feature in features_selected_list
-                global_rank = None
-                if is_selected:
-                    global_rank = features_selected_with_global_rank[
-                        features_selected_with_global_rank["features"] == feature
+
+                # Get global rank (no null values - all features get a rank)
+                if feature in all_numerical_scores["features"].values:
+                    global_rank = all_numerical_scores[
+                        all_numerical_scores["features"] == feature
                     ]["global_rank"].values[0]
+                else:
+                    # Fallback: assign last rank + position for features not in results
+                    global_rank = len(all_numerical_scores) + numerical_features_in_data.index(feature) + 1
 
                 ensemble_rows.append({
                     "feature_selection_id": feature_selection.id,
@@ -353,28 +361,12 @@ class FeatureSelectionEngine:
         )
 
         # Final update for features after max limitation (final selection)
-        logger.info("Finalizing ensemble features with categorical features...")
+        logger.info("Finalizing ensemble features...")
         for row in ensemble_rows:
             feature = Feature.get(row["feature_id"]).name
             if feature in features and row["support"] == 1:
                 row["support"] = 2  # 2 = in final selection
 
-        # Add categorical features to ensemble if not already present
-        if target_type == "classification":
-            for cat_feature in categorical_features_selected:
-                feature_id = feature_map.get(cat_feature)
-                if feature_id and not any(row["feature_id"] == feature_id for row in ensemble_rows):
-                    ensemble_rows.append({
-                        "feature_selection_id": feature_selection.id,
-                        "feature_id": feature_id,
-                        "method": "ensemble",
-                        "score": None,
-                        "pvalue": None,
-                        "support": 2,  # 2 = in final selection (categorical)
-                        "rank": None,  # No rank for categorical features added at the end
-                        "training_time": 0,
-                    })
-
         # Re-save all ensemble data with updated support values
         FeatureSelectionRank.bulk_upsert(rows=ensemble_rows)
         logger.debug(
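The global ranking introduced above is a mean-rank aggregation over the per-method result frames; a small sketch with made-up method tables standing in for `results`:

    import pandas as pd

    # Two hypothetical selection methods ranking the same features
    results = [
        pd.DataFrame({"features": ["f1", "f2", "f3"], "rank": [1, 2, 3]}),
        pd.DataFrame({"features": ["f1", "f2", "f3"], "rank": [2, 1, 3]}),
    ]

    scores = pd.concat(results, axis=0).groupby("features").agg({"rank": "mean"}).reset_index()
    scores.sort_values("rank", inplace=True)
    scores["global_rank"] = range(1, len(scores) + 1)
    print(scores)  # dense 1..n ranking with no nulls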
@@ -55,8 +55,7 @@ from tensorboardX import SummaryWriter
 
 # Optimization
 import ray
-from ray.tune import Tuner, TuneConfig, with_parameters
-from ray.train import RunConfig
+from ray.tune import Tuner, TuneConfig, with_parameters, RunConfig
 from ray.tune.search.hyperopt import HyperOptSearch
 from ray.tune.search.bayesopt import BayesOptSearch
 from ray.tune.logger import TBXLoggerCallback
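The import move tracks Ray's API: recent Ray releases re-export RunConfig from ray.tune, while older ones ship it in ray.train. If supporting both were needed, a defensive sketch (an assumption, not lecrapaud's code) could be:

    try:
        from ray.tune import RunConfig  # newer Ray
    except ImportError:
        from ray.train import RunConfig  # older Ray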
@@ -1357,8 +1356,12 @@ class ModelSelectionEngine:
         """Choose between Ray Tune and HyperOpt standalone based on configuration."""
         if LECRAPAUD_OPTIMIZATION_BACKEND == "hyperopt":
             return self.hyperoptimize_hyperopt(x_train, y_train, x_val, y_val, model)
-        else:
+        elif LECRAPAUD_OPTIMIZATION_BACKEND == "ray":
             return self.hyperoptimize_ray(x_train, y_train, x_val, y_val, model)
+        else:
+            raise ValueError(
+                f"Invalid optimization backend: {LECRAPAUD_OPTIMIZATION_BACKEND}."
+            )
 
     def hyperoptimize_hyperopt(
         self, x_train, y_train, x_val, y_val, model: ModelEngine
@@ -1746,11 +1749,11 @@ def evaluate(
         y_pred_proba = (
             prediction[1] if num_classes == 2 else prediction.iloc[:, 2:].values
         )
-        if num_classes > 2:
-            lb = LabelBinarizer(sparse_output=False)  # Change to True for sparse matrix
-            lb.fit(labels)
-            y_true_onhot = lb.transform(y_true)
-            y_pred_onehot = lb.transform(y_pred)
+        # if num_classes > 2:
+        #     lb = LabelBinarizer(sparse_output=False)  # Change to True for sparse matrix
+        #     lb.fit(labels)
+        #     y_true_onhot = lb.transform(y_true)
+        #     y_pred_onehot = lb.transform(y_pred)
 
         score["LOGLOSS"] = log_loss(y_true, y_pred_proba)
         score["ACCURACY"] = accuracy_score(y_true, y_pred)
@@ -1,6 +1,6 @@
 [project]
 name = "lecrapaud"
-version = "0.21.0"
+version = "0.21.2"
 description = "Framework for machine and deep learning, with regression, classification and time series analysis"
 authors = [
     {name = "Pierre H. Gallet"}