lecrapaud 0.19.3__py3-none-any.whl → 0.20.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lecrapaud might be problematic.

@@ -87,21 +87,20 @@ class FeatureEngineeringEngine:
     def __init__(
         self,
         data: pd.DataFrame,
-        columns_drop: list[str] = [],
-        columns_boolean: list[str] = [],
-        columns_date: list[str] = [],
-        columns_te_groupby: list[str] = [],
-        columns_te_target: list[str] = [],
+        experiment,
         for_training: bool = True,
         **kwargs,
     ):
         self.data = data
-        self.columns_drop = columns_drop
-        self.columns_boolean = columns_boolean
-        self.columns_date = columns_date
-        self.columns_te_groupby = columns_te_groupby
-        self.columns_te_target = columns_te_target
+        self.experiment = experiment
         self.for_training = for_training
+
+        # Get all parameters from experiment context
+        self.columns_drop = self.experiment.context.get("columns_drop", [])
+        self.columns_boolean = self.experiment.context.get("columns_boolean", [])
+        self.columns_date = self.experiment.context.get("columns_date", [])
+        self.columns_te_groupby = self.experiment.context.get("columns_te_groupby", [])
+        self.columns_te_target = self.experiment.context.get("columns_te_target", [])
 
     def run(self) -> pd.DataFrame:
         # drop columns
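In 0.20.1 the per-column keyword arguments are gone: `FeatureEngineeringEngine` takes an `experiment` handle and reads its column configuration from `experiment.context`. A minimal sketch of the new call pattern, assuming `experiment.context` is a plain dict (the diff only shows `.get(...)` reads, so the container type, the stub, and the column values below are assumptions):

```python
import pandas as pd

class FakeExperiment:
    """Hypothetical stand-in for lecrapaud's Experiment model, for illustration."""
    def __init__(self, context: dict):
        self.context = context

df = pd.DataFrame({"region": ["EU", "US"], "sales": [1.0, 2.0]})
experiment = FakeExperiment(context={
    "columns_drop": [],
    "columns_boolean": [],
    "columns_date": [],
    "columns_te_groupby": ["REGION"],
    "columns_te_target": ["SALES"],
})

# 0.19.3: FeatureEngineeringEngine(data=df, columns_te_groupby=[...], ...)
# 0.20.1: the column options ride on the experiment instead:
# FeatureEngineeringEngine(data=df, experiment=experiment, for_training=True)
```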
@@ -316,41 +315,30 @@ class PreprocessFeature:
         self,
         data: pd.DataFrame,
         experiment,
-        time_series: bool = False,
-        date_column: str | None = None,
-        group_column: str | None = None,
-        val_size: float = 0.2,
-        test_size: float = 0.2,
-        columns_pca: list[str] = [],
-        pca_temporal: list[dict[str, list[str]]] = [],
-        pca_cross_sectional: list[dict[str, list[str]]] = [],
-        columns_onehot: list[str] = [],
-        columns_binary: list[str] = [],
-        columns_ordinal: list[str] = [],
-        columns_frequency: list[str] = [],
-        target_numbers: list = [],
-        target_clf: list = [],
         **kwargs,
     ):
         self.data = data
         self.data.columns = self.data.columns.str.upper()
-
         self.experiment = experiment
-        self.columns_pca = [col.upper() for col in columns_pca]
-        self.pca_temporal = pca_temporal
-        self.pca_cross_sectional = pca_cross_sectional
-        self.columns_onehot = [col.upper() for col in columns_onehot]
-        self.columns_binary = [col.upper() for col in columns_binary]
-        self.columns_ordinal = [col.upper() for col in columns_ordinal]
-        self.columns_frequency = [col.upper() for col in columns_frequency]
-        self.target_numbers = target_numbers
-        self.target_clf = target_clf
-
-        self.time_series = time_series
-        self.date_column = date_column
-        self.group_column = group_column
-        self.val_size = val_size
-        self.test_size = test_size
+
+        # Get all parameters from experiment context
+        context = self.experiment.context
+        self.time_series = context.get("time_series", False)
+        self.date_column = context.get("date_column", None)
+        self.group_column = context.get("group_column", None)
+        self.val_size = context.get("val_size", 0.2)
+        self.test_size = context.get("test_size", 0.2)
+        self.target_numbers = context.get("target_numbers", [])
+        self.target_clf = context.get("target_clf", [])
+
+        # Handle list parameters with uppercase conversion
+        self.columns_pca = [col.upper() for col in context.get("columns_pca", [])]
+        self.pca_temporal = context.get("pca_temporal", [])
+        self.pca_cross_sectional = context.get("pca_cross_sectional", [])
+        self.columns_onehot = [col.upper() for col in context.get("columns_onehot", [])]
+        self.columns_binary = [col.upper() for col in context.get("columns_binary", [])]
+        self.columns_ordinal = [col.upper() for col in context.get("columns_ordinal", [])]
+        self.columns_frequency = [col.upper() for col in context.get("columns_frequency", [])]
 
         self.experiment_dir = self.experiment.path
         self.experiment_id = self.experiment.id
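`PreprocessFeature` follows the same migration, with one wrinkle worth noting: every option now has an implicit default (`val_size=0.2`, empty lists, and so on), so a context missing a key falls back silently instead of raising a `TypeError` at call time. A small standalone sketch of the lookup-plus-normalization pattern used above (values are illustrative):

```python
context = {"columns_onehot": ["country", "Device"], "val_size": 0.3}

# Same pattern as the diff: dict lookup with a default, then uppercasing,
# matching the earlier `self.data.columns.str.upper()` normalization.
columns_onehot = [col.upper() for col in context.get("columns_onehot", [])]
val_size = context.get("val_size", 0.2)
test_size = context.get("test_size", 0.2)  # key absent -> silently 0.2

print(columns_onehot)       # ['COUNTRY', 'DEVICE']
print(val_size, test_size)  # 0.3 0.2
```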
@@ -483,8 +471,8 @@ class PreprocessFeature:
                 f"{data.shape} {name} data from {dates[f"{name}_start_date"].strftime('%d/%m/%Y')} to {dates[f"{name}_end_date"].strftime('%d/%m/%Y')}"
             )
 
-        Experiment.upsert(
-            match_fields=["id"],
+        # Update existing experiment with sizes and dates
+        Experiment.update(
             id=self.experiment_id,
             train_size=len(train),
             val_size=len(val),
@@ -545,8 +533,8 @@ class PreprocessFeature:
         for name, data in zip(["train", "val", "test"], [train, val, test]):
             logger.info(f"{data.shape} {name} data")
 
-        Experiment.upsert(
-            match_fields=["id"],
+        # Update existing experiment with sizes
+        Experiment.update(
             id=self.experiment_id,
             train_size=len(train),
             val_size=len(val),
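Both split-logging paths swap `Experiment.upsert(match_fields=["id"], ...)` for `Experiment.update(id=..., ...)`: by the time sizes and dates are recorded, the experiment row already exists, so the insert-or-update branch is dead weight. A hedged sketch of the semantic difference, assuming an active-record-style model layer (lecrapaud's actual ORM is not shown in this diff):

```python
class Record:
    """Hypothetical active-record semantics, for illustration only."""

    @classmethod
    def upsert(cls, match_fields, **values):
        # Look the row up by match_fields: INSERT if absent, UPDATE otherwise.
        # Tolerant, but can mask a missing-row bug by silently creating one.
        ...

    @classmethod
    def update(cls, id, **values):
        # UPDATE an existing row by primary key; there is no insert path,
        # which makes the "row must already exist" assumption explicit.
        ...
```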
@@ -838,8 +826,7 @@ class PreprocessFeature:
 
         # Upsert features in bulk if we have any features
         if all_feature_names:
-            Feature.upsert_bulk(
-                match_fields=["name"],
+            Feature.bulk_upsert(
                 name=all_feature_names,
                 type=all_feature_types,
             )
@@ -855,9 +842,7 @@ class PreprocessFeature:
             for target in target_names
         ]
 
-        Target.upsert_bulk(
-            match_fields=["name"], name=target_names, type=target_types
-        )
+        Target.bulk_upsert(name=target_names, type=target_types)
 
         # Get all the upserted objects
         targets = Target.filter(name__in=target_names)
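The bulk helpers are renamed from `upsert_bulk` to `bulk_upsert`, and the explicit `match_fields=["name"]` argument disappears, which suggests the match key is now fixed inside the model layer (an inference from this diff, not documented behavior). The column-oriented call shape after the change, with illustrative values:

```python
# Parallel lists, one entry per record; matching on "name" is implicit in 0.20.1.
Feature.bulk_upsert(
    name=["PRICE", "REGION_TE"],
    type=["numerical", "numerical"],
)
Target.bulk_upsert(name=["TARGET_1"], type=["regression"])
```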
@@ -73,18 +73,21 @@ def load_train_data(experiment_dir):
 
 
 class FeatureSelectionEngine:
-    def __init__(self, train, experiment, target_number, target_clf, **kwargs):
+    def __init__(self, train, experiment, target_number, **kwargs):
         self.experiment = experiment
         self.train = train
         self.target_number = target_number
-        self.target_clf = target_clf
+
+        # Get all parameters from experiment context
+        self.target_clf = self.experiment.context.get("target_clf", [])
+        self.max_p_value_categorical = self.experiment.context.get("max_p_value_categorical", 0.05)
+        self.percentile = self.experiment.context.get("percentile", 20)
+        self.corr_threshold = self.experiment.context.get("corr_threshold", 80)
+        self.max_features = self.experiment.context.get("max_features", 50)
 
         self.target_type = (
             "classification" if self.target_number in self.target_clf else "regression"
         )
-        self.percentile = self.experiment.percentile
-        self.corr_threshold = self.experiment.corr_threshold
-        self.max_features = self.experiment.max_features
 
         self.experiment_dir = self.experiment.path
         self.experiment_id = self.experiment.id
@@ -115,7 +118,6 @@ class FeatureSelectionEngine:
         max_features = self.max_features
 
         feature_selection = FeatureSelection.upsert(
-            match_fields=["target_id", "experiment_id"],
             target_id=target.id,
             experiment_id=self.experiment_id,
         )
@@ -275,6 +277,38 @@ class FeatureSelectionEngine:
         features_selected.drop_duplicates("features", inplace=True)
 
         features_selected_list = features_selected["features"].values.tolist()
+
+        # Save ensemble features before correlation (aggregated features)
+        logger.info("Saving ensemble features before correlation...")
+        all_features_in_data = self.X.columns.tolist()
+        ensemble_rows = []
+
+        # Add global rank for selected features
+        features_selected_with_global_rank = features_selected.copy()
+        features_selected_with_global_rank["global_rank"] = range(1, len(features_selected_with_global_rank) + 1)
+
+        for feature in all_features_in_data:
+            feature_id = feature_map.get(feature)
+            if feature_id:
+                is_selected = feature in features_selected_list
+                global_rank = None
+                if is_selected:
+                    global_rank = features_selected_with_global_rank[
+                        features_selected_with_global_rank["features"] == feature
+                    ]["global_rank"].values[0]
+
+                ensemble_rows.append({
+                    "feature_selection_id": feature_selection.id,
+                    "feature_id": feature_id,
+                    "method": "ensemble",
+                    "score": None,
+                    "pvalue": None,
+                    "support": 2 if is_selected else 0,  # 2 = in aggregated features
+                    "rank": global_rank,
+                    "training_time": 0,
+                })
+
+        FeatureSelectionRank.bulk_upsert(rows=ensemble_rows)
 
         # analysis 1
         features_selected_by_every_methods = set(results[0]["features"].values.tolist())
@@ -303,12 +337,46 @@ class FeatureSelectionEngine:
             header=True,
             index_label="ID",
         )
+
+        # Update support for features after correlation removal (before max)
+        logger.info("Updating ensemble features after correlation removal...")
+        for row in ensemble_rows:
+            feature = Feature.get(row["feature_id"]).name
+            if feature in features:
+                row["support"] = 1  # 1 = survived correlation removal
+
         features = features[:max_features]
 
         # adding categorical features selected
         features += (
             categorical_features_selected if target_type == "classification" else []
         )
+
+        # Final update for features after max limitation (final selection)
+        logger.info("Finalizing ensemble features with categorical features...")
+        for row in ensemble_rows:
+            feature = Feature.get(row["feature_id"]).name
+            if feature in features and row["support"] == 1:
+                row["support"] = 2  # 2 = in final selection
+
+        # Add categorical features to ensemble if not already present
+        if target_type == "classification":
+            for cat_feature in categorical_features_selected:
+                feature_id = feature_map.get(cat_feature)
+                if feature_id and not any(row["feature_id"] == feature_id for row in ensemble_rows):
+                    ensemble_rows.append({
+                        "feature_selection_id": feature_selection.id,
+                        "feature_id": feature_id,
+                        "method": "ensemble",
+                        "score": None,
+                        "pvalue": None,
+                        "support": 2,  # 2 = in final selection (categorical)
+                        "rank": None,  # No rank for categorical features added at the end
+                        "training_time": 0,
+                    })
+
+        # Re-save all ensemble data with updated support values
+        FeatureSelectionRank.bulk_upsert(rows=ensemble_rows)
         logger.debug(
             f"Final pre-selection: {len(features)} features below {corr_threshold}% out of {len(features_selected_list)} features, and rejected {len(features_correlated)} features, {100*len(features)/len(features_selected_list):.2f}% features selected"
         )
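The new bookkeeping threads a three-pass lifecycle through the `support` column, per the inline comments: `0` = never selected, `2` on the first save = in the aggregated (pre-correlation) selection, `1` = survived correlation removal, `2` on the final save = in the final selection. A compact sketch replaying the two update passes over plain dicts (names are illustrative; note, as an observation from the control flow above rather than documented behavior, that a feature selected in pass one but dropped by correlation is never rewritten and keeps its initial `2`):

```python
def replay_support_updates(rows, surviving, final):
    """Replay the two in-place update passes shown in the diff.

    rows: dicts with "feature" and "support" (0 or 2 after the first save).
    surviving: feature names left after correlation removal.
    final: feature names left after the max_features cut (+ categoricals).
    """
    for row in rows:
        if row["feature"] in surviving:
            row["support"] = 1  # survived correlation removal
    for row in rows:
        if row["feature"] in final and row["support"] == 1:
            row["support"] = 2  # in final selection
    return rows

rows = [
    {"feature": "A", "support": 2},  # selected, survives every stage
    {"feature": "B", "support": 2},  # selected, removed by correlation
    {"feature": "C", "support": 0},  # never selected
]
print(replay_support_updates(rows, surviving={"A"}, final={"A"}))
# A -> 2 (final), B -> stays 2 (stale first-pass value), C -> 0
```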
@@ -441,13 +509,18 @@ class FeatureSelectionEngine:
         feat_scores["features"] = X.columns
         feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
         feat_scores["method"] = "Chi2"
+
+        # Apply both percentile and p-value filtering
+        # Keep features that satisfy BOTH conditions: within percentile AND p-value < threshold
+        feat_scores["support"] = feat_scores["support"] & (feat_scores["pvalue"] <= self.max_p_value_categorical)
+
         feat_scores.sort_values("rank", ascending=True, inplace=True)
         stop = time.time()
         training_time = timedelta(seconds=(stop - start)).total_seconds()
         feat_scores["training_time"] = training_time
 
         logger.debug(
-            f"Chi2 evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
+            f"Chi2 evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds (percentile={percentile}%, p-value<={self.max_p_value_categorical})"
        )
 
         feat_scores.to_csv(
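The Chi2 stage now intersects two criteria instead of one: a feature must sit inside the score percentile and have a p-value at or below `max_p_value_categorical` (default 0.05 from the experiment context). A standalone sketch of the combined mask using scikit-learn, assuming the initial `support` comes from `SelectPercentile(chi2, ...)` as in typical usage (the diff does not show how lecrapaud builds it):

```python
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectPercentile, chi2

X, y = load_iris(return_X_y=True, as_frame=True)

selector = SelectPercentile(chi2, percentile=50).fit(X, y)
feat_scores = pd.DataFrame({
    "score": selector.scores_,
    "pvalue": selector.pvalues_,
    "support": selector.get_support(),  # top-percentile mask
})

# The 0.20.1 change: a feature must be in the top percentile AND
# statistically significant under the chi-squared test.
max_p_value_categorical = 0.05
feat_scores["support"] = feat_scores["support"] & (
    feat_scores["pvalue"] <= max_p_value_categorical
)
print(feat_scores)
```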
@@ -804,33 +877,28 @@ class PreprocessModel:
         val,
         test,
         experiment,
-        target_numbers,
-        target_clf,
-        models_idx,
-        time_series,
-        max_timesteps,
-        group_column,
-        date_column,
         **kwargs,
     ):
         self.train = train
         self.val = val
         self.test = test
         self.experiment = experiment
-        self.target_numbers = target_numbers
-        self.target_clf = target_clf
-        self.models_idx = models_idx
-        self.time_series = time_series
-        self.max_timesteps = max_timesteps
-        self.group_column = group_column
-        self.date_column = date_column
+
+        # Get all parameters from experiment context
+        self.target_numbers = self.experiment.context.get("target_numbers", [])
+        self.target_clf = self.experiment.context.get("target_clf", [])
+        self.models_idx = self.experiment.context.get("models_idx", [])
+        self.time_series = self.experiment.context.get("time_series", False)
+        self.max_timesteps = self.experiment.context.get("max_timesteps", 120)
+        self.group_column = self.experiment.context.get("group_column", None)
+        self.date_column = self.experiment.context.get("date_column", None)
 
         self.experiment_dir = experiment.path
         self.data_dir = f"{self.experiment_dir}/data"
         self.preprocessing_dir = f"{self.experiment_dir}/preprocessing"
 
         self.all_features = experiment.get_all_features(
-            date_column=date_column, group_column=group_column
+            date_column=self.date_column, group_column=self.group_column
         )
 
     def run(self):
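Taken together, the release converges every engine on a single calling convention: configure the experiment context once, then hand the same `experiment` object to each stage, which pulls only the keys it needs with built-in defaults. An end-to-end sketch of the 0.20.1 wiring (the stub and pipeline order are illustrative; the context keys are the ones read in the hunks above):

```python
class FakeExperiment:
    """Stub standing in for lecrapaud's Experiment model."""
    def __init__(self, context: dict):
        self.context = context

experiment = FakeExperiment(context={
    # feature engineering
    "columns_date": ["DATE"],
    # preprocessing / splitting
    "time_series": True,
    "date_column": "DATE",
    "group_column": "TICKER",
    "val_size": 0.2,
    "test_size": 0.2,
    # feature selection
    "max_features": 50,
    "corr_threshold": 80,
    "max_p_value_categorical": 0.05,
    # model preprocessing
    "target_numbers": [1],
    "target_clf": [1],
    "models_idx": [0],
    "max_timesteps": 120,
})

# Every stage now takes the same handle and reads its own keys:
#   FeatureEngineeringEngine(data=df, experiment=experiment)
#   PreprocessFeature(data=df, experiment=experiment)
#   FeatureSelectionEngine(train, experiment, target_number=1)
#   PreprocessModel(train, val, test, experiment)
```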