lecrapaud 0.19.3__py3-none-any.whl → 0.20.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of lecrapaud might be problematic.
- lecrapaud/api.py +11 -49
- lecrapaud/config.py +1 -0
- lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +39 -0
- lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +264 -0
- lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +42 -0
- lecrapaud/db/models/__init__.py +2 -4
- lecrapaud/db/models/base.py +103 -65
- lecrapaud/db/models/experiment.py +79 -99
- lecrapaud/db/models/feature_selection.py +0 -3
- lecrapaud/db/models/feature_selection_rank.py +0 -18
- lecrapaud/db/models/model_selection.py +2 -2
- lecrapaud/db/models/{score.py → model_selection_score.py} +29 -12
- lecrapaud/db/session.py +1 -0
- lecrapaud/experiment.py +11 -13
- lecrapaud/feature_engineering.py +34 -49
- lecrapaud/feature_selection.py +90 -22
- lecrapaud/model_selection.py +434 -192
- lecrapaud/search_space.py +2 -1
- lecrapaud/utils.py +22 -2
- {lecrapaud-0.19.3.dist-info → lecrapaud-0.20.1.dist-info}/METADATA +1 -1
- {lecrapaud-0.19.3.dist-info → lecrapaud-0.20.1.dist-info}/RECORD +23 -21
- lecrapaud/db/models/model_training.py +0 -64
- {lecrapaud-0.19.3.dist-info → lecrapaud-0.20.1.dist-info}/WHEEL +0 -0
- {lecrapaud-0.19.3.dist-info → lecrapaud-0.20.1.dist-info}/licenses/LICENSE +0 -0
lecrapaud/feature_engineering.py
CHANGED
@@ -87,21 +87,20 @@ class FeatureEngineeringEngine:
     def __init__(
         self,
         data: pd.DataFrame,
-        columns_drop: list[str] = [],
-        columns_boolean: list[str] = [],
-        columns_date: list[str] = [],
-        columns_te_groupby: list[str] = [],
-        columns_te_target: list[str] = [],
+        experiment,
         for_training: bool = True,
         **kwargs,
     ):
         self.data = data
-        self.columns_drop = columns_drop
-        self.columns_boolean = columns_boolean
-        self.columns_date = columns_date
-        self.columns_te_groupby = columns_te_groupby
-        self.columns_te_target = columns_te_target
+        self.experiment = experiment
         self.for_training = for_training
+
+        # Get all parameters from experiment context
+        self.columns_drop = self.experiment.context.get("columns_drop", [])
+        self.columns_boolean = self.experiment.context.get("columns_boolean", [])
+        self.columns_date = self.experiment.context.get("columns_date", [])
+        self.columns_te_groupby = self.experiment.context.get("columns_te_groupby", [])
+        self.columns_te_target = self.experiment.context.get("columns_te_target", [])
 
     def run(self) -> pd.DataFrame:
         # drop columns
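The hunk above is the pattern this release applies across every engine: constructors drop their long keyword-argument lists and read settings from `experiment.context` with explicit defaults. A minimal sketch of the idea, assuming a plain dict-backed `context`; the `Experiment` and `Engine` classes below are illustrative stand-ins, not lecrapaud's actual models:

```python
import pandas as pd


class Experiment:
    """Hypothetical stand-in for lecrapaud's experiment object."""

    def __init__(self, context: dict):
        self.context = context  # single source of truth for engine settings


class Engine:
    def __init__(self, data: pd.DataFrame, experiment, for_training: bool = True, **kwargs):
        self.data = data
        self.experiment = experiment
        self.for_training = for_training
        # Every setting is pulled from the shared context with a default,
        # so call sites no longer thread a dozen kwargs through each engine.
        self.columns_boolean = experiment.context.get("columns_boolean", [])
        self.columns_date = experiment.context.get("columns_date", [])


exp = Experiment(context={"columns_boolean": ["IS_ACTIVE"]})
engine = Engine(pd.DataFrame({"IS_ACTIVE": [0, 1]}), exp)
print(engine.columns_boolean)  # ['IS_ACTIVE']
```

One consequence of this design: a mistyped context key silently falls back to its default, whereas a mistyped keyword argument used to raise a `TypeError`.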
@@ -316,41 +315,30 @@ class PreprocessFeature:
         self,
         data: pd.DataFrame,
         experiment,
-        time_series: bool = False,
-        date_column: str | None = None,
-        group_column: str | None = None,
-        val_size: float = 0.2,
-        test_size: float = 0.2,
-        columns_pca: list[str] = [],
-        pca_temporal: list[dict[str, list[str]]] = [],
-        pca_cross_sectional: list[dict[str, list[str]]] = [],
-        columns_onehot: list[str] = [],
-        columns_binary: list[str] = [],
-        columns_ordinal: list[str] = [],
-        columns_frequency: list[str] = [],
-        target_numbers: list = [],
-        target_clf: list = [],
         **kwargs,
     ):
         self.data = data
         self.data.columns = self.data.columns.str.upper()
-
         self.experiment = experiment
-
-
-
-        self.
-        self.
-        self.
-        self.
-        self.
-        self.
-
-
-
-        self.
-        self.
-        self.
+
+        # Get all parameters from experiment context
+        context = self.experiment.context
+        self.time_series = context.get("time_series", False)
+        self.date_column = context.get("date_column", None)
+        self.group_column = context.get("group_column", None)
+        self.val_size = context.get("val_size", 0.2)
+        self.test_size = context.get("test_size", 0.2)
+        self.target_numbers = context.get("target_numbers", [])
+        self.target_clf = context.get("target_clf", [])
+
+        # Handle list parameters with uppercase conversion
+        self.columns_pca = [col.upper() for col in context.get("columns_pca", [])]
+        self.pca_temporal = context.get("pca_temporal", [])
+        self.pca_cross_sectional = context.get("pca_cross_sectional", [])
+        self.columns_onehot = [col.upper() for col in context.get("columns_onehot", [])]
+        self.columns_binary = [col.upper() for col in context.get("columns_binary", [])]
+        self.columns_ordinal = [col.upper() for col in context.get("columns_ordinal", [])]
+        self.columns_frequency = [col.upper() for col in context.get("columns_frequency", [])]
 
         self.experiment_dir = self.experiment.path
         self.experiment_id = self.experiment.id
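One subtlety in this hunk: the constructor upcases `self.data.columns`, so the column lists pulled from context are upcased as well before they are ever compared against the frame. A tiny illustration of why both sides need the same normalization (the column names here are made up):

```python
import pandas as pd

df = pd.DataFrame({"signup_date": ["2024-01-01"]})
df.columns = df.columns.str.upper()  # data columns become SIGNUP_DATE

columns_date = ["signup_date"]  # user-supplied config, arbitrary case
columns_date = [c.upper() for c in columns_date]

# Without .upper() on the config side, this membership test would fail.
assert all(c in df.columns for c in columns_date)
```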
@@ -483,8 +471,8 @@ class PreprocessFeature:
                 f"{data.shape} {name} data from {dates[f"{name}_start_date"].strftime('%d/%m/%Y')} to {dates[f"{name}_end_date"].strftime('%d/%m/%Y')}"
             )
 
-
-
+        # Update existing experiment with sizes and dates
+        Experiment.update(
             id=self.experiment_id,
             train_size=len(train),
             val_size=len(val),
@@ -545,8 +533,8 @@ class PreprocessFeature:
         for name, data in zip(["train", "val", "test"], [train, val, test]):
             logger.info(f"{data.shape} {name} data")
 
-
-
+        # Update existing experiment with sizes
+        Experiment.update(
             id=self.experiment_id,
             train_size=len(train),
             val_size=len(val),
@@ -838,8 +826,7 @@ class PreprocessFeature:
 
         # Upsert features in bulk if we have any features
         if all_feature_names:
-            Feature.
-                match_fields=["name"],
+            Feature.bulk_upsert(
                 name=all_feature_names,
                 type=all_feature_types,
             )
@@ -855,9 +842,7 @@ class PreprocessFeature:
             for target in target_names
         ]
 
-        Target.
-            match_fields=["name"], name=target_names, type=target_types
-        )
+        Target.bulk_upsert(name=target_names, type=target_types)
 
         # Get all the upserted objects
         targets = Target.filter(name__in=target_names)
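Throughout this file the explicit `match_fields` arguments are gone: `Feature.bulk_upsert` and `Target.bulk_upsert` now receive data only, which suggests the match keys are derived from the tables' unique constraints (consistent with the `add_unique_constraint_to_score` migration in this release). A hedged sketch of what a constraint-driven upsert could look like, assuming a MySQL backend and SQLAlchemy; the model and helper below are assumptions, not lecrapaud's actual `base.py`:

```python
from sqlalchemy import Column, Integer, String, UniqueConstraint
from sqlalchemy.dialects.mysql import insert
from sqlalchemy.orm import DeclarativeBase, Session


class Base(DeclarativeBase):
    pass


class Feature(Base):
    __tablename__ = "features"
    __table_args__ = (UniqueConstraint("name"),)  # the unique key drives matching
    id = Column(Integer, primary_key=True)
    name = Column(String(255))
    type = Column(String(50))


def bulk_upsert(session: Session, model, rows: list[dict]) -> None:
    """Insert rows, updating on unique-key collision (ON DUPLICATE KEY UPDATE)."""
    stmt = insert(model).values(rows)
    # The database matches on the unique constraint itself, so callers
    # no longer need to pass match_fields explicitly.
    updates = {c.name: stmt.inserted[c.name]
               for c in model.__table__.columns if not c.primary_key}
    session.execute(stmt.on_duplicate_key_update(**updates))


# Usage sketch: bulk_upsert(session, Feature, [{"name": "AGE", "type": "numerical"}])
```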
lecrapaud/feature_selection.py
CHANGED
@@ -73,18 +73,21 @@ def load_train_data(experiment_dir):
 
 
 class FeatureSelectionEngine:
-    def __init__(self, train, experiment, target_number,
+    def __init__(self, train, experiment, target_number, **kwargs):
         self.experiment = experiment
         self.train = train
         self.target_number = target_number
-
+
+        # Get all parameters from experiment context
+        self.target_clf = self.experiment.context.get("target_clf", [])
+        self.max_p_value_categorical = self.experiment.context.get("max_p_value_categorical", 0.05)
+        self.percentile = self.experiment.context.get("percentile", 20)
+        self.corr_threshold = self.experiment.context.get("corr_threshold", 80)
+        self.max_features = self.experiment.context.get("max_features", 50)
 
         self.target_type = (
             "classification" if self.target_number in self.target_clf else "regression"
         )
-        self.percentile = self.experiment.percentile
-        self.corr_threshold = self.experiment.corr_threshold
-        self.max_features = self.experiment.max_features
 
         self.experiment_dir = self.experiment.path
         self.experiment_id = self.experiment.id
@@ -115,7 +118,6 @@ class FeatureSelectionEngine:
         max_features = self.max_features
 
         feature_selection = FeatureSelection.upsert(
-            match_fields=["target_id", "experiment_id"],
             target_id=target.id,
             experiment_id=self.experiment_id,
         )
@@ -275,6 +277,38 @@ class FeatureSelectionEngine:
         features_selected.drop_duplicates("features", inplace=True)
 
         features_selected_list = features_selected["features"].values.tolist()
+
+        # Save ensemble features before correlation (aggregated features)
+        logger.info("Saving ensemble features before correlation...")
+        all_features_in_data = self.X.columns.tolist()
+        ensemble_rows = []
+
+        # Add global rank for selected features
+        features_selected_with_global_rank = features_selected.copy()
+        features_selected_with_global_rank["global_rank"] = range(1, len(features_selected_with_global_rank) + 1)
+
+        for feature in all_features_in_data:
+            feature_id = feature_map.get(feature)
+            if feature_id:
+                is_selected = feature in features_selected_list
+                global_rank = None
+                if is_selected:
+                    global_rank = features_selected_with_global_rank[
+                        features_selected_with_global_rank["features"] == feature
+                    ]["global_rank"].values[0]
+
+                ensemble_rows.append({
+                    "feature_selection_id": feature_selection.id,
+                    "feature_id": feature_id,
+                    "method": "ensemble",
+                    "score": None,
+                    "pvalue": None,
+                    "support": 2 if is_selected else 0,  # 2 = in aggregated features
+                    "rank": global_rank,
+                    "training_time": 0,
+                })
+
+        FeatureSelectionRank.bulk_upsert(rows=ensemble_rows)
 
         # analysis 1
         features_selected_by_every_methods = set(results[0]["features"].values.tolist())
@@ -303,12 +337,46 @@ class FeatureSelectionEngine:
             header=True,
             index_label="ID",
         )
+
+        # Update support for features after correlation removal (before max)
+        logger.info("Updating ensemble features after correlation removal...")
+        for row in ensemble_rows:
+            feature = Feature.get(row["feature_id"]).name
+            if feature in features:
+                row["support"] = 1  # 1 = survived correlation removal
+
         features = features[:max_features]
 
         # adding categorical features selected
         features += (
             categorical_features_selected if target_type == "classification" else []
         )
+
+        # Final update for features after max limitation (final selection)
+        logger.info("Finalizing ensemble features with categorical features...")
+        for row in ensemble_rows:
+            feature = Feature.get(row["feature_id"]).name
+            if feature in features and row["support"] == 1:
+                row["support"] = 2  # 2 = in final selection
+
+        # Add categorical features to ensemble if not already present
+        if target_type == "classification":
+            for cat_feature in categorical_features_selected:
+                feature_id = feature_map.get(cat_feature)
+                if feature_id and not any(row["feature_id"] == feature_id for row in ensemble_rows):
+                    ensemble_rows.append({
+                        "feature_selection_id": feature_selection.id,
+                        "feature_id": feature_id,
+                        "method": "ensemble",
+                        "score": None,
+                        "pvalue": None,
+                        "support": 2,  # 2 = in final selection (categorical)
+                        "rank": None,  # No rank for categorical features added at the end
+                        "training_time": 0,
+                    })
+
+        # Re-save all ensemble data with updated support values
+        FeatureSelectionRank.bulk_upsert(rows=ensemble_rows)
         logger.debug(
             f"Final pre-selection: {len(features)} features below {corr_threshold}% out of {len(features_selected_list)} features, and rejected {len(features_correlated)} features, {100*len(features)/len(features_selected_list):.2f}% features selected"
         )
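Across the two hunks above, the `support` column on `FeatureSelectionRank` does double duty as a stage marker that is rewritten in place as the pipeline narrows the feature set. Going by the inline comments, the intended encoding is roughly the following; the enum is a readability paraphrase (the rows store plain integers, and the names are illustrative):

```python
from enum import IntEnum


class EnsembleSupport(IntEnum):
    """Assumed meaning of `support` in the ensemble rows."""

    NOT_SELECTED = 0  # scored, but absent from the aggregated selection
    PRE_FINAL = 1     # survived correlation pruning, before the max-features cap
    FINAL = 2         # in the final selection, including appended categorical features
```

Because `FeatureSelectionRank.bulk_upsert(rows=ensemble_rows)` runs once before the correlation pass and again at the end, the second call overwrites the interim values, so only the final stages persist in the database.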
@@ -441,13 +509,18 @@ class FeatureSelectionEngine:
         feat_scores["features"] = X.columns
         feat_scores["rank"] = feat_scores["score"].rank(method="first", ascending=False)
         feat_scores["method"] = "Chi2"
+
+        # Apply both percentile and p-value filtering
+        # Keep features that satisfy BOTH conditions: within percentile AND p-value < threshold
+        feat_scores["support"] = feat_scores["support"] & (feat_scores["pvalue"] <= self.max_p_value_categorical)
+
         feat_scores.sort_values("rank", ascending=True, inplace=True)
         stop = time.time()
         training_time = timedelta(seconds=(stop - start)).total_seconds()
         feat_scores["training_time"] = training_time
 
         logger.debug(
-            f"Chi2 evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds"
+            f"Chi2 evaluation selected {feat_scores['support'].sum()} features in {training_time:.2f} seconds (percentile={percentile}%, p-value<={self.max_p_value_categorical})"
         )
 
         feat_scores.to_csv(
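The Chi2 step now intersects two criteria instead of one: a feature must fall inside the top score percentile and have a p-value at or below `max_p_value_categorical`. A self-contained sketch of that combined filter using scikit-learn on synthetic data; the 20% and 0.05 figures mirror the new context defaults:

```python
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectPercentile, chi2

X, y = make_classification(n_samples=200, n_features=30, random_state=0)
X = np.abs(X)  # chi2 requires non-negative feature values

selector = SelectPercentile(chi2, percentile=20).fit(X, y)

feat_scores = pd.DataFrame({
    "score": selector.scores_,
    "pvalue": selector.pvalues_,
    "support": selector.get_support(),  # True = inside the top percentile
})
# Keep features satisfying BOTH conditions, as in the new Chi2 step.
feat_scores["support"] &= feat_scores["pvalue"] <= 0.05
print(f"selected {int(feat_scores['support'].sum())} of {len(feat_scores)} features")
```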
@@ -804,33 +877,28 @@ class PreprocessModel:
         val,
         test,
         experiment,
-        target_numbers,
-        target_clf,
-        models_idx,
-        time_series,
-        max_timesteps,
-        group_column,
-        date_column,
         **kwargs,
     ):
         self.train = train
         self.val = val
         self.test = test
         self.experiment = experiment
-
-
-        self.
-        self.
-        self.
-        self.
-        self.
+
+        # Get all parameters from experiment context
+        self.target_numbers = self.experiment.context.get("target_numbers", [])
+        self.target_clf = self.experiment.context.get("target_clf", [])
+        self.models_idx = self.experiment.context.get("models_idx", [])
+        self.time_series = self.experiment.context.get("time_series", False)
+        self.max_timesteps = self.experiment.context.get("max_timesteps", 120)
+        self.group_column = self.experiment.context.get("group_column", None)
+        self.date_column = self.experiment.context.get("date_column", None)
 
         self.experiment_dir = experiment.path
         self.data_dir = f"{self.experiment_dir}/data"
         self.preprocessing_dir = f"{self.experiment_dir}/preprocessing"
 
         self.all_features = experiment.get_all_features(
-            date_column=date_column, group_column=group_column
+            date_column=self.date_column, group_column=self.group_column
         )
 
     def run(self):