lecrapaud 0.8.3__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lecrapaud/api.py +4 -8
- lecrapaud/db/models/experiment.py +11 -0
- lecrapaud/experiment.py +6 -1
- lecrapaud/feature_selection.py +44 -49
- lecrapaud/model_selection.py +31 -39
- {lecrapaud-0.8.3.dist-info → lecrapaud-0.9.0.dist-info}/METADATA +1 -1
- {lecrapaud-0.8.3.dist-info → lecrapaud-0.9.0.dist-info}/RECORD +9 -9
- {lecrapaud-0.8.3.dist-info → lecrapaud-0.9.0.dist-info}/LICENSE +0 -0
- {lecrapaud-0.8.3.dist-info → lecrapaud-0.9.0.dist-info}/WHEEL +0 -0
lecrapaud/api.py
CHANGED
@@ -108,17 +108,13 @@ class ExperimentEngine:
         for target_number in self.target_numbers:

             # loading model
-
+            target_dir = f"{self.experiment.path}/TARGET_{target_number}"
             all_features = self.experiment.get_all_features(
                 date_column=self.date_column, group_column=self.group_column
             )
-
-
-
-            ) # we keep this for backward compatibility
-            else:
-                features = self.experiment.get_features(target_number)
-                model = ModelEngine(path=training_target_dir)
+            features = self.experiment.get_features(target_number)
+
+            model = ModelEngine(path=target_dir)

             # getting data
             if model.recurrent:
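Net effect: `ExperimentEngine` no longer branches between a legacy directory and the new layout; every target resolves to a single `TARGET_{target_number}` folder, and `ModelEngine` is pointed straight at it. A minimal sketch of the resulting loop, reconstructed from the visible lines only (the rest of the class is assumed, not shown in this diff):

    # Sketch of the 0.9.0 loading flow inside ExperimentEngine; attribute
    # names are the ones visible in the hunk above, everything else assumed.
    for target_number in self.target_numbers:
        target_dir = f"{self.experiment.path}/TARGET_{target_number}"
        all_features = self.experiment.get_all_features(
            date_column=self.date_column, group_column=self.group_column
        )
        features = self.experiment.get_features(target_number)
        model = ModelEngine(path=target_dir)  # loads artifacts from TARGET_<n>/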
lecrapaud/db/models/experiment.py
CHANGED

@@ -1,4 +1,5 @@
 from itertools import chain
+import joblib

 from sqlalchemy import (
     Column,
@@ -106,6 +107,11 @@ class Experiment(Base):
             fs for fs in feature_selections if fs.target_id == target_id
         ][0]
         features = [f.name for f in feature_selection.features]
+
+        # fallback to path if no features found
+        if len(features) == 0:
+            features = joblib.load(f"{self.path}/TARGET_{target_number}/features.pkl")
+
         return features

     def get_all_features(self, date_column: str = None, group_column: str = None):
@@ -115,6 +121,11 @@ class Experiment(Base):
             for fs in self.feature_selections
             if fs.target_id in target_idx
         )
+
+        # fallback to path if no features found
+        if len(_all_features) == 0:
+            _all_features = joblib.load(f"{self.path}/preprocessing/all_features.pkl")
+
         all_features = []
         if date_column:
             all_features.append(date_column)
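Both hunks apply the same DB-first, filesystem-fallback rule: feature names come from the database when rows exist, and experiments written before 0.9.0 (which only pickled the lists to disk) remain readable. A generic sketch of the pattern, with an illustrative function name that is not part of the package:

    import joblib

    def load_with_fallback(db_values: list, pkl_path: str) -> list:
        # Prefer values persisted in the database; fall back to the pickled
        # artifact (e.g. TARGET_<n>/features.pkl or preprocessing/all_features.pkl).
        if len(db_values) > 0:
            return db_values
        return joblib.load(pkl_path)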
lecrapaud/experiment.py
CHANGED
@@ -2,6 +2,7 @@ import os
 from pathlib import Path

 import pandas as pd
+import joblib

 # Set up coverage file path
 os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
@@ -13,7 +14,7 @@ from lecrapaud.db.session import get_db


 def create_experiment(
-    data: pd.DataFrame,
+    data: pd.DataFrame | str,
     corr_threshold,
     percentile,
     max_features,
@@ -22,6 +23,10 @@ def create_experiment(
     experiment_name,
     **kwargs,
 ):
+    if isinstance(data, str):
+        path = f"{data}/data/full.pkl" if "data" not in data else f"{data}/full.pkl"
+        data = joblib.load(path)
+
     dates = {}
     if date_column:
         dates["start_date"] = pd.to_datetime(data[date_column].iat[0])
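With `data: pd.DataFrame | str`, `create_experiment` can now be pointed at an existing experiment directory: a string argument is resolved to `<path>/data/full.pkl` (or `<path>/full.pkl` when the string already contains `data`) and loaded with joblib. A hypothetical call, with values chosen for illustration only (the signature has further parameters elided from this diff):

    # "artifacts/run_42" is assumed to be a directory written by a prior run;
    # the numeric values are placeholders, not recommended defaults.
    experiment = create_experiment(
        data="artifacts/run_42",  # loaded from artifacts/run_42/data/full.pkl
        corr_threshold=0.9,
        percentile=20,
        max_features=100,
        experiment_name="run_42_retrain",
    )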
lecrapaud/feature_selection.py
CHANGED
@@ -54,7 +54,7 @@ from lecrapaud.search_space import all_models
 warnings.filterwarnings("ignore", category=FutureWarning)


-def load_train_data(experiment_dir
+def load_train_data(experiment_dir):
     data_dir = f"{experiment_dir}/data"

     logger.info("Loading data...")
@@ -90,11 +90,9 @@ class FeatureSelectionEngine:
         self.experiment_dir = self.experiment.path
         self.experiment_id = self.experiment.id
         self.data_dir = f"{self.experiment_dir}/data"
-        self.
-        self.
-
-        )
-        os.makedirs(self.fs_dir_target, exist_ok=True)
+        self.target_dir = f"{self.experiment_dir}/TARGET_{self.target_number}"
+        self.feature_selection_dir = f"{self.target_dir}/feature_selection"
+        os.makedirs(self.feature_selection_dir, exist_ok=True)

     # Main feature selection function
     def run(
@@ -110,7 +108,6 @@ class FeatureSelectionEngine:
         """
         target_number = self.target_number
        target_type = self.target_type
-        fs_dir_target = self.fs_dir_target

         # Create the feature selection in db
         target = Target.find_by(name=f"TARGET_{target_number}")
@@ -133,7 +130,7 @@ class FeatureSelectionEngine:
         self.y = self.train[f"TARGET_{target_number}"]

         logger.info(f"Starting feature selection for TARGET_{target_number}...")
-        clean_directory(self.
+        clean_directory(self.feature_selection_dir)

         # Let's start by removing very low variance feature and extremly correlated features
         # This is needed to reduce nb of feature but also for methods such as anova or chi2 that requires independent, non constant, non full 0 features
@@ -158,9 +155,7 @@ class FeatureSelectionEngine:
         self.X_categorical, self.X_numerical = get_features_by_types(self.X)

         if target_type == "classification" and self.X_categorical.shape[1] > 0:
-            feat_scores = self.select_categorical_features(
-                percentile=percentile, save_dir=fs_dir_target
-            )
+            feat_scores = self.select_categorical_features(percentile=percentile)
             with get_db() as db:
                 for row in feat_scores.itertuples(index=False):
                     feature = Feature.find_by(name=row.features, db=db)
@@ -181,7 +176,7 @@ class FeatureSelectionEngine:
         ].values.tolist()

         results = []
-        params = {"percentile": percentile
+        params = {"percentile": percentile}
         if single_process:
             results = [
                 self.select_feature_by_linear_correlation(**params),
@@ -288,7 +283,7 @@ class FeatureSelectionEngine:
         )
         logger.debug(features_selected_by_every_methods)
         pd.Series(features_selected_list).to_csv(
-            f"{
+            f"{self.feature_selection_dir}/features_before_corr.csv",
             index=True,
             header=True,
             index_label="ID",
@@ -298,7 +293,7 @@ class FeatureSelectionEngine:
         self.X = self.X[features_selected_list]
         features, features_correlated = self.remove_correlated_features(corr_threshold)
         pd.Series(features).to_csv(
-            f"{
+            f"{self.feature_selection_dir}/features_before_max.csv",
             index=True,
             header=True,
             index_label="ID",
@@ -327,9 +322,7 @@ class FeatureSelectionEngine:
         )

         # save to path
-        best_features_path = Path(
-            f"{self.preprocessing_dir}/features_{target_number}.pkl"
-        ).resolve()
+        best_features_path = Path(f"{self.target_dir}/features.pkl").resolve()
         joblib.dump(features, best_features_path)

         # save in db
@@ -423,7 +416,7 @@ class FeatureSelectionEngine:
     # Filter methods
     # ----------------

-    def select_categorical_features(self, percentile
+    def select_categorical_features(self, percentile):
         X, y = self.X_categorical, self.y

         start = time.time()
@@ -446,15 +439,16 @@ class FeatureSelectionEngine:
         )

         feat_scores.to_csv(
-            f"{
+            f"{self.feature_selection_dir}/Chi2.csv",
+            index=True,
+            header=True,
+            index_label="ID",
         )

         return feat_scores

     # Linear correlation (Person's R for regression and ANOVA for classification)
-    def select_feature_by_linear_correlation(
-        self, percentile: int = 20, save_dir: Optional[str] = None
-    ):
+    def select_feature_by_linear_correlation(self, percentile: int = 20):
         X, y, target_type = self.X_numerical, self.y, self.target_type

         start = time.time()
@@ -480,7 +474,7 @@ class FeatureSelectionEngine:
         )

         feat_scores.to_csv(
-            f"{
+            f"{self.feature_selection_dir}/{test_type}.csv",
             index=True,
             header=True,
             index_label="ID",
@@ -489,9 +483,7 @@ class FeatureSelectionEngine:
         return feat_scores

     # Non-Linear correlation (Spearsman's R for regression and Kendall's Tau for classification)
-    def select_feature_by_nonlinear_correlation(
-        self, percentile: int = 20, save_dir: Optional[str] = None
-    ):
+    def select_feature_by_nonlinear_correlation(self, percentile: int = 20):
         X, y, target_type = self.X_numerical, self.y, self.target_type

         start = time.time()
@@ -537,7 +529,7 @@ class FeatureSelectionEngine:
         )

         feat_scores.to_csv(
-            f"{
+            f"{self.feature_selection_dir}/{test_type}.csv",
             index=True,
             header=True,
             index_label="ID",
@@ -546,9 +538,7 @@ class FeatureSelectionEngine:
         return feat_scores

     # Mutual Information
-    def select_feature_by_mi(
-        self, percentile: int = 20, save_dir: Optional[str] = None
-    ):
+    def select_feature_by_mi(self, percentile: int = 20):
         X, y, target_type = self.X_numerical, self.y, self.target_type

         start = time.time()
@@ -575,7 +565,10 @@ class FeatureSelectionEngine:
         )

         feat_scores.to_csv(
-            f"{
+            f"{self.feature_selection_dir}/MI.csv",
+            index=True,
+            header=True,
+            index_label="ID",
         )

         return feat_scores
@@ -584,9 +577,7 @@ class FeatureSelectionEngine:
     # ----------------

     # feature importance
-    def select_feature_by_feat_imp(
-        self, percentile: int = 20, save_dir: Optional[str] = None
-    ):
+    def select_feature_by_feat_imp(self, percentile: int = 20):
         X, y, target_type = self.X_numerical, self.y, self.target_type

         start = time.time()
@@ -628,7 +619,10 @@ class FeatureSelectionEngine:
         )

         feat_scores.to_csv(
-            f"{
+            f"{self.feature_selection_dir}/FI.csv",
+            index=True,
+            header=True,
+            index_label="ID",
         )

         return feat_scores
@@ -637,9 +631,7 @@ class FeatureSelectionEngine:
     # ----------------

     # recursive feature elimination
-    def select_feature_by_rfe(
-        self, percentile: int = 20, save_dir: Optional[str] = None
-    ):
+    def select_feature_by_rfe(self, percentile: int = 20):
         X, y, target_type = self.X_numerical, self.y, self.target_type

         start = time.time()
@@ -681,15 +673,16 @@ class FeatureSelectionEngine:
         )

         feat_scores.to_csv(
-            f"{
+            f"{self.feature_selection_dir}/RFE.csv",
+            index=True,
+            header=True,
+            index_label="ID",
         )

         return feat_scores

     # SequentialFeatureSelector (loss based, possibility to do forwards or backwards selection or removal)
-    def select_feature_by_sfs(
-        self, percentile: int = 20, save_dir: Optional[str] = None
-    ):
+    def select_feature_by_sfs(self, percentile: int = 20):
         X, y, target_type = self.X_numerical, self.y, self.target_type

         start = time.time()
@@ -782,7 +775,10 @@ class FeatureSelectionEngine:
         )

         feat_scores.to_csv(
-            f"{
+            f"{self.feature_selection_dir}/SFS.csv",
+            index=True,
+            header=True,
+            index_label="ID",
         )

         return feat_scores
@@ -862,11 +858,13 @@ class PreprocessModel:
         train_scaled = None
         val_scaled = None
         test_scaled = None
+        scaler_x = None

         # save data
         joblib.dump(train_scaled, f"{self.data_dir}/train_scaled.pkl")
         joblib.dump(val_scaled, f"{self.data_dir}/val_scaled.pkl")
         joblib.dump(test_scaled, f"{self.data_dir}/test_scaled.pkl")
+        joblib.dump(scaler_x, f"{self.preprocessing_dir}/scaler_x.pkl")

         data = {
             "train": self.train,
@@ -875,7 +873,6 @@ class PreprocessModel:
             "train_scaled": train_scaled,
             "val_scaled": val_scaled,
             "test_scaled": test_scaled,
-            "scalers_y": scalers_y,
         }

         # reshape data for time series
@@ -902,7 +899,7 @@ class PreprocessModel:
         self.train = self.train[columns_to_keep]

         scaler_x = joblib.load(f"{self.preprocessing_dir}/scaler_x.pkl")
-        scaled_data = scaler_x.transform(self.train)
+        scaled_data = scaler_x.transform(self.train)
         scaled_data = pd.DataFrame(
             scaled_data, columns=self.train.columns, index=self.train.index
         )
@@ -951,7 +948,6 @@ class PreprocessModel:
         X_scaled = pd.DataFrame(
             scaler_x.fit_transform(X), columns=list(X.columns), index=X.index
         )
-        joblib.dump(scaler_x, f"{self.preprocessing_dir}/scaler_x.pkl")

         # Determine which targets need to be scaled
         targets_numbers_to_scale = [
@@ -980,9 +976,8 @@ class PreprocessModel:
                 columns=y.columns,
                 index=y.index,
             )
-
-
-            )
+            target_dir = f"{self.experiment_dir}/TARGET_{target_number}"
+            joblib.dump(scaler_y, f"{target_dir}/scaler_y.pkl")

             scalers_y[f"scaler_y_{target_number}"] = scaler_y
             scaled_targets[target_number] = scaled_y
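Taken together, these hunks retire `fs_dir_target` and the `save_dir` parameter in favour of two fixed anchors per target: `TARGET_<n>/` for artifacts (`features.pkl`, `scaler_y.pkl`) and `TARGET_<n>/feature_selection/` for the per-method score CSVs (Chi2, MI, FI, RFE, SFS and the correlation tests), while `scaler_x.pkl` stays under `preprocessing/`. A small helper sketching the implied layout; it is inferred from the paths above and is not a package API:

    from pathlib import Path

    def target_artifacts(experiment_dir: str, target_number: int) -> dict:
        # Hypothetical helper mapping the 0.9.0 on-disk layout.
        target_dir = Path(experiment_dir) / f"TARGET_{target_number}"
        return {
            "features": target_dir / "features.pkl",                # selected features
            "scaler_y": target_dir / "scaler_y.pkl",                # per-target y scaler
            "selection_scores": target_dir / "feature_selection",   # method CSVs
            "scaler_x": Path(experiment_dir) / "preprocessing" / "scaler_x.pkl",
        }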
lecrapaud/model_selection.py
CHANGED
@@ -145,11 +145,7 @@ class ModelEngine:
         self.log_dir = log_dir

         if self.path and self.need_scaling and self.target_type == "regression":
-
-            target_number = self.path.split("/")[-1].split("_")[-1]
-            self.scaler_y = joblib.load(
-                preprocessing_dir / f"scaler_y_{target_number}.pkl"
-            )
+            self.scaler_y = joblib.load(f"{self.path}/scaler_y.pkl")
         else:
             self.scaler_y = None

@@ -571,10 +567,10 @@ class ModelEngine:
         if not self.path:
             raise ValueError("Path is not set, cannot load model")

-
+        target_dir = Path(self.path)

         # Load threshold
-        scores_tracking = pd.read_csv(f"{
+        scores_tracking = pd.read_csv(f"{target_dir}/scores_tracking.csv")
         self.threshold = (
             scores_tracking["THRESHOLD"].values[0]
             if "THRESHOLD" in scores_tracking.columns
@@ -582,8 +578,8 @@ class ModelEngine:
         )

         # Search for files that contain '.best' or '.keras' in the name
-        best_files = list(
-
+        best_files = list(target_dir.glob("*.best*")) + list(
+            target_dir.glob("*.keras*")
         )
         # If any files are found, try loading the first one (or process as needed)
         if best_files:
@@ -610,7 +606,7 @@ class ModelEngine:
             )
         else:
             raise FileNotFoundError(
-                f"No files with '.best' or '.keras' found in the specified folder: {
+                f"No files with '.best' or '.keras' found in the specified folder: {target_dir}"
             )

         self.model_name = self._model.model_name
@@ -737,7 +733,7 @@ class ModelSelectionEngine:
         self.experiment_id = self.experiment.id
         self.data_dir = f"{self.experiment_dir}/data"
         self.preprocessing_dir = f"{self.experiment_dir}/preprocessing"
-        self.
+        self.target_dir = f"{self.experiment_dir}/TARGET_{self.target_number}"
         self.metric = "RMSE" if self.target_type == "regression" else "LOGLOSS"
         self.features = self.experiment.get_features(self.target_number)
         self.all_features = self.experiment.get_all_features(
@@ -826,7 +822,7 @@ class ModelSelectionEngine:
             if recurrent is False and config[self.target_type] is None:
                 continue  # for naive bayes models that cannot be used in regression

-            self.results_dir = f"{self.
+            self.results_dir = f"{self.target_dir}/{model_name}"
             if not os.path.exists(f"{self.results_dir}"):
                 os.makedirs(f"{self.results_dir}")
             elif preserve_model and contains_best(self.results_dir):
@@ -885,7 +881,7 @@ class ModelSelectionEngine:
             x_test = test[self.features]
             y_test = test[f"TARGET_{self.target_number}"].rename("TARGET")

-            log_dir = get_log_dir(self.
+            log_dir = get_log_dir(self.target_dir, model_name)
             # instantiate model
             model = ModelEngine(
                 model_name=model_name,
@@ -902,7 +898,7 @@ class ModelSelectionEngine:
                 best_params = self.hyperoptimize(x_train, y_train, x_val, y_val, model)

                 # save best params
-                best_params_file = f"{self.
+                best_params_file = f"{self.target_dir}/best_params.json"
                 try:
                     with open(best_params_file, "r") as f:
                         json_dict = json.load(f)
@@ -914,7 +910,7 @@ class ModelSelectionEngine:
                     json.dump(json_dict, f, indent=4)
             else:
                 try:
-                    with open(f"{self.
+                    with open(f"{self.target_dir}/best_params.json") as f:
                         json_dict = json.load(f)
                     best_params = json_dict[model_name]
                 except Exception:
@@ -1028,7 +1024,7 @@ class ModelSelectionEngine:
             best_score["MODEL_PATH"] = model_path

             # Track scores
-            scores_tracking_path = f"{self.
+            scores_tracking_path = f"{self.target_dir}/scores_tracking.csv"
             best_score_df = pd.DataFrame([best_score])

             if os.path.exists(scores_tracking_path):
@@ -1072,26 +1068,26 @@ class ModelSelectionEngine:
         logger.info(f"Model training finished in {training_time:.2f} seconds")

         # find best model type
-        scores_tracking_path = f"{self.
+        scores_tracking_path = f"{self.target_dir}/scores_tracking.csv"
         scores_tracking = pd.read_csv(scores_tracking_path)
         best_score_overall = scores_tracking.iloc[0, :]
         best_model_name = best_score_overall["MODEL_NAME"]

         # Remove any .best or .keras files
-        for file_path in glob.glob(
-            os.path.join(self.
-        )
+        for file_path in glob.glob(os.path.join(self.target_dir, "*.best")) + glob.glob(
+            os.path.join(self.target_dir, "*.keras")
+        ):
             os.remove(file_path)
         # Copy the best model in root training folder for this target
         best_model_path = Path(
-            f"{self.
+            f"{self.target_dir}/{os.path.basename(best_score_overall['MODEL_PATH'])}"
         ).resolve()
         copy_any(
             best_score_overall["MODEL_PATH"],
             best_model_path,
         )

-        with open(f"{self.
+        with open(f"{self.target_dir}/best_params.json", "r") as f:
             best_model_params = json.load(f)[best_model_name]

         # save model_selection results to db
@@ -1111,8 +1107,8 @@ class ModelSelectionEngine:
     def hyperoptimize(self, x_train, y_train, x_val, y_val, model: ModelEngine):
         self.type_name = "hyperopts"

-        def collect_error_logs(
-            output_error_file = f"{
+        def collect_error_logs(target_dir: int, storage_path: str):
+            output_error_file = f"{target_dir}/errors.log"

             with open(output_error_file, "a") as outfile:
                 # Walk through the ray_results directory
@@ -1176,9 +1172,7 @@ class ModelSelectionEngine:
                 f"Markdown table with all trials :\n{results.get_dataframe().to_markdown()}"
             )
             # Collect errors in single file
-            collect_error_logs(
-                training_target_dir=self.training_target_dir, storage_path=storage_path
-            )
+            collect_error_logs(target_dir=self.target_dir, storage_path=storage_path)

         except Exception as e:
             raise Exception(e)
@@ -1321,21 +1315,19 @@ def evaluate(


 # utils
-def get_log_dir(
+def get_log_dir(target_dir: str, model_name="test_model"):
     """Generates a structured log directory path for TensorBoard."""
     timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M")
-    log_dir = (
-        Path(training_target_dir + "/tensorboard") / model_name / f"run_{timestamp}"
-    )
+    log_dir = Path(target_dir + "/tensorboard") / model_name / f"run_{timestamp}"
     log_dir.mkdir(parents=True, exist_ok=True)  # Create directories if they don't exist
     return str(log_dir)


-def print_scores(
+def print_scores(target_dir: str):
     """
     Monitor scores
     """
-    scores_tracking = pd.read_csv(f"{
+    scores_tracking = pd.read_csv(f"{target_dir}/scores_tracking.csv")
     return scores_tracking


@@ -1631,22 +1623,22 @@ def plot_threshold(prediction, threshold, precision, recall):


 # OLD - to sort out
-def get_pred_distribution(
+def get_pred_distribution(target_dir: str, model_name="linear"):
     """
     Look at prediction distributions
     """
     prediction = pd.read_csv(
-        f"{
+        f"{target_dir}/{model_name}/prediction.csv",
         index_col="ID",
     )
     prediction.describe()


-def plot_feature_importance(
+def plot_feature_importance(target_dir: str, model_name="linear"):
     """
     Monitor feature importance ranking to filter out unrelevant features
     """
-    model = joblib.load(f"{
+    model = joblib.load(f"{target_dir}/{model_name}/{model_name}.best")
     if hasattr(model, "feature_importances_"):
         feature_importances_ = model.feature_importances_.flatten()
     elif hasattr(model, "feature_importance"):
@@ -1664,11 +1656,11 @@ def plot_feature_importance(training_target_dir: str, model_name="linear"):
     )


-def print_model_estimators(
+def print_model_estimators(target_dir: str, model_name="linear"):
     """
     Look at a specific trained model
     """
-    model = joblib.load(f"{
+    model = joblib.load(f"{target_dir}/{model_name}/{model_name}.best")
     for i in range(0, 100):
         logger.info(model.estimators_[i].get_depth())
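On the consumption side, `ModelEngine(path=...)` now resolves everything from the target directory itself: `scaler_y.pkl`, `scores_tracking.csv`, and the promoted `.best`/`.keras` file. A hedged usage sketch (the experiment path is illustrative):

    # Load the promoted best model for TARGET_1 from the flattened layout.
    target_dir = "artifacts/run_42/TARGET_1"
    model = ModelEngine(path=target_dir)  # picks up *.best / *.keras and the threshold
    scores = print_scores(target_dir)     # reads TARGET_1/scores_tracking.csv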
{lecrapaud-0.8.3.dist-info → lecrapaud-0.9.0.dist-info}/RECORD
CHANGED

@@ -1,5 +1,5 @@
 lecrapaud/__init__.py,sha256=oCxbtw_nk8rlOXbXbWo0RRMlsh6w-hTiZ6e5PRG_wp0,28
-lecrapaud/api.py,sha256=
+lecrapaud/api.py,sha256=MJgu7gaZ2Ip0lF_TP1t8vkADRooaVRsBDALJvM6vSsg,10516
 lecrapaud/config.py,sha256=WJglRV6-lUfYUy5LZjwv3aO_X6ossHY9BUT7_NCSY1I,942
 lecrapaud/db/__init__.py,sha256=82o9fMfaqKXPh2_rt44EzNRVZV1R4LScEnQYvj_TjK0,34
 lecrapaud/db/alembic/README,sha256=MVlc9TYmr57RbhXET6QxgyCcwWP7w-vLkEsirENqiIQ,38
@@ -11,7 +11,7 @@ lecrapaud/db/alembic/versions/2025_06_24_1711-86457e2f333f_.py,sha256=dl6tfvcqEr
 lecrapaud/db/alembic.ini,sha256=zgvur-5jnLsT66_98FaTOTNgjwObGZCE0HqMwRAeJrs,3587
 lecrapaud/db/models/__init__.py,sha256=Lhyw9fVLdom0Fc6yIP-ip8FjkU1EwVwjae5q2VM815Q,740
 lecrapaud/db/models/base.py,sha256=CYtof_UjFwX3C7XUifequh_UtLHJ25bU7LCwT501uGE,7508
-lecrapaud/db/models/experiment.py,sha256=
+lecrapaud/db/models/experiment.py,sha256=KwFbPTDAEXU0I9_33xMg6Ujx8QR_ffcjKA2YBOUuys4,3997
 lecrapaud/db/models/feature.py,sha256=5o77O2FyRObnLOCGNj8kaPSGM3pLv1Ov6mXXHYkmnYY,1136
 lecrapaud/db/models/feature_selection.py,sha256=nXy_Lg3uDxid71vYll_qzdo8ajYsJEXskI7vLQ3uyW0,3315
 lecrapaud/db/models/feature_selection_rank.py,sha256=PvEpdv-JJt2wZMtX5TO0wyZ3IONlPkeDaC49i0VA-qU,2074
@@ -22,22 +22,22 @@ lecrapaud/db/models/score.py,sha256=_yaa6yBxugcOZMvLxqqIaMN7QGvzAOzOGCYQO0_gBjw,
 lecrapaud/db/models/target.py,sha256=DKnfeaLU8eT8J_oh_vuFo5-o1CaoXR13xBbswme6Bgk,1649
 lecrapaud/db/session.py,sha256=K9dTyXmO-aF_2r9RRHsDsbW9_zLNDaOcchXlpiv7cSo,2719
 lecrapaud/directories.py,sha256=t4PrnJR48MpDfBOTYTyGlDVMUr39mcaj7uCPTaocBRw,725
-lecrapaud/experiment.py,sha256=
+lecrapaud/experiment.py,sha256=_kuRARuw1pXe13K3MHz22KOJSiRmvhPb7Q2Mkli32t8,2519
 lecrapaud/feature_engineering.py,sha256=U3YOftZBB3PEqGbu2aFY_3B3Ks9Hiu04UxixOkBz0UU,31168
-lecrapaud/feature_selection.py,sha256=
+lecrapaud/feature_selection.py,sha256=_Je2817Ah1v-6Rls4EiYC-fn3EbpBj6Uaq81KWBpQG4,43430
 lecrapaud/integrations/openai_integration.py,sha256=hHLF3fk5Bps8KNbNrEL3NUFa945jwClE6LrLpuMZOd4,7459
 lecrapaud/jobs/__init__.py,sha256=ZkrsyTOR21c_wN7RY8jPhm8jCrL1oCEtTsf3VFIlQiE,292
 lecrapaud/jobs/config.py,sha256=AmO0j3RFjx8H66dfKw_7vnshaOJb9Ox5BAZ9cwwLFMY,377
 lecrapaud/jobs/scheduler.py,sha256=SiYWPxokpKnR8V6btLOO6gbK0PEjSRoeG0kCbQvYPf4,990
 lecrapaud/jobs/tasks.py,sha256=jfhOCsgZlZGTnsLB_K7-Y3NgJqpzpUCFu7EfDQuIeSY,1655
-lecrapaud/model_selection.py,sha256=
+lecrapaud/model_selection.py,sha256=GCA21LGs2G6RqQF188BiJZFP-DNpEhzpTvJlewHFAi4,61504
 lecrapaud/search_space.py,sha256=-JkzuMhaomdwiWi4HvVQY5hiw3-oREemJA16tbwEIp4,34854
 lecrapaud/speed_tests/test-gpu-bilstm.ipynb,sha256=4nLuZRJVe2kn6kEmauhRiz5wkWT9AVrYhI9CEk_dYUY,9608
 lecrapaud/speed_tests/test-gpu-resnet.ipynb,sha256=27Vu7nYwujYeh3fOxBNCnKJn3MXNPKZU-U8oDDUbymg,4944
 lecrapaud/speed_tests/test-gpu-transformers.ipynb,sha256=k6MBSs_Um1h4PykvE-LTBcdpbWLbIFST_xl_AFW2jgI,8444
 lecrapaud/speed_tests/tests.ipynb,sha256=RjI7LDHSsbadUkea_hT14sD7ivljtIQk4NB5McXJ1bE,3835
 lecrapaud/utils.py,sha256=Dy2uhK9cslzoqRHiIE6MdkxjNJWjmKmzGr6i7EYM28A,8106
-lecrapaud-0.
-lecrapaud-0.
-lecrapaud-0.
-lecrapaud-0.
+lecrapaud-0.9.0.dist-info/LICENSE,sha256=MImCryu0AnqhJE_uAZD-PIDKXDKb8sT7v0i1NOYeHTM,11350
+lecrapaud-0.9.0.dist-info/METADATA,sha256=GZItqrsRZu_QUCY0hFsdN03454boqWlYIaZh4fIEaHY,11623
+lecrapaud-0.9.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+lecrapaud-0.9.0.dist-info/RECORD,,

{lecrapaud-0.8.3.dist-info → lecrapaud-0.9.0.dist-info}/LICENSE
File without changes

{lecrapaud-0.8.3.dist-info → lecrapaud-0.9.0.dist-info}/WHEEL
File without changes