lecrapaud 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- lecrapaud/api.py +8 -2
- lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +24 -12
- lecrapaud/db/session.py +11 -0
- lecrapaud/experiment.py +1 -1
- lecrapaud/feature_engineering.py +11 -12
- lecrapaud/feature_selection.py +29 -48
- lecrapaud/model_selection.py +59 -59
- lecrapaud/utils.py +1 -1
- {lecrapaud-0.2.0.dist-info → lecrapaud-0.3.0.dist-info}/METADATA +27 -20
- {lecrapaud-0.2.0.dist-info → lecrapaud-0.3.0.dist-info}/RECORD +13 -16
- lecrapaud/predictions.py +0 -292
- lecrapaud/preprocessing.py +0 -984
- lecrapaud/training.py +0 -239
- /lecrapaud/{directory_management.py → directories.py} +0 -0
- {lecrapaud-0.2.0.dist-info → lecrapaud-0.3.0.dist-info}/LICENSE +0 -0
- {lecrapaud-0.2.0.dist-info → lecrapaud-0.3.0.dist-info}/WHEEL +0 -0
lecrapaud/api.py
CHANGED
@@ -32,6 +32,8 @@ experiment.model_selection(data) : return best_model
 
 import joblib
 import pandas as pd
+import logging
+from lecrapaud.utils import logger
 from lecrapaud.db.session import init_db
 from lecrapaud.feature_selection import FeatureSelectionEngine, PreprocessModel
 from lecrapaud.model_selection import ModelSelectionEngine, ModelEngine
@@ -103,7 +105,12 @@ class Experiment:
         std_data, reshaped_data = self.preprocess_model(train, val, test)
         self.model_selection(std_data, reshaped_data)
 
-    def predict(self, new_data):
+    def predict(self, new_data, verbose: int = 0):
+        if verbose == 0:
+            logger.setLevel(logging.WARNING)
+
+        logger.warning("Running prediction...")
+
         data = self.feature_engineering(
             data=new_data,
             for_training=False,
@@ -127,7 +134,6 @@ class Experiment:
         else:
             features = self.dataset.get_features(target_number)
             model = ModelEngine(path=training_target_dir)
-            model.load()
 
         # getting data
         if model.recurrent:
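The new verbose parameter gives callers a way to silence inference logging. A minimal usage sketch (the Experiment setup itself is elided; the import path is assumed from this diff):

    from lecrapaud.api import Experiment  # import path assumed from this diff

    def run_inference(experiment: Experiment, new_data):
        # verbose=0 (the new default) caps the package logger at WARNING,
        # so only warnings and errors surface during prediction
        return experiment.predict(new_data, verbose=0)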
lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py
CHANGED
@@ -5,6 +5,7 @@ Revises: 339927587383
 Create Date: 2025-05-31 18:34:58.962966
 
 """
+
 from typing import Sequence, Union
 
 from alembic import op
@@ -12,27 +13,38 @@ import sqlalchemy as sa
 from sqlalchemy.dialects import mysql
 
 # revision identifiers, used by Alembic.
-revision: str =
-down_revision: Union[str, None] =
+revision: str = "52b809a34371"
+down_revision: Union[str, None] = "339927587383"
 branch_labels: Union[str, Sequence[str], None] = None
 depends_on: Union[str, Sequence[str], None] = None
 
 
 def upgrade() -> None:
     # ### commands auto generated by Alembic - please adjust! ###
-    op.alter_column(
-
-
-
-
+    op.alter_column(
+        "investment_runs",
+        "initial_portfolio",
+        existing_type=mysql.JSON(),
+        nullable=True,
+    )
+    op.create_foreign_key(
+        None,
+        "portfolios",
+        "investment_runs",
+        ["investment_run_id"],
+        ["id"],
+        ondelete="CASCADE",
+    )
     # ### end Alembic commands ###
 
 
 def downgrade() -> None:
     # ### commands auto generated by Alembic - please adjust! ###
-    op.drop_constraint(None,
-    op.
-
-
-
+    op.drop_constraint(None, "portfolios", type_="foreignkey")
+    op.alter_column(
+        "investment_runs",
+        "initial_portfolio",
+        existing_type=mysql.JSON(),
+        nullable=False,
+    )
     # ### end Alembic commands ###
lecrapaud/db/session.py
CHANGED
@@ -4,6 +4,9 @@ from contextlib import contextmanager
 from sqlalchemy import create_engine, text
 from sqlalchemy.orm import sessionmaker
 from urllib.parse import urlparse
+from alembic.config import Config
+from alembic import command
+import os
 
 from lecrapaud.config import DB_USER, DB_PASSWORD, DB_HOST, DB_PORT, DB_NAME, DB_URI
 
@@ -39,6 +42,14 @@ def init_db(uri: str = None):
     # Step 4: Create session factory
     _SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=_engine)
 
+    # Step 5: Apply Alembic migrations programmatically
+    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
+    alembic_cfg_path = os.path.join(project_root, "alembic.ini")
+
+    alembic_cfg = Config(alembic_cfg_path)
+    alembic_cfg.set_main_option("sqlalchemy.url", uri or os.getenv("DATABASE_URL"))
+    command.upgrade(alembic_cfg, "head")
+
 
 # Dependency to get a session instance
 @contextmanager
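With the new Step 5, init_db applies pending migrations right after building the session factory, so callers no longer need to run Alembic by hand. A sketch, assuming a reachable database (the URI below is a placeholder):

    from lecrapaud.db.session import init_db

    # Builds the engine and session factory, then runs the equivalent of
    # `alembic upgrade head` against the same database, reading alembic.ini
    # from the package root as shown above.
    init_db(uri="mysql+pymysql://user:password@localhost:3306/lecrapaud")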
lecrapaud/experiment.py
CHANGED
@@ -5,7 +5,7 @@ from pathlib import Path
 os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
 
 # Internal
-from lecrapaud.
+from lecrapaud.directories import tmp_dir
 from lecrapaud.utils import logger
 from lecrapaud.config import PYTHON_ENV
 from lecrapaud.db import (
lecrapaud/feature_engineering.py
CHANGED
@@ -101,7 +101,7 @@ class FeatureEngineeringEngine:
 
     def run(self) -> pd.DataFrame:
         # drop columns
-        self.data = self.data.drop(columns=self.columns_drop)
+        self.data = self.data.drop(columns=self.columns_drop, errors="ignore")
 
         # convert object columns to numeric if possible
         self.data = convert_object_columns_that_are_numeric(self.data)
@@ -324,6 +324,8 @@ class PreprocessFeature:
         **kwargs,
     ):
         self.data = data
+        self.data.columns = self.data.columns.str.upper()
+
         self.dataset = dataset
         self.columns_pca = columns_pca
         self.columns_onehot = columns_onehot
@@ -350,7 +352,7 @@ class PreprocessFeature:
             self.train_val_test_split_time_series()
             if self.time_series
             else self.train_val_test_split(
-                stratify_col=f"
+                stratify_col=f"TARGET_{self.target_numbers[0]}"
             )
         )  # TODO: only stratifying first target for now
 
@@ -359,8 +361,7 @@ class PreprocessFeature:
         val, _ = self.add_pca_features(test, pcas=pcas)
         test, _ = self.add_pca_features(val, pcas=pcas)
 
-
-            joblib.dump(pcas, f"{self.preprocessing_dir}/pcas.pkl")
+        joblib.dump(pcas, f"{self.preprocessing_dir}/pcas.pkl")
 
         # Encoding
         train, transformer = self.encode_categorical_features(train)
@@ -373,11 +374,10 @@ class PreprocessFeature:
             transformer=transformer,
         )
 
-
-
-
-
-        summary.to_csv(f"{self.dataset_dir}/feature_summary.csv", index=False)
+        joblib.dump(self.data, f"{self.data_dir}/full.pkl")
+        joblib.dump(transformer, f"{self.preprocessing_dir}/column_transformer.pkl")
+        summary = summarize_dataframe(train)
+        summary.to_csv(f"{self.dataset_dir}/feature_summary.csv", index=False)
 
         return train, val, test
 
@@ -579,8 +579,8 @@ class PreprocessFeature:
         columns_ordinal: list[str] = self.columns_ordinal
         columns_frequency: list[str] = self.columns_frequency
 
-        X = df.loc[:, ~df.columns.str.contains("^
-        y = df.loc[:, df.columns.str.contains("^
+        X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
+        y = df.loc[:, df.columns.str.contains("^TARGET_")]
         save_in_db = False
 
         all_columns = (
@@ -643,7 +643,6 @@ class PreprocessFeature:
 
         # Try to convert columns to best possible dtypes
         X_transformed = X_transformed.convert_dtypes()
-        X_transformed.columns = X_transformed.columns.str.upper()
 
         # Insert features in db
         if save_in_db:
lecrapaud/feature_selection.py
CHANGED
@@ -37,7 +37,7 @@ from sklearn.preprocessing import StandardScaler, MinMaxScaler
 from scipy.stats import spearmanr, kendalltau
 
 # Internal
-from lecrapaud.
+from lecrapaud.directories import tmp_dir, clean_directory
 from lecrapaud.utils import logger
 from lecrapaud.config import PYTHON_ENV
 from lecrapaud.db import (
@@ -50,10 +50,6 @@ from lecrapaud.db import (
 from lecrapaud.db.session import get_db
 from lecrapaud.search_space import all_models
 
-# Variables for targets handling
-TARGETS_MCLF = [11]
-GROUPING_COLUMN = "STOCK"
-
 # Annoying Warnings
 warnings.filterwarnings("ignore", category=FutureWarning)
 
@@ -103,7 +99,7 @@ class FeatureSelectionEngine:
     # Main feature selection function
     def run(
         self,
-        single_process: bool =
+        single_process: bool = True,
     ):
         """Function to do feature selection with a range of different feature selection technics
 
@@ -114,10 +110,7 @@ class FeatureSelectionEngine:
         """
         target_number = self.target_number
        target_type = self.target_type
-
-            fs_dir_target = self.fs_dir_target
-        else:
-            fs_dir_target = None
+        fs_dir_target = self.fs_dir_target
 
         # Create the feature selection in db
         target = Target.find_by(name=f"TARGET_{target_number}")
@@ -162,7 +155,7 @@ class FeatureSelectionEngine:
         # handling categorical features (only if classification)
         self.X_categorical, self.X_numerical = get_features_by_types(self.X)
 
-        if target_type == "classification":
+        if target_type == "classification" and self.X_categorical.shape[1] > 0:
             feat_scores = self.select_categorical_features(
                 percentile=percentile, save_dir=fs_dir_target
             )
@@ -292,24 +285,22 @@ class FeatureSelectionEngine:
             f"We selected {len(features_selected_list)} features and {len(features_selected_by_every_methods)} were selected unanimously:"
         )
         logger.debug(features_selected_by_every_methods)
-
-
-
-
-
-
-        )
+        pd.Series(features_selected_list).to_csv(
+            f"{fs_dir_target}/features_before_corr.csv",
+            index=True,
+            header=True,
+            index_label="ID",
+        )
 
         # removing correlated features
         self.X = self.X[features_selected_list]
         features, features_correlated = self.remove_correlated_features(corr_threshold)
-
-
-
-
-
-
-        )
+        pd.Series(features).to_csv(
+            f"{fs_dir_target}/features_before_max.csv",
+            index=True,
+            header=True,
+            index_label="ID",
+        )
         features = features[:max_features]
 
         # adding categorical features selected
@@ -337,8 +328,7 @@ class FeatureSelectionEngine:
         best_features_path = Path(
             f"{self.preprocessing_dir}/features_{target_number}.pkl"
         ).resolve()
-
-            joblib.dump(features, best_features_path)
+        joblib.dump(features, best_features_path)
 
         # save in db
         db_features = Feature.filter(name__in=features)
@@ -798,6 +788,7 @@ class PreprocessModel:
 
         self.dataset_dir = dataset.path
         self.data_dir = f"{self.dataset_dir}/data"
+        self.preprocessing_dir = f"{self.dataset_dir}/preprocessing"
 
         self.all_features = dataset.get_all_features(
             date_column=date_column, group_column=group_column
@@ -819,31 +810,23 @@ class PreprocessModel:
 
     def run(self):
         # save data
-
-
-
-            joblib.dump(self.test, f"{self.data_dir}/test.pkl")
-            preprocessing_dir = f"{self.dataset_dir}/preprocessing"
-        else:
-            preprocessing_dir = None
+        joblib.dump(self.train, f"{self.data_dir}/train.pkl")
+        joblib.dump(self.val, f"{self.data_dir}/val.pkl")
+        joblib.dump(self.test, f"{self.data_dir}/test.pkl")
 
         # scaling features
         if any(t not in self.target_clf for t in self.target_numbers) and any(
             all_models[i].get("need_scaling") for i in self.models_idx
         ):
             logger.info("Scaling features...")
-            train_scaled, scaler_x, scalers_y = self.scale_data(
-                self.train, save_dir=preprocessing_dir
-            )
+            train_scaled, scaler_x, scalers_y = self.scale_data(self.train)
             val_scaled, _, _ = self.scale_data(
                 self.val,
-                save_dir=preprocessing_dir,
                 scaler_x=scaler_x,
                 scalers_y=scalers_y,
             )
             test_scaled, _, _ = self.scale_data(
                 self.test,
-                save_dir=preprocessing_dir,
                 scaler_x=scaler_x,
                 scalers_y=scalers_y,
             )
@@ -853,10 +836,9 @@ class PreprocessModel:
             test_scaled = None
 
         # save data
-
-
-
-        joblib.dump(test_scaled, f"{self.data_dir}/test_scaled.pkl")
+        joblib.dump(train_scaled, f"{self.data_dir}/train_scaled.pkl")
+        joblib.dump(val_scaled, f"{self.data_dir}/val_scaled.pkl")
+        joblib.dump(test_scaled, f"{self.data_dir}/test_scaled.pkl")
 
         data = {
             "train": self.train,
@@ -923,7 +905,6 @@ class PreprocessModel:
     def scale_data(
         self,
         df: pd.DataFrame,
-        save_dir: str,
         scaler_x=None,
         scalers_y: Optional[list] = None,
     ):
@@ -939,8 +920,7 @@ class PreprocessModel:
         X_scaled = pd.DataFrame(
             scaler_x.fit_transform(X), columns=list(X.columns), index=X.index
         )
-
-        joblib.dump(scaler_x, f"{save_dir}/scaler_x.pkl")
+        joblib.dump(scaler_x, f"{self.preprocessing_dir}/scaler_x.pkl")
 
         # Determine which targets need to be scaled
         targets_numbers_to_scale = [
@@ -969,8 +949,9 @@ class PreprocessModel:
                 columns=y.columns,
                 index=y.index,
             )
-
-
+            joblib.dump(
+                scaler_y, f"{self.preprocessing_dir}/scaler_y_{target_number}.pkl"
+            )
 
             scalers_y[f"scaler_y_{target_number}"] = scaler_y
             scaled_targets[target_number] = scaled_y
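The thread through these hunks: preprocessing artifacts now always land under the dataset's preprocessing directory (self.preprocessing_dir is set in the constructor and the save_dir parameter is gone). A sketch of reading them back (paths are hypothetical, following the f-strings above):

    import joblib

    dataset_dir = "tmp/datasets/example"  # hypothetical dataset path
    scaler_x = joblib.load(f"{dataset_dir}/preprocessing/scaler_x.pkl")
    scaler_y_1 = joblib.load(f"{dataset_dir}/preprocessing/scaler_y_1.pkl")
    features_1 = joblib.load(f"{dataset_dir}/preprocessing/features_1.pkl")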
lecrapaud/model_selection.py
CHANGED
@@ -65,7 +65,7 @@ from ray.air import session
 
 # Internal library
 from lecrapaud.search_space import all_models
-from lecrapaud.
+from lecrapaud.directories import clean_directory
 from lecrapaud.utils import copy_any, contains_best, logger, serialize_for_json
 from lecrapaud.config import PYTHON_ENV
 from lecrapaud.feature_selection import load_train_data
@@ -120,8 +120,9 @@ class ModelEngine:
         plot: bool = False,
         log_dir: str = None,
     ):
+        self.path = path
         if path:
-            self.load(
+            self.load()
         else:
             self.model_name = model_name
             self.target_type = target_type
@@ -134,6 +135,7 @@ class ModelEngine:
                 f"Model {self.model_name} is not supported by this library."
                 f"Choose a model from the list of supported models: {[model['model_name'] for model in all_models].join(', ')}"
             )
+        config = config[0]
 
         self.recurrent = config["recurrent"]
         self.need_scaling = config["need_scaling"]
@@ -147,7 +149,7 @@ class ModelEngine:
         else:
             self.scaler_y = None
 
-        self.
+        self.threshold = None
 
     def fit(self, *args):
         if self.recurrent:
@@ -629,9 +631,6 @@ class ModelEngine:
         self.model_name = self._model.model_name
         self.target_type = self._model.target_type
 
-    def __getattr__(self, attr):
-        return getattr(self._model, attr)
-
 
 def trainable(
     params,
@@ -778,20 +777,20 @@ class ModelSelectionEngine:
             raise ValueError("Please provide a dataset.")
 
         if self.data:
-
-
-
-
-
-
+            train = self.data["train"]
+            val = self.data["val"]
+            test = self.data["test"]
+            train_scaled = self.data["train_scaled"]
+            val_scaled = self.data["val_scaled"]
+            test_scaled = self.data["test_scaled"]
         else:
             (
-
-
-
-
-
-
+                train,
+                val,
+                test,
+                train_scaled,
+                val_scaled,
+                test_scaled,
             ) = load_train_data(self.dataset_dir, self.target_number, self.target_clf)
 
         if (
@@ -810,10 +809,12 @@ class ModelSelectionEngine:
                 raise ValueError("reshaped_data is not provided.")
 
             logger.info("Loading reshaped data...")
-
-
-
-
+            x_train_reshaped = self.reshaped_data["x_train_reshaped"]
+            y_train_reshaped = self.reshaped_data["y_train_reshaped"]
+            x_val_reshaped = self.reshaped_data["x_val_reshaped"]
+            y_val_reshaped = self.reshaped_data["y_val_reshaped"]
+            x_test_reshaped = self.reshaped_data["x_test_reshaped"]
+            y_test_reshaped = self.reshaped_data["y_test_reshaped"]
 
         # create model selection in db
         target = Target.find_by(name=f"TARGET_{self.target_number}")
@@ -864,42 +865,41 @@ class ModelSelectionEngine:
                 if e in set(self.features)
             ]
             # TODO: Verify that features_idx are the right one, because scaling can re-arrange columns...
-
-
-
-
+            x_train = x_train_reshaped[:, :, features_idx]
+            y_train = y_train_reshaped[:, [self.target_number, 0]]
+            x_val = x_val_reshaped[:, :, features_idx]
+            y_val = y_val_reshaped[:, [self.target_number, 0]]
+            x_test = x_test_reshaped[:, :, features_idx]
+            y_test = y_test_reshaped[:, [self.target_number, 0]]
         else:
             config = config[self.target_type]
 
             if need_scaling and self.target_type == "regression":
-
-
-                    f"TARGET_{self.target_number}"
-                ].rename("TARGET")
-                self.x_val = self.val_scaled[self.features]
-                self.y_val = self.val_scaled[f"TARGET_{self.target_number}"].rename(
+                x_train = train_scaled[self.features]
+                y_train = train_scaled[f"TARGET_{self.target_number}"].rename(
                     "TARGET"
                 )
-
-
-
-
-                )
-                self.x_val = self.val[self.features]
-                self.y_val = self.val[f"TARGET_{self.target_number}"].rename(
+                x_val = val_scaled[self.features]
+                y_val = val_scaled[f"TARGET_{self.target_number}"].rename("TARGET")
+                x_test = test_scaled[self.features]
+                y_test = test_scaled[f"TARGET_{self.target_number}"].rename(
                     "TARGET"
                 )
+            else:
+                x_train = train[self.features]
+                y_train = train[f"TARGET_{self.target_number}"].rename("TARGET")
+                x_val = val[self.features]
+                y_val = val[f"TARGET_{self.target_number}"].rename("TARGET")
+                x_test = test[self.features]
+                y_test = test[f"TARGET_{self.target_number}"].rename("TARGET")
 
         log_dir = get_log_dir(self.training_target_dir, model_name)
         # instantiate model
         model = ModelEngine(
             model_name=model_name,
-            recurrent=recurrent,
-            need_scaling=need_scaling,
             search_params=config["search_params"],
             target_type=self.target_type,
             create_model=config["create_model"],
-            scaler_y=self.scaler_y,
             plot=self.plot,
             log_dir=log_dir,
         )
@@ -907,7 +907,7 @@ class ModelSelectionEngine:
         start = time.time()
         # Tuning hyperparameters
         if perform_hyperopt:
-            best_params = self.hyperoptimize(model)
+            best_params = self.hyperoptimize(x_train, y_train, x_val, y_val, model)
 
         # save best params
         best_params_file = f"{self.training_target_dir}/best_params.json"
@@ -932,8 +932,8 @@ class ModelSelectionEngine:
 
         # Perform cross-validation of the best model on k-folds of train + val set
         if perform_crossval:
-            x_train_val = pd.concat([
-            y_train_val = pd.concat([
+            x_train_val = pd.concat([x_train, x_val, x_test], axis=0)
+            y_train_val = pd.concat([y_train, y_val, y_test], axis=0)
             n_splits = 4
             n_samples = len(x_train_val)
             test_size = int(n_samples / (n_splits + 4))
@@ -946,7 +946,7 @@ class ModelSelectionEngine:
             self.type_name = f"crossval_fold_{i}"
 
             if self.time_series:
-                date_series =
+                date_series = train[self.date_column].copy()
 
                 if need_scaling:
                     date_series = date_series.map(pd.Timestamp.fromordinal)
@@ -1000,10 +1000,10 @@ class ModelSelectionEngine:
             # Retrain on entire training set, but keep score on cross-validation folds
             best_score, best_model, best_pred = self.train_model(
                 params=best_params,
-                x_train=pd.concat([
-                y_train=pd.concat([
-                x_val=
-                y_val=
+                x_train=pd.concat([x_train, x_val], axis=0),
+                y_train=pd.concat([y_train, y_val], axis=0),
+                x_val=x_test,
+                y_val=y_test,
                 model=model,
             )
             best_score = cross_validation_mean_score
@@ -1012,10 +1012,10 @@ class ModelSelectionEngine:
             self.type_name = "validation"
             best_score, best_model, best_pred = self.train_model(
                 params=best_params,
-                x_train=pd.concat([
-                y_train=pd.concat([
-                x_val=
-                y_val=
+                x_train=pd.concat([x_train, x_val], axis=0),
+                y_train=pd.concat([y_train, y_val], axis=0),
+                x_val=x_test,
+                y_val=y_test,
                 model=model,
             )
 
@@ -1117,7 +1117,7 @@ class ModelSelectionEngine:
 
         logger.info(f"Best model overall is : {best_score_overall}")
 
-    def hyperoptimize(self, model: ModelEngine):
+    def hyperoptimize(self, x_train, y_train, x_val, y_val, model: ModelEngine):
         self.type_name = "hyperopts"
 
         def collect_error_logs(training_target_dir: int, storage_path: str):
@@ -1143,10 +1143,10 @@ class ModelSelectionEngine:
         tuner = Tuner(
             trainable=with_parameters(
                 trainable,
-                x_train=
-                y_train=
-                x_val=
-                y_val=
+                x_train=x_train,
+                y_train=y_train,
+                x_val=x_val,
+                y_val=y_val,
                 model_name=model.model_name,
                 target_type=self.target_type,
                 session_name=self.session_name,
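The common thread in this file: the train/val/test splits become plain locals threaded explicitly through hyperoptimize and train_model instead of self attributes, and the final fit retrains on train plus val while scoring on test. A standalone sketch of that concatenation step (toy frames, illustrative only):

    import pandas as pd

    x_train = pd.DataFrame({"F1": [1.0, 2.0]})
    x_val = pd.DataFrame({"F1": [3.0]})
    y_train = pd.Series([0, 1], name="TARGET")
    y_val = pd.Series([1], name="TARGET")

    # retrain on train + val, as in the train_model calls above
    x_train_val = pd.concat([x_train, x_val], axis=0)
    y_train_val = pd.concat([y_train, y_val], axis=0)
    assert len(x_train_val) == len(y_train_val) == 3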
lecrapaud/utils.py
CHANGED