lecrapaud 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Potentially problematic release.


This version of lecrapaud might be problematic. Click here for more details.

lecrapaud/api.py CHANGED
@@ -32,6 +32,8 @@ experiment.model_selection(data) : return best_model
32
32
 
33
33
  import joblib
34
34
  import pandas as pd
35
+ import logging
36
+ from lecrapaud.utils import logger
35
37
  from lecrapaud.db.session import init_db
36
38
  from lecrapaud.feature_selection import FeatureSelectionEngine, PreprocessModel
37
39
  from lecrapaud.model_selection import ModelSelectionEngine, ModelEngine
@@ -103,7 +105,12 @@ class Experiment:
103
105
  std_data, reshaped_data = self.preprocess_model(train, val, test)
104
106
  self.model_selection(std_data, reshaped_data)
105
107
 
106
- def predict(self, new_data):
108
+ def predict(self, new_data, verbose: int = 0):
109
+ if verbose == 0:
110
+ logger.setLevel(logging.WARNING)
111
+
112
+ logger.warning("Running prediction...")
113
+
107
114
  data = self.feature_engineering(
108
115
  data=new_data,
109
116
  for_training=False,
@@ -127,7 +134,6 @@ class Experiment:
127
134
  else:
128
135
  features = self.dataset.get_features(target_number)
129
136
  model = ModelEngine(path=training_target_dir)
130
- model.load()
131
137
 
132
138
  # getting data
133
139
  if model.recurrent:
@@ -5,6 +5,7 @@ Revises: 339927587383
5
5
  Create Date: 2025-05-31 18:34:58.962966
6
6
 
7
7
  """
8
+
8
9
  from typing import Sequence, Union
9
10
 
10
11
  from alembic import op
@@ -12,27 +13,38 @@ import sqlalchemy as sa
12
13
  from sqlalchemy.dialects import mysql
13
14
 
14
15
  # revision identifiers, used by Alembic.
15
- revision: str = '52b809a34371'
16
- down_revision: Union[str, None] = '339927587383'
16
+ revision: str = "52b809a34371"
17
+ down_revision: Union[str, None] = "339927587383"
17
18
  branch_labels: Union[str, Sequence[str], None] = None
18
19
  depends_on: Union[str, Sequence[str], None] = None
19
20
 
20
21
 
21
22
  def upgrade() -> None:
22
23
  # ### commands auto generated by Alembic - please adjust! ###
23
- op.alter_column('investment_runs', 'initial_portfolio',
24
- existing_type=mysql.JSON(),
25
- nullable=True)
26
- op.create_index(op.f('ix_investment_runs_id'), 'investment_runs', ['id'], unique=False)
27
- op.create_foreign_key(None, 'portfolios', 'investment_runs', ['investment_run_id'], ['id'], ondelete='CASCADE')
24
+ op.alter_column(
25
+ "investment_runs",
26
+ "initial_portfolio",
27
+ existing_type=mysql.JSON(),
28
+ nullable=True,
29
+ )
30
+ op.create_foreign_key(
31
+ None,
32
+ "portfolios",
33
+ "investment_runs",
34
+ ["investment_run_id"],
35
+ ["id"],
36
+ ondelete="CASCADE",
37
+ )
28
38
  # ### end Alembic commands ###
29
39
 
30
40
 
31
41
  def downgrade() -> None:
32
42
  # ### commands auto generated by Alembic - please adjust! ###
33
- op.drop_constraint(None, 'portfolios', type_='foreignkey')
34
- op.drop_index(op.f('ix_investment_runs_id'), table_name='investment_runs')
35
- op.alter_column('investment_runs', 'initial_portfolio',
36
- existing_type=mysql.JSON(),
37
- nullable=False)
43
+ op.drop_constraint(None, "portfolios", type_="foreignkey")
44
+ op.alter_column(
45
+ "investment_runs",
46
+ "initial_portfolio",
47
+ existing_type=mysql.JSON(),
48
+ nullable=False,
49
+ )
38
50
  # ### end Alembic commands ###
lecrapaud/db/session.py CHANGED
@@ -4,6 +4,9 @@ from contextlib import contextmanager
4
4
  from sqlalchemy import create_engine, text
5
5
  from sqlalchemy.orm import sessionmaker
6
6
  from urllib.parse import urlparse
7
+ from alembic.config import Config
8
+ from alembic import command
9
+ import os
7
10
 
8
11
  from lecrapaud.config import DB_USER, DB_PASSWORD, DB_HOST, DB_PORT, DB_NAME, DB_URI
9
12
 
@@ -39,6 +42,14 @@ def init_db(uri: str = None):
39
42
  # Step 4: Create session factory
40
43
  _SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=_engine)
41
44
 
45
+ # Step 5: Apply Alembic migrations programmatically
46
+ project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
47
+ alembic_cfg_path = os.path.join(project_root, "alembic.ini")
48
+
49
+ alembic_cfg = Config(alembic_cfg_path)
50
+ alembic_cfg.set_main_option("sqlalchemy.url", uri or os.getenv("DATABASE_URL"))
51
+ command.upgrade(alembic_cfg, "head")
52
+
42
53
 
43
54
  # Dependency to get a session instance
44
55
  @contextmanager
lecrapaud/experiment.py CHANGED
@@ -5,7 +5,7 @@ from pathlib import Path
5
5
  os.environ["COVERAGE_FILE"] = str(Path(".coverage").resolve())
6
6
 
7
7
  # Internal
8
- from lecrapaud.directory_management import tmp_dir
8
+ from lecrapaud.directories import tmp_dir
9
9
  from lecrapaud.utils import logger
10
10
  from lecrapaud.config import PYTHON_ENV
11
11
  from lecrapaud.db import (
@@ -101,7 +101,7 @@ class FeatureEngineeringEngine:
101
101
 
102
102
  def run(self) -> pd.DataFrame:
103
103
  # drop columns
104
- self.data = self.data.drop(columns=self.columns_drop)
104
+ self.data = self.data.drop(columns=self.columns_drop, errors="ignore")
105
105
 
106
106
  # convert object columns to numeric if possible
107
107
  self.data = convert_object_columns_that_are_numeric(self.data)
@@ -324,6 +324,8 @@ class PreprocessFeature:
324
324
  **kwargs,
325
325
  ):
326
326
  self.data = data
327
+ self.data.columns = self.data.columns.str.upper()
328
+
327
329
  self.dataset = dataset
328
330
  self.columns_pca = columns_pca
329
331
  self.columns_onehot = columns_onehot
@@ -350,7 +352,7 @@ class PreprocessFeature:
350
352
  self.train_val_test_split_time_series()
351
353
  if self.time_series
352
354
  else self.train_val_test_split(
353
- stratify_col=f"target_{self.target_numbers[0]}"
355
+ stratify_col=f"TARGET_{self.target_numbers[0]}"
354
356
  )
355
357
  ) # TODO: only stratifying first target for now
356
358
 
@@ -359,8 +361,7 @@ class PreprocessFeature:
359
361
  val, _ = self.add_pca_features(test, pcas=pcas)
360
362
  test, _ = self.add_pca_features(val, pcas=pcas)
361
363
 
362
- if PYTHON_ENV != "Test":
363
- joblib.dump(pcas, f"{self.preprocessing_dir}/pcas.pkl")
364
+ joblib.dump(pcas, f"{self.preprocessing_dir}/pcas.pkl")
364
365
 
365
366
  # Encoding
366
367
  train, transformer = self.encode_categorical_features(train)
@@ -373,11 +374,10 @@ class PreprocessFeature:
373
374
  transformer=transformer,
374
375
  )
375
376
 
376
- if PYTHON_ENV != "Test":
377
- joblib.dump(self.data, f"{self.data_dir}/full.pkl")
378
- joblib.dump(transformer, f"{self.preprocessing_dir}/column_transformer.pkl")
379
- summary = summarize_dataframe(train)
380
- summary.to_csv(f"{self.dataset_dir}/feature_summary.csv", index=False)
377
+ joblib.dump(self.data, f"{self.data_dir}/full.pkl")
378
+ joblib.dump(transformer, f"{self.preprocessing_dir}/column_transformer.pkl")
379
+ summary = summarize_dataframe(train)
380
+ summary.to_csv(f"{self.dataset_dir}/feature_summary.csv", index=False)
381
381
 
382
382
  return train, val, test
383
383
 
@@ -579,8 +579,8 @@ class PreprocessFeature:
579
579
  columns_ordinal: list[str] = self.columns_ordinal
580
580
  columns_frequency: list[str] = self.columns_frequency
581
581
 
582
- X = df.loc[:, ~df.columns.str.contains("^target_")]
583
- y = df.loc[:, df.columns.str.contains("^target_")]
582
+ X = df.loc[:, ~df.columns.str.contains("^TARGET_")]
583
+ y = df.loc[:, df.columns.str.contains("^TARGET_")]
584
584
  save_in_db = False
585
585
 
586
586
  all_columns = (
@@ -643,7 +643,6 @@ class PreprocessFeature:
643
643
 
644
644
  # Try to convert columns to best possible dtypes
645
645
  X_transformed = X_transformed.convert_dtypes()
646
- X_transformed.columns = X_transformed.columns.str.upper()
647
646
 
648
647
  # Insert features in db
649
648
  if save_in_db:
lecrapaud/feature_selection.py CHANGED
@@ -37,7 +37,7 @@ from sklearn.preprocessing import StandardScaler, MinMaxScaler
37
37
  from scipy.stats import spearmanr, kendalltau
38
38
 
39
39
  # Internal
40
- from lecrapaud.directory_management import tmp_dir, clean_directory
40
+ from lecrapaud.directories import tmp_dir, clean_directory
41
41
  from lecrapaud.utils import logger
42
42
  from lecrapaud.config import PYTHON_ENV
43
43
  from lecrapaud.db import (
@@ -50,10 +50,6 @@ from lecrapaud.db import (
50
50
  from lecrapaud.db.session import get_db
51
51
  from lecrapaud.search_space import all_models
52
52
 
53
- # Variables for targets handling
54
- TARGETS_MCLF = [11]
55
- GROUPING_COLUMN = "STOCK"
56
-
57
53
  # Annoying Warnings
58
54
  warnings.filterwarnings("ignore", category=FutureWarning)
59
55
 
@@ -103,7 +99,7 @@ class FeatureSelectionEngine:
103
99
  # Main feature selection function
104
100
  def run(
105
101
  self,
106
- single_process: bool = False,
102
+ single_process: bool = True,
107
103
  ):
108
104
  """Function to do feature selection with a range of different feature selection technics
109
105
 
@@ -114,10 +110,7 @@ class FeatureSelectionEngine:
114
110
  """
115
111
  target_number = self.target_number
116
112
  target_type = self.target_type
117
- if PYTHON_ENV != "Test":
118
- fs_dir_target = self.fs_dir_target
119
- else:
120
- fs_dir_target = None
113
+ fs_dir_target = self.fs_dir_target
121
114
 
122
115
  # Create the feature selection in db
123
116
  target = Target.find_by(name=f"TARGET_{target_number}")
@@ -162,7 +155,7 @@ class FeatureSelectionEngine:
162
155
  # handling categorical features (only if classification)
163
156
  self.X_categorical, self.X_numerical = get_features_by_types(self.X)
164
157
 
165
- if target_type == "classification":
158
+ if target_type == "classification" and self.X_categorical.shape[1] > 0:
166
159
  feat_scores = self.select_categorical_features(
167
160
  percentile=percentile, save_dir=fs_dir_target
168
161
  )
@@ -292,24 +285,22 @@ class FeatureSelectionEngine:
292
285
  f"We selected {len(features_selected_list)} features and {len(features_selected_by_every_methods)} were selected unanimously:"
293
286
  )
294
287
  logger.debug(features_selected_by_every_methods)
295
- if PYTHON_ENV != "Test":
296
- pd.Series(features_selected_list).to_csv(
297
- f"{fs_dir_target}/features_before_corr.csv",
298
- index=True,
299
- header=True,
300
- index_label="ID",
301
- )
288
+ pd.Series(features_selected_list).to_csv(
289
+ f"{fs_dir_target}/features_before_corr.csv",
290
+ index=True,
291
+ header=True,
292
+ index_label="ID",
293
+ )
302
294
 
303
295
  # removing correlated features
304
296
  self.X = self.X[features_selected_list]
305
297
  features, features_correlated = self.remove_correlated_features(corr_threshold)
306
- if PYTHON_ENV != "Test":
307
- pd.Series(features).to_csv(
308
- f"{fs_dir_target}/features_before_max.csv",
309
- index=True,
310
- header=True,
311
- index_label="ID",
312
- )
298
+ pd.Series(features).to_csv(
299
+ f"{fs_dir_target}/features_before_max.csv",
300
+ index=True,
301
+ header=True,
302
+ index_label="ID",
303
+ )
313
304
  features = features[:max_features]
314
305
 
315
306
  # adding categorical features selected
@@ -337,8 +328,7 @@ class FeatureSelectionEngine:
337
328
  best_features_path = Path(
338
329
  f"{self.preprocessing_dir}/features_{target_number}.pkl"
339
330
  ).resolve()
340
- if PYTHON_ENV != "Test":
341
- joblib.dump(features, best_features_path)
331
+ joblib.dump(features, best_features_path)
342
332
 
343
333
  # save in db
344
334
  db_features = Feature.filter(name__in=features)
@@ -798,6 +788,7 @@ class PreprocessModel:
798
788
 
799
789
  self.dataset_dir = dataset.path
800
790
  self.data_dir = f"{self.dataset_dir}/data"
791
+ self.preprocessing_dir = f"{self.dataset_dir}/preprocessing"
801
792
 
802
793
  self.all_features = dataset.get_all_features(
803
794
  date_column=date_column, group_column=group_column
@@ -819,31 +810,23 @@ class PreprocessModel:
819
810
 
820
811
  def run(self):
821
812
  # save data
822
- if PYTHON_ENV != "Test":
823
- joblib.dump(self.train, f"{self.data_dir}/train.pkl")
824
- joblib.dump(self.val, f"{self.data_dir}/val.pkl")
825
- joblib.dump(self.test, f"{self.data_dir}/test.pkl")
826
- preprocessing_dir = f"{self.dataset_dir}/preprocessing"
827
- else:
828
- preprocessing_dir = None
813
+ joblib.dump(self.train, f"{self.data_dir}/train.pkl")
814
+ joblib.dump(self.val, f"{self.data_dir}/val.pkl")
815
+ joblib.dump(self.test, f"{self.data_dir}/test.pkl")
829
816
 
830
817
  # scaling features
831
818
  if any(t not in self.target_clf for t in self.target_numbers) and any(
832
819
  all_models[i].get("need_scaling") for i in self.models_idx
833
820
  ):
834
821
  logger.info("Scaling features...")
835
- train_scaled, scaler_x, scalers_y = self.scale_data(
836
- self.train, save_dir=preprocessing_dir
837
- )
822
+ train_scaled, scaler_x, scalers_y = self.scale_data(self.train)
838
823
  val_scaled, _, _ = self.scale_data(
839
824
  self.val,
840
- save_dir=preprocessing_dir,
841
825
  scaler_x=scaler_x,
842
826
  scalers_y=scalers_y,
843
827
  )
844
828
  test_scaled, _, _ = self.scale_data(
845
829
  self.test,
846
- save_dir=preprocessing_dir,
847
830
  scaler_x=scaler_x,
848
831
  scalers_y=scalers_y,
849
832
  )
@@ -853,10 +836,9 @@ class PreprocessModel:
853
836
  test_scaled = None
854
837
 
855
838
  # save data
856
- if PYTHON_ENV != "Test":
857
- joblib.dump(train_scaled, f"{self.data_dir}/train_scaled.pkl")
858
- joblib.dump(val_scaled, f"{self.data_dir}/val_scaled.pkl")
859
- joblib.dump(test_scaled, f"{self.data_dir}/test_scaled.pkl")
839
+ joblib.dump(train_scaled, f"{self.data_dir}/train_scaled.pkl")
840
+ joblib.dump(val_scaled, f"{self.data_dir}/val_scaled.pkl")
841
+ joblib.dump(test_scaled, f"{self.data_dir}/test_scaled.pkl")
860
842
 
861
843
  data = {
862
844
  "train": self.train,
@@ -923,7 +905,6 @@ class PreprocessModel:
923
905
  def scale_data(
924
906
  self,
925
907
  df: pd.DataFrame,
926
- save_dir: str,
927
908
  scaler_x=None,
928
909
  scalers_y: Optional[list] = None,
929
910
  ):
@@ -939,8 +920,7 @@ class PreprocessModel:
939
920
  X_scaled = pd.DataFrame(
940
921
  scaler_x.fit_transform(X), columns=list(X.columns), index=X.index
941
922
  )
942
- if save_dir:
943
- joblib.dump(scaler_x, f"{save_dir}/scaler_x.pkl")
923
+ joblib.dump(scaler_x, f"{self.preprocessing_dir}/scaler_x.pkl")
944
924
 
945
925
  # Determine which targets need to be scaled
946
926
  targets_numbers_to_scale = [
@@ -969,8 +949,9 @@ class PreprocessModel:
969
949
  columns=y.columns,
970
950
  index=y.index,
971
951
  )
972
- if save_dir:
973
- joblib.dump(scaler_y, f"{save_dir}/scaler_y_{target_number}.pkl")
952
+ joblib.dump(
953
+ scaler_y, f"{self.preprocessing_dir}/scaler_y_{target_number}.pkl"
954
+ )
974
955
 
975
956
  scalers_y[f"scaler_y_{target_number}"] = scaler_y
976
957
  scaled_targets[target_number] = scaled_y
lecrapaud/model_selection.py CHANGED
@@ -65,7 +65,7 @@ from ray.air import session
65
65
 
66
66
  # Internal library
67
67
  from lecrapaud.search_space import all_models
68
- from lecrapaud.directory_management import clean_directory
68
+ from lecrapaud.directories import clean_directory
69
69
  from lecrapaud.utils import copy_any, contains_best, logger, serialize_for_json
70
70
  from lecrapaud.config import PYTHON_ENV
71
71
  from lecrapaud.feature_selection import load_train_data
@@ -120,8 +120,9 @@ class ModelEngine:
120
120
  plot: bool = False,
121
121
  log_dir: str = None,
122
122
  ):
123
+ self.path = path
123
124
  if path:
124
- self.load(path)
125
+ self.load()
125
126
  else:
126
127
  self.model_name = model_name
127
128
  self.target_type = target_type
@@ -134,6 +135,7 @@ class ModelEngine:
134
135
  f"Model {self.model_name} is not supported by this library."
135
136
  f"Choose a model from the list of supported models: {[model['model_name'] for model in all_models].join(', ')}"
136
137
  )
138
+ config = config[0]
137
139
 
138
140
  self.recurrent = config["recurrent"]
139
141
  self.need_scaling = config["need_scaling"]
@@ -147,7 +149,7 @@ class ModelEngine:
147
149
  else:
148
150
  self.scaler_y = None
149
151
 
150
- self.path = path
152
+ self.threshold = None
151
153
 
152
154
  def fit(self, *args):
153
155
  if self.recurrent:
@@ -629,9 +631,6 @@ class ModelEngine:
629
631
  self.model_name = self._model.model_name
630
632
  self.target_type = self._model.target_type
631
633
 
632
- def __getattr__(self, attr):
633
- return getattr(self._model, attr)
634
-
635
634
 
636
635
  def trainable(
637
636
  params,
@@ -778,20 +777,20 @@ class ModelSelectionEngine:
778
777
  raise ValueError("Please provide a dataset.")
779
778
 
780
779
  if self.data:
781
- self.train = self.data["train"]
782
- self.val = self.data["val"]
783
- self.test = self.data["test"]
784
- self.train_scaled = self.data["train_scaled"]
785
- self.val_scaled = self.data["val_scaled"]
786
- self.test_scaled = self.data["test_scaled"]
780
+ train = self.data["train"]
781
+ val = self.data["val"]
782
+ test = self.data["test"]
783
+ train_scaled = self.data["train_scaled"]
784
+ val_scaled = self.data["val_scaled"]
785
+ test_scaled = self.data["test_scaled"]
787
786
  else:
788
787
  (
789
- self.train,
790
- self.val,
791
- self.test,
792
- self.train_scaled,
793
- self.val_scaled,
794
- self.test_scaled,
788
+ train,
789
+ val,
790
+ test,
791
+ train_scaled,
792
+ val_scaled,
793
+ test_scaled,
795
794
  ) = load_train_data(self.dataset_dir, self.target_number, self.target_clf)
796
795
 
797
796
  if (
@@ -810,10 +809,12 @@ class ModelSelectionEngine:
810
809
  raise ValueError("reshaped_data is not provided.")
811
810
 
812
811
  logger.info("Loading reshaped data...")
813
- self.x_train_reshaped = self.reshaped_data["x_train_reshaped"]
814
- self.y_train_reshaped = self.reshaped_data["y_train_reshaped"]
815
- self.x_val_reshaped = self.reshaped_data["x_val_reshaped"]
816
- self.y_val_reshaped = self.reshaped_data["y_val_reshaped"]
812
+ x_train_reshaped = self.reshaped_data["x_train_reshaped"]
813
+ y_train_reshaped = self.reshaped_data["y_train_reshaped"]
814
+ x_val_reshaped = self.reshaped_data["x_val_reshaped"]
815
+ y_val_reshaped = self.reshaped_data["y_val_reshaped"]
816
+ x_test_reshaped = self.reshaped_data["x_test_reshaped"]
817
+ y_test_reshaped = self.reshaped_data["y_test_reshaped"]
817
818
 
818
819
  # create model selection in db
819
820
  target = Target.find_by(name=f"TARGET_{self.target_number}")
@@ -864,42 +865,41 @@ class ModelSelectionEngine:
864
865
  if e in set(self.features)
865
866
  ]
866
867
  # TODO: Verify that features_idx are the right one, because scaling can re-arrange columns...
867
- self.x_train = self.x_train_reshaped[:, :, features_idx]
868
- self.y_train = self.y_train_reshaped[:, [self.target_number, 0]]
869
- self.x_val = self.x_val_reshaped[:, :, features_idx]
870
- self.y_val = self.y_val_reshaped[:, [self.target_number, 0]]
868
+ x_train = x_train_reshaped[:, :, features_idx]
869
+ y_train = y_train_reshaped[:, [self.target_number, 0]]
870
+ x_val = x_val_reshaped[:, :, features_idx]
871
+ y_val = y_val_reshaped[:, [self.target_number, 0]]
872
+ x_test = x_test_reshaped[:, :, features_idx]
873
+ y_test = y_test_reshaped[:, [self.target_number, 0]]
871
874
  else:
872
875
  config = config[self.target_type]
873
876
 
874
877
  if need_scaling and self.target_type == "regression":
875
- self.x_train = self.train_scaled[self.features]
876
- self.y_train = self.train_scaled[
877
- f"TARGET_{self.target_number}"
878
- ].rename("TARGET")
879
- self.x_val = self.val_scaled[self.features]
880
- self.y_val = self.val_scaled[f"TARGET_{self.target_number}"].rename(
878
+ x_train = train_scaled[self.features]
879
+ y_train = train_scaled[f"TARGET_{self.target_number}"].rename(
881
880
  "TARGET"
882
881
  )
883
- else:
884
- self.x_train = self.train[self.features]
885
- self.y_train = self.train[f"TARGET_{self.target_number}"].rename(
886
- "TARGET"
887
- )
888
- self.x_val = self.val[self.features]
889
- self.y_val = self.val[f"TARGET_{self.target_number}"].rename(
882
+ x_val = val_scaled[self.features]
883
+ y_val = val_scaled[f"TARGET_{self.target_number}"].rename("TARGET")
884
+ x_test = test_scaled[self.features]
885
+ y_test = test_scaled[f"TARGET_{self.target_number}"].rename(
890
886
  "TARGET"
891
887
  )
888
+ else:
889
+ x_train = train[self.features]
890
+ y_train = train[f"TARGET_{self.target_number}"].rename("TARGET")
891
+ x_val = val[self.features]
892
+ y_val = val[f"TARGET_{self.target_number}"].rename("TARGET")
893
+ x_test = test[self.features]
894
+ y_test = test[f"TARGET_{self.target_number}"].rename("TARGET")
892
895
 
893
896
  log_dir = get_log_dir(self.training_target_dir, model_name)
894
897
  # instantiate model
895
898
  model = ModelEngine(
896
899
  model_name=model_name,
897
- recurrent=recurrent,
898
- need_scaling=need_scaling,
899
900
  search_params=config["search_params"],
900
901
  target_type=self.target_type,
901
902
  create_model=config["create_model"],
902
- scaler_y=self.scaler_y,
903
903
  plot=self.plot,
904
904
  log_dir=log_dir,
905
905
  )
@@ -907,7 +907,7 @@ class ModelSelectionEngine:
907
907
  start = time.time()
908
908
  # Tuning hyperparameters
909
909
  if perform_hyperopt:
910
- best_params = self.hyperoptimize(model)
910
+ best_params = self.hyperoptimize(x_train, y_train, x_val, y_val, model)
911
911
 
912
912
  # save best params
913
913
  best_params_file = f"{self.training_target_dir}/best_params.json"
@@ -932,8 +932,8 @@ class ModelSelectionEngine:
932
932
 
933
933
  # Perform cross-validation of the best model on k-folds of train + val set
934
934
  if perform_crossval:
935
- x_train_val = pd.concat([self.x_train, self.x_val, self.x_test], axis=0)
936
- y_train_val = pd.concat([self.y_train, self.y_val, self.y_test], axis=0)
935
+ x_train_val = pd.concat([x_train, x_val, x_test], axis=0)
936
+ y_train_val = pd.concat([y_train, y_val, y_test], axis=0)
937
937
  n_splits = 4
938
938
  n_samples = len(x_train_val)
939
939
  test_size = int(n_samples / (n_splits + 4))
@@ -946,7 +946,7 @@ class ModelSelectionEngine:
946
946
  self.type_name = f"crossval_fold_{i}"
947
947
 
948
948
  if self.time_series:
949
- date_series = self.train[self.date_column].copy()
949
+ date_series = train[self.date_column].copy()
950
950
 
951
951
  if need_scaling:
952
952
  date_series = date_series.map(pd.Timestamp.fromordinal)
@@ -1000,10 +1000,10 @@ class ModelSelectionEngine:
1000
1000
  # Retrain on entire training set, but keep score on cross-validation folds
1001
1001
  best_score, best_model, best_pred = self.train_model(
1002
1002
  params=best_params,
1003
- x_train=pd.concat([self.x_train, self.x_val], axis=0),
1004
- y_train=pd.concat([self.y_train, self.y_val], axis=0),
1005
- x_val=self.x_test,
1006
- y_val=self.y_test,
1003
+ x_train=pd.concat([x_train, x_val], axis=0),
1004
+ y_train=pd.concat([y_train, y_val], axis=0),
1005
+ x_val=x_test,
1006
+ y_val=y_test,
1007
1007
  model=model,
1008
1008
  )
1009
1009
  best_score = cross_validation_mean_score
@@ -1012,10 +1012,10 @@ class ModelSelectionEngine:
1012
1012
  self.type_name = "validation"
1013
1013
  best_score, best_model, best_pred = self.train_model(
1014
1014
  params=best_params,
1015
- x_train=pd.concat([self.x_train, self.x_val], axis=0),
1016
- y_train=pd.concat([self.y_train, self.y_val], axis=0),
1017
- x_val=self.x_test,
1018
- y_val=self.y_test,
1015
+ x_train=pd.concat([x_train, x_val], axis=0),
1016
+ y_train=pd.concat([y_train, y_val], axis=0),
1017
+ x_val=x_test,
1018
+ y_val=y_test,
1019
1019
  model=model,
1020
1020
  )
1021
1021
 
@@ -1117,7 +1117,7 @@ class ModelSelectionEngine:
1117
1117
 
1118
1118
  logger.info(f"Best model overall is : {best_score_overall}")
1119
1119
 
1120
- def hyperoptimize(self, model: ModelEngine):
1120
+ def hyperoptimize(self, x_train, y_train, x_val, y_val, model: ModelEngine):
1121
1121
  self.type_name = "hyperopts"
1122
1122
 
1123
1123
  def collect_error_logs(training_target_dir: int, storage_path: str):
@@ -1143,10 +1143,10 @@ class ModelSelectionEngine:
1143
1143
  tuner = Tuner(
1144
1144
  trainable=with_parameters(
1145
1145
  trainable,
1146
- x_train=self.x_train,
1147
- y_train=self.y_train,
1148
- x_val=self.x_val,
1149
- y_val=self.y_val,
1146
+ x_train=x_train,
1147
+ y_train=y_train,
1148
+ x_val=x_val,
1149
+ y_val=y_val,
1150
1150
  model_name=model.model_name,
1151
1151
  target_type=self.target_type,
1152
1152
  session_name=self.session_name,
lecrapaud/utils.py CHANGED
@@ -10,7 +10,7 @@ import unicodedata
10
10
  import re
11
11
  import string
12
12
 
13
- from lecrapaud.directory_management import logger_dir
13
+ from lecrapaud.directories import logger_dir
14
14
  from lecrapaud.config import LOGGING_LEVEL, PYTHON_ENV
15
15
 
16
16
  _LOGGER_ALREADY_CONFIGURED = False