lecrapaud 0.18.7__py3-none-any.whl → 0.22.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. lecrapaud/__init__.py +22 -1
  2. lecrapaud/{api.py → base.py} +331 -241
  3. lecrapaud/config.py +15 -3
  4. lecrapaud/db/alembic/versions/2025_08_25_1434-7ed9963e732f_add_best_score_to_model_selection.py +9 -4
  5. lecrapaud/db/alembic/versions/2025_08_28_1516-c36e9fee22b9_add_avg_precision_to_score.py +34 -0
  6. lecrapaud/db/alembic/versions/2025_08_28_1622-8b11c1ba982e_change_name_column.py +44 -0
  7. lecrapaud/db/alembic/versions/2025_10_25_0635-07e303521594_add_unique_constraint_to_score.py +39 -0
  8. lecrapaud/db/alembic/versions/2025_10_26_1727-033e0f7eca4f_merge_score_and_model_trainings_into_.py +264 -0
  9. lecrapaud/db/alembic/versions/2025_10_28_2006-0a8fb7826e9b_add_number_of_targets_and_remove_other_.py +75 -0
  10. lecrapaud/db/models/__init__.py +2 -4
  11. lecrapaud/db/models/base.py +122 -67
  12. lecrapaud/db/models/experiment.py +196 -183
  13. lecrapaud/db/models/feature_selection.py +0 -3
  14. lecrapaud/db/models/feature_selection_rank.py +0 -18
  15. lecrapaud/db/models/model_selection.py +2 -2
  16. lecrapaud/db/models/{score.py → model_selection_score.py} +30 -12
  17. lecrapaud/db/session.py +33 -4
  18. lecrapaud/experiment.py +44 -17
  19. lecrapaud/feature_engineering.py +45 -674
  20. lecrapaud/feature_preprocessing.py +1202 -0
  21. lecrapaud/feature_selection.py +145 -332
  22. lecrapaud/integrations/sentry_integration.py +46 -0
  23. lecrapaud/misc/tabpfn_tests.ipynb +2 -2
  24. lecrapaud/mixins.py +247 -0
  25. lecrapaud/model_preprocessing.py +295 -0
  26. lecrapaud/model_selection.py +725 -249
  27. lecrapaud/pipeline.py +548 -0
  28. lecrapaud/search_space.py +38 -1
  29. lecrapaud/utils.py +36 -3
  30. lecrapaud-0.22.6.dist-info/METADATA +423 -0
  31. lecrapaud-0.22.6.dist-info/RECORD +51 -0
  32. {lecrapaud-0.18.7.dist-info → lecrapaud-0.22.6.dist-info}/WHEEL +1 -1
  33. {lecrapaud-0.18.7.dist-info → lecrapaud-0.22.6.dist-info/licenses}/LICENSE +1 -1
  34. lecrapaud/db/models/model_training.py +0 -64
  35. lecrapaud/jobs/__init__.py +0 -13
  36. lecrapaud/jobs/config.py +0 -17
  37. lecrapaud/jobs/scheduler.py +0 -30
  38. lecrapaud/jobs/tasks.py +0 -17
  39. lecrapaud-0.18.7.dist-info/METADATA +0 -248
  40. lecrapaud-0.18.7.dist-info/RECORD +0 -46
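
At a glance, the refactor renames the 0.18.7 engine classes to fit/transform-style components. The mapping below is taken from the import and class hunks that follow; file locations in parentheses reflect the renames in the file list above:

```python
# Old name (0.18.7)          -> New name (0.22.6)
# FeatureEngineeringEngine   -> FeatureEngineering    (feature_engineering.py)
# PreprocessFeature          -> FeaturePreprocessor   (new feature_preprocessing.py)
# FeatureSelectionEngine     -> FeatureSelector       (feature_selection.py)
# PreprocessModel            -> ModelPreprocessor     (new model_preprocessing.py)
# ModelSelectionEngine       -> ModelSelector         (model_selection.py)
# ModelEngine                -> BaseModel             (model_selection.py)
# ExperimentEngine           -> folded into LeCrapaud (api.py -> base.py)
```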
@@ -5,24 +5,22 @@ It allows for end-to-end ML workflows including data preprocessing, feature engi
 model training, and prediction.
 
 Basic Usage:
-    # Create a LeCrapaud instance
-    lc = LeCrapaud()
-
     # Create a new experiment
-    experiment = lc.create_experiment(data, target_numbers=[1], target_clf=[1])
+    experiment = LeCrapaud(data=data, target_numbers=[1], target_clf=[1])
 
-    # Train a model
-    best_features, artifacts, best_model = experiment.train(data)
+    # Train the model
+    experiment.fit(data)
 
     # Make predictions
     predictions, scores_reg, scores_clf = experiment.predict(new_data)
 
-    # Or use individual pipeline steps:
-    processed_data = experiment.feature_engineering(data)  # Feature engineering
-    train, val, test = experiment.preprocess_feature(data)  # Data splitting and encoding
-    selected_features = experiment.feature_selection(train)  # Feature selection
-    model_data = experiment.preprocess_model(train, val, test)  # Model preprocessing
-    best_model = experiment.model_selection(model_data)  # Model selection
+    # Load existing experiment
+    experiment = LeCrapaud(id=123)
+    predictions = experiment.predict(new_data)
+
+    # Class methods for experiment management
+    best_exp = LeCrapaud.get_best_experiment_by_name('my_experiment')
+    all_exps = LeCrapaud.list_experiments('my_experiment')
 """
 
 import joblib
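
Taken together, the docstring changes above describe this migration. A minimal sketch, assuming the class is exported at the package root (as the `__init__.py` changes suggest) and a DataFrame whose target column follows the `TARGET_<n>` naming convention used throughout the package:

```python
import pandas as pd
from lecrapaud import LeCrapaud

# Toy frame; real usage needs enough rows for the train/val/test split
data = pd.DataFrame({"feature_a": range(100), "TARGET_1": [0, 1] * 50})

# 0.18.7: lc = LeCrapaud(); experiment = lc.create_experiment(data, ...)
# 0.22.6: the LeCrapaud instance *is* the experiment
experiment = LeCrapaud(data=data, target_numbers=[1], target_clf=[1])
experiment.fit(data)

# predict() appends TARGET_<n>_PRED columns and returns score frames
predictions, scores_reg, scores_clf = experiment.predict(data)

# Reload later by database id
experiment = LeCrapaud(id=experiment.experiment.id)
```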
@@ -34,84 +32,172 @@ import seaborn as sns
 import numpy as np
 import matplotlib.pyplot as plt
 from lecrapaud.db.session import init_db
-from lecrapaud.feature_selection import FeatureSelectionEngine, PreprocessModel
+from lecrapaud.feature_selection import FeatureSelector
+from lecrapaud.model_preprocessing import ModelPreprocessor
 from lecrapaud.model_selection import (
-    ModelSelectionEngine,
-    ModelEngine,
+    ModelSelector,
+    BaseModel,
     evaluate,
     load_model,
     plot_threshold,
     plot_evaluation_for_classification,
 )
-from lecrapaud.feature_engineering import FeatureEngineeringEngine, PreprocessFeature
+from lecrapaud.feature_engineering import FeatureEngineering
+from lecrapaud.feature_preprocessing import FeaturePreprocessor
 from lecrapaud.experiment import create_experiment
 from lecrapaud.db import Experiment
-from lecrapaud.search_space import normalize_models_idx
+from lecrapaud.search_space import normalize_models_idx, all_models
 from lecrapaud.utils import logger
 from lecrapaud.directories import tmp_dir
 
 
 class LeCrapaud:
-    """Main class for interacting with the LeCrapaud ML pipeline.
+    """
+    Unified LeCrapaud class for machine learning experiments.
 
-    This class provides methods to create and retrieve experiments.
+    This class provides both the ML pipeline functionality and experiment management.
+    It can be initialized either with new data to create an experiment or with an
+    experiment ID to load an existing one.
 
-    Args:
-        uri (str, optional): Database connection URI. If None, uses default connection.
-    """
+    Usage:
+        # Create new experiment
+        experiment = LeCrapaud(data=df, target_numbers=[1, 2], ...)
 
-    def __init__(self, uri: str = None):
-        """Initialize LeCrapaud with optional database URI."""
-        init_db(uri=uri)
+        # Load existing experiment
+        experiment = LeCrapaud(id=123)
 
-    def create_experiment(self, data: pd.DataFrame, **kwargs) -> "ExperimentEngine":
-        """Create a new experiment.
+        # Train the model
+        experiment.fit(data)
 
-        Args:
-            data (pd.DataFrame): Input data for the experiment
-            **kwargs: Additional arguments to configure the experiment
+        # Make predictions
+        predictions = experiment.predict(new_data)
 
-        Returns:
-            ExperimentEngine: A new experiment instance
-        """
-        return ExperimentEngine(data=data, **kwargs)
+    Args:
+        id (int, optional): ID of an existing experiment to load
+        data (pd.DataFrame, optional): Input data for a new experiment
+        uri (str, optional): Database connection URI
+        **kwargs: Additional configuration parameters
+    """
 
-    def get_experiment(self, id: int, **kwargs) -> "ExperimentEngine":
-        """Retrieve an existing experiment by ID.
+    def __init__(
+        self, id: int = None, data: pd.DataFrame = None, uri: str = None, **kwargs
+    ):
+        """Initialize LeCrapaud with either new or existing experiment."""
+        # Initialize database connection
+        init_db(uri=uri)
 
-        Args:
-            id (int): The ID of the experiment to retrieve
-            **kwargs: Additional arguments to pass to the experiment
+        if id:
+            # Load existing experiment
+            self.experiment = Experiment.get(id)
+            # Context from DB takes precedence over kwargs
+            effective_kwargs = {
+                **self.DEFAULT_PARAMS,
+                **kwargs,
+                **self.experiment.context,
+            }
+        else:
+            if data is None:
+                raise ValueError(
+                    "Either id or data must be provided. Data can be a path to a folder containing trained models"
+                )
+            # New experiment: merge defaults with provided kwargs
+            effective_kwargs = {**self.DEFAULT_PARAMS, **kwargs}
 
-        Returns:
-            ExperimentEngine: The retrieved experiment instance
-        """
-        return ExperimentEngine(id=id, **kwargs)
+        # Normalize models_idx if present
+        if "models_idx" in effective_kwargs:
+            effective_kwargs["models_idx"] = normalize_models_idx(
+                effective_kwargs["models_idx"]
+            )
+
+        # Set all parameters as instance attributes
+        for key, value in effective_kwargs.items():
+            setattr(self, key, value)
 
-    def get_last_experiment_by_name(self, name: str, **kwargs) -> "ExperimentEngine":
+        # Create experiment if new
+        if not id:
+            self.experiment = create_experiment(data=data, **effective_kwargs)
+
+        # Create directories
+        experiment_dir = f"{tmp_dir}/{self.experiment.name}"
+        preprocessing_dir = f"{experiment_dir}/preprocessing"
+        data_dir = f"{experiment_dir}/data"
+        os.makedirs(preprocessing_dir, exist_ok=True)
+        os.makedirs(data_dir, exist_ok=True)
+
+    # Default values for all experiment parameters
+    DEFAULT_PARAMS = {
+        # Feature Engineering
+        "columns_drop": [],
+        "columns_boolean": [],
+        "columns_date": [],
+        "columns_te_groupby": [],
+        "columns_te_target": [],
+        "for_training": True,
+        # Preprocessing
+        "time_series": False,
+        "val_size": 0.2,
+        "test_size": 0.2,
+        "columns_pca": [],
+        "pca_temporal": [],
+        "pca_cross_sectional": [],
+        "columns_onehot": [],
+        "columns_binary": [],
+        "columns_ordinal": [],
+        "columns_frequency": [],
+        # Feature Selection
+        "percentile": 20,
+        "corr_threshold": 80,
+        "max_features": 50,
+        "max_p_value_categorical": 0.05,
+        # Model Selection
+        "target_numbers": [],
+        "target_clf": [],
+        "models_idx": [],
+        "max_timesteps": 120,
+        "perform_hyperopt": True,
+        "number_of_trials": 20,
+        "perform_crossval": False,
+        "plot": True,
+        "preserve_model": True,
+        "target_clf_thresholds": {},
+        # Data structure
+        "date_column": None,
+        "group_column": None,
+    }
+
+    @classmethod
+    def get_default_params(cls):
+        """Get the default parameters for experiments."""
+        return cls.DEFAULT_PARAMS.copy()
+
+    def get_effective_context(self):
+        """Get the effective context (merged defaults + experiment context)."""
+        return {k: getattr(self, k, v) for k, v in self.DEFAULT_PARAMS.items()}
+
+    @classmethod
+    def get_last_experiment_by_name(cls, name: str, **kwargs):
         """Retrieve the last experiment by name."""
-        return ExperimentEngine(id=Experiment.get_last_by_name(name).id, **kwargs)
+        return cls(id=Experiment.get_last_by_name(name).id, **kwargs)
 
-    def get_best_experiment_by_name(
-        self, name: str, metric: str = "both", **kwargs
-    ) -> "ExperimentEngine":
+    @classmethod
+    def get_best_experiment_by_name(cls, name: str, **kwargs):
         """Retrieve the best experiment by score."""
-        best_exp = Experiment.get_best_by_score(name=name, metric=metric)
+        best_exp = Experiment.get_best_by_score(name=name)
         if not best_exp:
             return None
-        return ExperimentEngine(id=best_exp.id, **kwargs)
+        return cls(id=best_exp.id, **kwargs)
 
-    def compare_experiment_scores(self, name: str):
-        """Compare scores of experiments with matching names.
+    @classmethod
+    def list_experiments(cls, name: str = None, limit: int = 1000):
+        """List all experiments in the database."""
+        return [
+            cls(id=exp.id) for exp in Experiment.get_all_by_name(name=name, limit=limit)
+        ]
 
-        Args:
-            name (str): Name or partial name of experiments to compare
-
-        Returns:
-            dict: Dictionary containing experiment names as keys and their scores as values
-        """
-        # Get all experiments with the given name pattern
-        experiments = self.list_experiments(name=name)
+    @classmethod
+    def compare_experiment_scores(cls, name: str):
+        """Compare scores of experiments with matching names."""
+        experiments = cls.list_experiments(name=name)
 
         if not experiments:
             return {"error": f"No experiments found with name containing '{name}'"}
@@ -120,9 +206,7 @@ class LeCrapaud:
 
         for exp in experiments:
             for model_sel in exp.experiment.model_selections:
-
                 if model_sel.best_score:
-
                     scores = {
                         "rmse": model_sel.best_score["rmse"],
                         "logloss": model_sel.best_score["logloss"],
@@ -131,7 +215,6 @@ class LeCrapaud:
                         "roc_auc": model_sel.best_score["roc_auc"],
                     }
                     target_name = model_sel.target.name
-
                     comparison[exp.experiment.name][target_name] = scores
                 else:
                     logger.warning(
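
Based on the loop above, `compare_experiment_scores` returns a mapping keyed by experiment name, then target name. Illustrative shape only: the metric values here are invented, and the diff elides some score keys between `logloss` and `roc_auc`:

```python
comparison = {
    "my_experiment": {
        "TARGET_1": {
            "rmse": 0.42,
            "logloss": 0.31,
            # ...remaining metrics copied from model_sel.best_score...
            "roc_auc": 0.88,
        },
    },
}
print(comparison["my_experiment"]["TARGET_1"]["roc_auc"])  # 0.88
```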
@@ -140,71 +223,143 @@
 
         return comparison
 
-    def list_experiments(
-        self, name: str = None, limit: int = 1000
-    ) -> list["ExperimentEngine"]:
-        """List all experiments in the database."""
-        return [
-            ExperimentEngine(id=exp.id)
-            for exp in Experiment.get_all_by_name(name=name, limit=limit)
-        ]
-
-
-class ExperimentEngine:
-    """Engine for managing ML experiments.
-
-    This class handles the complete ML pipeline including feature engineering,
-    model training, and prediction. It can be initialized with either new data
-    or by loading an existing experiment by ID.
-
-    Args:
-        id (int, optional): ID of an existing experiment to load
-        data (pd.DataFrame, optional): Input data for a new experiment
-        **kwargs: Additional configuration parameters
-    """
+    # Main ML Pipeline Methods
+    # ========================
 
-    def __init__(self, id: int = None, data: pd.DataFrame = None, **kwargs):
-        """Initialize the experiment engine with either new or existing experiment."""
-        if id:
-            self.experiment = Experiment.get(id)
-            kwargs.update(self.experiment.context)
-            experiment_dir = f"{tmp_dir}/{self.experiment.name}"
-            preprocessing_dir = f"{experiment_dir}/preprocessing"
-            data_dir = f"{experiment_dir}/data"
-            os.makedirs(preprocessing_dir, exist_ok=True)
-            os.makedirs(data_dir, exist_ok=True)
-        else:
-            if data is None:
-                raise ValueError(
-                    "Either id or data must be provided. Data can be a path to a folder containing trained models"
-                )
-            self.experiment = create_experiment(data=data, **kwargs)
+    def fit(self, data, best_params=None):
+        """
+        Fit the complete ML pipeline on the provided data.
 
-        # Set all kwargs as instance attributes
-        for key, value in kwargs.items():
-            if key == "models_idx":
-                value = normalize_models_idx(value)
-            setattr(self, key, value)
+        Args:
+            data (pd.DataFrame): Input training data
+            best_params (dict, optional): Pre-defined best parameters
 
-    def train(self, data, best_params=None):
+        Returns:
+            self: Returns self for chaining
+        """
         logger.info("Running training...")
 
-        data_eng = self.feature_engineering(data)
+        # Step 1: Feature Engineering
+        logger.info("Starting feature engineering...")
+        feature_eng = FeatureEngineering(experiment=self.experiment)
+        feature_eng.fit(data)
+        data_eng = feature_eng.get_data()
         logger.info("Feature engineering done.")
 
-        train, val, test = self.preprocess_feature(data_eng)
+        # Step 2: Feature Preprocessing (split data)
+        logger.info("Starting feature preprocessing...")
+        from lecrapaud.feature_preprocessing import split_data
+
+        train, val, test = split_data(data_eng, experiment=self.experiment)
+
+        # Apply feature preprocessing transformations
+        feature_preprocessor = FeaturePreprocessor(experiment=self.experiment)
+        feature_preprocessor.fit(train)
+        train = feature_preprocessor.transform(train)
+        if val is not None:
+            val = feature_preprocessor.transform(val)
+        if test is not None:
+            test = feature_preprocessor.transform(test)
         logger.info("Feature preprocessing done.")
 
-        self.feature_selection(train)
+        # Step 3: Feature Selection (for each target)
+        logger.info("Starting feature selection...")
+        for target_number in self.target_numbers:
+            feature_selector = FeatureSelector(
+                experiment=self.experiment, target_number=target_number
+            )
+            feature_selector.fit(train)
+
+        # Refresh experiment to get updated features
+        self.experiment = Experiment.get(self.experiment.id)
+        all_features = self.experiment.get_all_features(
+            date_column=self.date_column, group_column=self.group_column
+        )
+        joblib.dump(
+            all_features, f"{self.experiment.path}/preprocessing/all_features.pkl"
+        )
         logger.info("Feature selection done.")
 
-        std_data, reshaped_data = self.preprocess_model(train, val, test)
+        # Step 4: Model Preprocessing (scaling)
+        logger.info("Starting model preprocessing...")
+        model_preprocessor = ModelPreprocessor(experiment=self.experiment)
+
+        # Fit and transform training data, then transform val/test
+        model_preprocessor.fit(train)
+        train_scaled = model_preprocessor.transform(train)
+        val_scaled = model_preprocessor.transform(val) if val is not None else None
+        test_scaled = model_preprocessor.transform(test) if test is not None else None
+
+        # Create data dict for model selection (keep both raw and scaled splits)
+        std_data = {
+            "train": train,
+            "val": val,
+            "test": test,
+            "train_scaled": train_scaled,
+            "val_scaled": val_scaled,
+            "test_scaled": test_scaled,
+        }
+
+        # Handle time series reshaping if needed
+        reshaped_data = None
+        # Check if any model requires recurrent processing
+        need_reshaping = (
+            any(all_models[i].get("recurrent") for i in self.models_idx)
+            and self.time_series
+        )
+
+        if need_reshaping:
+            # Sanity check: make sure we have enough data for max_timesteps
+            if (
+                self.group_column
+                and train_scaled.groupby(self.group_column).size().min()
+                < self.max_timesteps
+            ) or train_scaled.shape[0] < self.max_timesteps:
+                raise ValueError(
+                    f"Not enough data for group_column {self.group_column} to reshape data for recurrent models"
+                )
+
+            from lecrapaud.model_preprocessing import reshape_time_series
+
+            features = self.experiment.get_all_features(
+                date_column=self.date_column, group_column=self.group_column
+            )
+            reshaped_data = reshape_time_series(
+                self.experiment,
+                features,
+                train_scaled,
+                val_scaled,
+                test_scaled,
+                timesteps=self.max_timesteps,
+            )
         logger.info("Model preprocessing done.")
 
-        self.model_selection(std_data, reshaped_data, best_params=best_params)
+        # Step 5: Model Selection (for each target)
+        logger.info("Starting model selection...")
+        self.models_ = {}
+        for target_number in self.target_numbers:
+            model_selector = ModelSelector(
+                experiment=self.experiment, target_number=target_number
+            )
+            model_selector.fit(
+                std_data, reshaped_data=reshaped_data, best_params=best_params
+            )
+            self.models_[target_number] = model_selector.get_best_model()
         logger.info("Model selection done.")
 
+        return self
+
     def predict(self, new_data, verbose: int = 0):
+        """
+        Make predictions on new data using the trained pipeline.
+
+        Args:
+            new_data (pd.DataFrame): Input data for prediction
+            verbose (int): Verbosity level (0=warnings only, 1=all logs)
+
+        Returns:
+            tuple: (predictions_df, scores_regression, scores_classification)
+        """
         # for scores if TARGET is in columns
         scores_reg = []
         scores_clf = []
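
The old per-step methods (`feature_engineering`, `preprocess_feature`, `feature_selection`, `preprocess_model`, `model_selection`) are gone; `fit()` now drives the component classes directly. If you still need the steps individually, the hunk above implies this component surface. A hedged sketch, with signatures inferred from this diff only:

```python
from lecrapaud.feature_engineering import FeatureEngineering
from lecrapaud.feature_preprocessing import FeaturePreprocessor, split_data
from lecrapaud.feature_selection import FeatureSelector
from lecrapaud.model_preprocessing import ModelPreprocessor
from lecrapaud.model_selection import ModelSelector

def run_pipeline_steps(experiment, data, target_numbers):
    # Step 1: feature engineering (fit, then fetch the engineered frame)
    fe = FeatureEngineering(experiment=experiment)
    fe.fit(data)
    engineered = fe.get_data()

    # Step 2: split, then fit the feature preprocessor on train only
    train, val, test = split_data(engineered, experiment=experiment)
    fp = FeaturePreprocessor(experiment=experiment)
    fp.fit(train)
    train = fp.transform(train)
    val = fp.transform(val) if val is not None else None
    test = fp.transform(test) if test is not None else None

    # Step 3: per-target feature selection
    for n in target_numbers:
        FeatureSelector(experiment=experiment, target_number=n).fit(train)

    # Step 4: scaling; model selection expects raw and scaled splits side by side
    mp = ModelPreprocessor(experiment=experiment)
    mp.fit(train)
    std_data = {
        "train": train, "val": val, "test": test,
        "train_scaled": mp.transform(train),
        "val_scaled": mp.transform(val) if val is not None else None,
        "test_scaled": mp.transform(test) if test is not None else None,
    }

    # Step 5: per-target model selection (reshaped_data only for recurrent models)
    best = {}
    for n in target_numbers:
        ms = ModelSelector(experiment=experiment, target_number=n)
        ms.fit(std_data, reshaped_data=None)
        best[n] = ms.get_best_model()
    return best
```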
@@ -214,28 +369,67 @@
 
         logger.warning("Running prediction...")
 
-        # feature engineering + preprocessing
-        data = self.feature_engineering(
-            data=new_data,
-            for_training=False,
-        )
-        data = self.preprocess_feature(data, for_training=False)
-        data, scaled_data, reshaped_data = self.preprocess_model(
-            data, for_training=False
+        # Apply the same preprocessing pipeline as training
+        # Step 1: Feature Engineering
+        feature_eng = FeatureEngineering(experiment=self.experiment)
+        feature_eng.fit(new_data)
+        data = feature_eng.get_data()
+
+        # Step 2: Feature Preprocessing (no splitting for prediction)
+        feature_preprocessor = FeaturePreprocessor(experiment=self.experiment)
+        # Load existing transformations and apply
+        data = feature_preprocessor.transform(data)
+
+        # Step 3: Model Preprocessing (scaling)
+        model_preprocessor = ModelPreprocessor(experiment=self.experiment)
+        # Apply existing scaling
+        scaled_data = model_preprocessor.transform(data)
+
+        # Step 4: Time series reshaping if needed
+        reshaped_data = None
+        # Check if any model requires recurrent processing
+        need_reshaping = (
+            any(all_models[i].get("recurrent") for i in self.models_idx)
+            and self.time_series
         )
 
-        for target_number in self.target_numbers:
+        if need_reshaping:
+            # Sanity check: make sure we have enough data for max_timesteps
+            if (
+                self.group_column
+                and scaled_data.groupby(self.group_column).size().min()
+                < self.max_timesteps
+            ) or scaled_data.shape[0] < self.max_timesteps:
+                raise ValueError(
+                    f"Not enough data for group_column {self.group_column} to reshape data for recurrent models"
+                )
 
-            # loading model
+            from lecrapaud.model_preprocessing import reshape_time_series
+
+            all_features = self.experiment.get_all_features(
+                date_column=self.date_column, group_column=self.group_column
+            )
+            # For prediction, we reshape the entire dataset
+            reshaped_data = reshape_time_series(
+                self.experiment, all_features, scaled_data, timesteps=self.max_timesteps
+            )
+            reshaped_data = reshaped_data[
+                "x_train_reshaped"
+            ]  # Only need X data for prediction
+
+        # Step 5: Predict for each target
+        for target_number in self.target_numbers:
+            # Load the trained model
             target_dir = f"{self.experiment.path}/TARGET_{target_number}"
+            model = BaseModel(path=target_dir, target_number=target_number)
+
+            # Get features for this target
             all_features = self.experiment.get_all_features(
                 date_column=self.date_column, group_column=self.group_column
             )
             features = self.experiment.get_features(target_number)
 
-            model = ModelEngine(path=target_dir, target_number=target_number)
-
-            # getting data
+            # Prepare prediction data
             if model.recurrent:
                 features_idx = [
                     i for i, e in enumerate(all_features) if e in set(features)
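
`reshape_time_series` itself is not shown in this diff; the guard above only tells us every group needs at least `max_timesteps` rows, and that the result is a 3D array (which is why the next hunk has to restore `y_pred.index` afterwards). For intuition, a generic sliding-window reshape in plain numpy, not lecrapaud's implementation:

```python
import numpy as np
import pandas as pd

def sliding_windows(frame: pd.DataFrame, feature_cols, timesteps: int) -> np.ndarray:
    """Return shape (n_rows - timesteps + 1, timesteps, n_features)."""
    values = frame[feature_cols].to_numpy()
    if len(values) < timesteps:
        raise ValueError("Not enough rows to build a single window")
    return np.stack(
        [values[i : i + timesteps] for i in range(len(values) - timesteps + 1)]
    )

df = pd.DataFrame({"f1": range(10), "f2": range(10, 20)})
x = sliding_windows(df, ["f1", "f2"], timesteps=4)
print(x.shape)  # (7, 4, 2) -- the DataFrame index is lost in the 3D array
```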
@@ -244,16 +438,14 @@
             else:
                 x_pred = scaled_data[features] if model.need_scaling else data[features]
 
-            # predicting
+            # Make prediction
             y_pred = model.predict(x_pred)
 
-            # fix for recurrent model because x_val has no index as it is a 3D np array
+            # Fix index for recurrent models
             if model.recurrent:
-                y_pred.index = (
-                    new_data.index
-                )  # TODO: not sure this will work for old experiment not aligned with data_for_training for test use case (done, this is why we decode the test set)
+                y_pred.index = new_data.index
 
-            # unscaling prediction
+            # Unscale prediction if needed
             if (
                 model.need_scaling
                 and model.target_type == "regression"
@@ -267,7 +459,7 @@
                 )
                 y_pred.name = "PRED"
 
-            # evaluate if TARGET is in columns (case-insensitive check)
+            # Evaluate if target is present in new_data
             target_col = next(
                 (
                     col
@@ -291,126 +483,21 @@
             else:
                 scores_reg.append(score)
 
-            # renaming and concatenating with initial data
+            # Add predictions to the output dataframe
             if isinstance(y_pred, pd.DataFrame):
                 y_pred = y_pred.add_prefix(f"TARGET_{target_number}_")
                 new_data = pd.concat([new_data, y_pred], axis=1)
-
             else:
                 y_pred.name = f"TARGET_{target_number}_PRED"
                 new_data = pd.concat([new_data, y_pred], axis=1)
 
+        # Format scores
         if len(scores_reg) > 0:
             scores_reg = pd.DataFrame(scores_reg).set_index("TARGET")
         if len(scores_clf) > 0:
             scores_clf = pd.DataFrame(scores_clf).set_index("TARGET")
-        return new_data, scores_reg, scores_clf
-
-    def feature_engineering(self, data, for_training=True):
-        app = FeatureEngineeringEngine(
-            data=data,
-            columns_drop=getattr(self, "columns_drop", []),
-            columns_boolean=getattr(self, "columns_boolean", []),
-            columns_date=getattr(self, "columns_date", []),
-            columns_te_groupby=getattr(self, "columns_te_groupby", []),
-            columns_te_target=getattr(self, "columns_te_target", []),
-            for_training=getattr(self, "for_training", True),
-        )
-        data = app.run()
-        return data
-
-    def preprocess_feature(self, data, for_training=True):
-        app = PreprocessFeature(
-            data=data,
-            experiment=getattr(self, "experiment", None),
-            time_series=getattr(self, "time_series", False),
-            date_column=getattr(self, "date_column", None),
-            group_column=getattr(self, "group_column", None),
-            val_size=getattr(self, "val_size", 0.2),
-            test_size=getattr(self, "test_size", 0.2),
-            columns_pca=getattr(self, "columns_pca", []),
-            pca_temporal=getattr(self, "pca_temporal", []),
-            pca_cross_sectional=getattr(self, "pca_cross_sectional", []),
-            columns_onehot=getattr(self, "columns_onehot", []),
-            columns_binary=getattr(self, "columns_binary", []),
-            columns_ordinal=getattr(self, "columns_ordinal", []),
-            columns_frequency=getattr(self, "columns_frequency", []),
-            target_numbers=getattr(self, "target_numbers", []),
-            target_clf=getattr(self, "target_clf", []),
-        )
-        if for_training:
-            train, val, test = app.run()
-            return train, val, test
-        else:
-            data = app.inference()
-            return data
 
-    def feature_selection(self, train):
-        for target_number in self.target_numbers:
-            app = FeatureSelectionEngine(
-                train=train,
-                target_number=target_number,
-                experiment=self.experiment,
-                target_clf=self.target_clf,
-            )
-            app.run()
-        self.experiment = Experiment.get(self.experiment.id)
-        all_features = self.experiment.get_all_features(
-            date_column=self.date_column, group_column=self.group_column
-        )
-        joblib.dump(
-            all_features, f"{self.experiment.path}/preprocessing/all_features.pkl"
-        )
-        return all_features
-
-    def preprocess_model(self, train, val=None, test=None, for_training=True):
-        app = PreprocessModel(
-            train=train,
-            val=val,
-            test=test,
-            experiment=getattr(self, "experiment", None),
-            target_numbers=getattr(self, "target_numbers", []),
-            target_clf=getattr(self, "target_clf", []),
-            models_idx=getattr(self, "models_idx", []),
-            time_series=getattr(self, "time_series", False),
-            max_timesteps=getattr(self, "max_timesteps", 120),
-            date_column=getattr(self, "date_column", None),
-            group_column=getattr(self, "group_column", None),
-        )
-        if for_training:
-            data, reshaped_data = app.run()
-            return data, reshaped_data
-        else:
-            data, scaled_data, reshaped_data = app.inference()
-            return data, scaled_data, reshaped_data
-
-    def model_selection(self, data, reshaped_data, best_params=None):
-        for target_number in self.target_numbers:
-            app = ModelSelectionEngine(
-                data=data,
-                reshaped_data=reshaped_data,
-                target_number=target_number,
-                experiment=getattr(self, "experiment", None),
-                target_clf=getattr(self, "target_clf", []),
-                models_idx=getattr(self, "models_idx", []),
-                time_series=getattr(self, "time_series", False),
-                date_column=getattr(self, "date_column", None),
-                group_column=getattr(self, "group_column", None),
-                target_clf_thresholds=getattr(self, "target_clf_thresholds", {}),
-            )
-            if best_params and target_number not in best_params.keys():
-                raise ValueError(
-                    f"Target {target_number} not found in best_params passed as argument"
-                )
-            app.run(
-                self.experiment_name,
-                perform_hyperopt=self.perform_hyperopt,
-                number_of_trials=self.number_of_trials,
-                perform_crossval=self.perform_crossval,
-                plot=self.plot,
-                preserve_model=self.preserve_model,
-                best_params=best_params[target_number] if best_params else None,
-            )
+        return new_data, scores_reg, scores_clf
 
     def get_scores(self, target_number: int):
         return pd.read_csv(
@@ -475,6 +562,9 @@
             # For lightgbm models
             importances = model.feature_importance(importance_type="split")
             importance_type = "Split"
+        elif hasattr(model, "get_feature_importance"):
+            importances = model.get_feature_importance()
+            importance_type = "Feature importance"
         elif hasattr(model, "coef_"):
             # For linear models
             importances = np.abs(model.coef_.flatten())
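
The added branch slots models exposing `get_feature_importance()` (the CatBoost-style API) into the existing fallback chain, between the LightGBM booster call and the linear-model path. The visible part of the dispatch, as a standalone sketch (the label for the `coef_` branch is not shown in this diff and is illustrative):

```python
import numpy as np

def extract_importances(model):
    # Order mirrors the elif chain in the hunk above
    if hasattr(model, "feature_importance"):  # lightgbm.Booster
        return model.feature_importance(importance_type="split"), "Split"
    if hasattr(model, "get_feature_importance"):  # CatBoost-style models
        return model.get_feature_importance(), "Feature importance"
    if hasattr(model, "coef_"):  # linear models
        return np.abs(model.coef_.flatten()), "Coefficient"  # label illustrative
    raise AttributeError("Model exposes no known feature-importance API")
```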
@@ -557,7 +647,7 @@
         if not os.path.exists(params_file):
             raise FileNotFoundError(
                 f"Best parameters file not found at {params_file}. "
-                "Make sure to run model training first."
+                "Make sure to fit model training first."
             )
 
         try: