autogluon.core 1.2.1b20250206__py3-none-any.whl → 1.2.1b20250208__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those versions.
@@ -1,73 +1,15 @@
  from __future__ import annotations

- import copy
- import logging
  import os
- import shutil
- import time
- import traceback
- from collections import defaultdict
- from pathlib import Path
- from typing import Any, Generic, Literal, Optional, Type, TypeVar
+ from typing import Any, Generic, Type, TypeVar

  import networkx as nx
- import numpy as np
- import pandas as pd
  from typing_extensions import Self

- from autogluon.common.features.feature_metadata import FeatureMetadata
- from autogluon.common.features.types import R_FLOAT, S_STACK
- from autogluon.common.utils.distribute_utils import DistributedContext
- from autogluon.common.utils.lite import disable_if_lite_mode
- from autogluon.common.utils.log_utils import convert_time_in_s_to_log_friendly, reset_logger_for_remote_call
- from autogluon.common.utils.resource_utils import ResourceManager, get_resource_manager
- from autogluon.common.utils.try_import import try_import_ray, try_import_torch
- from autogluon.core.augmentation.distill_utils import augment_data, format_distillation_labels
- from autogluon.core.calibrate import calibrate_decision_threshold
- from autogluon.core.calibrate.conformity_score import compute_conformity_score
- from autogluon.core.calibrate.temperature_scaling import apply_temperature_scaling, tune_temperature_scaling
- from autogluon.core.callbacks import AbstractCallback
- from autogluon.core.constants import BINARY, MULTICLASS, QUANTILE, REFIT_FULL_NAME, REGRESSION, SOFTCLASS
- from autogluon.core.data.label_cleaner import LabelCleanerMulticlassToBinary
- from autogluon.core.metrics import Scorer, compute_metric, get_metric
- from autogluon.core.models import (
-     AbstractModel,
-     BaggedEnsembleModel,
-     GreedyWeightedEnsembleModel,
-     SimpleWeightedEnsembleModel,
-     StackerEnsembleModel,
-     WeightedEnsembleModel,
- )
- from autogluon.core.pseudolabeling.pseudolabeling import assert_pseudo_column_match
- from autogluon.core.ray.distributed_jobs_managers import ParallelFitManager
- from autogluon.core.utils import (
-     compute_permutation_feature_importance,
-     convert_pred_probas_to_df,
-     default_holdout_frac,
-     extract_column,
-     generate_train_test_split,
-     get_pred_from_proba,
-     infer_eval_metric,
- )
- from autogluon.core.utils.exceptions import (
-     InsufficientTime,
-     NoGPUError,
-     NoStackFeatures,
-     NotEnoughCudaMemoryError,
-     NotEnoughMemoryError,
-     NotValidStacker,
-     NoValidFeatures,
-     TimeLimitExceeded,
- )
- from autogluon.core.utils.feature_selection import FeatureSelector
+ from autogluon.core.models import AbstractModel
  from autogluon.core.utils.loaders import load_pkl
  from autogluon.core.utils.savers import save_json, save_pkl

- from .utils import process_hyperparameters
-
- logger = logging.getLogger(__name__)
-
-
  ModelTypeT = TypeVar("ModelTypeT", bound=AbstractModel)


@@ -83,10 +25,15 @@ class AbstractTrainer(Generic[ModelTypeT]):
          self.low_memory: bool = low_memory
          self.save_data: bool = save_data

+         #: dict of model name -> model object. A key, value pair only exists if a model is persisted in memory.
          self.models: dict[str, Any] = {}
+
+         #: Directed Acyclic Graph (DAG) of model interactions. Describes how certain models depend on the predictions of certain
+         #: other models. Contains numerous metadata regarding each model.
          self.model_graph = nx.DiGraph()
          self.model_best: str | None = None

+         #: Names which are banned but are not used by a trained model.
          self._extra_banned_names: set[str] = set()

      def _get_banned_model_names(self) -> list[str]:
@@ -115,7 +62,7 @@ class AbstractTrainer(Generic[ModelTypeT]):
              path = path_context
          return path

-     def save_model(self, model: ModelTypeT, **kwargs) -> None:
+     def save_model(self, model: ModelTypeT) -> None:
          model.save()
          if not self.low_memory:
              self.models[model.name] = model
@@ -141,7 +88,7 @@ class AbstractTrainer(Generic[ModelTypeT]):
              return os.path.join(*self.model_graph.nodes[model][attribute])
          return self.model_graph.nodes[model][attribute]

-     def set_model_attribute(self, model: str | ModelTypeT, attribute: str, val: Any):
+     def set_model_attribute(self, model: str | ModelTypeT, attribute: str, val: Any) -> None:
          if not isinstance(model, str):
              model = model.name
          self.model_graph.nodes[model][attribute] = val
@@ -168,8 +115,8 @@ class AbstractTrainer(Generic[ModelTypeT]):
          else:
              model_info = model.get_info()
          return model_info
-
-     def get_model_names(self, **kwargs) -> list[str]:
+
+     def get_model_names(self) -> list[str]:
          """Get all model names that are registered in the model graph, in no particular order."""
          return list(self.model_graph.nodes)

@@ -182,20 +129,24 @@ class AbstractTrainer(Generic[ModelTypeT]):
          return model_info_dict

      # TODO: model_name change to model in params
-     def load_model(self, model_name: str | ModelTypeT, path: str | None = None, model_type: Type[ModelTypeT] | None = None) -> ModelTypeT:
+     def load_model(
+         self, model_name: str | ModelTypeT, path: str | None = None, model_type: Type[ModelTypeT] | None = None
+     ) -> ModelTypeT:
          if isinstance(model_name, AbstractModel):
              return model_name
          if model_name in self.models.keys():
              return self.models[model_name]
          else:
              if path is None:
-                 path = self.get_model_attribute(model=model_name, attribute="path")  # get relative location of the model to the trainer
+                 path = self.get_model_attribute(
+                     model=model_name, attribute="path"
+                 )  # get relative location of the model to the trainer
              assert path is not None
              if model_type is None:
                  model_type = self.get_model_attribute(model=model_name, attribute="type")
              assert model_type is not None
              return model_type.load(path=os.path.join(self.path, path), reset_paths=self.reset_paths)
-
+
      @classmethod
      def load_info(cls, path: str, reset_paths: bool = False, load_model_if_required: bool = True) -> dict[str, Any]:
          load_path = os.path.join(path, cls.trainer_info_name)
@@ -216,14 +167,14 @@ class AbstractTrainer(Generic[ModelTypeT]):
          return info

      def construct_model_templates(
-         self, hyperparameters: str | dict[str, Any], **kwargs
+         self, hyperparameters: dict[str, Any]
      ) -> tuple[list[ModelTypeT], dict] | list[ModelTypeT]:
          raise NotImplementedError

-     def get_model_best(self, *args, **kwargs) -> str:
+     def get_model_best(self) -> str:
          raise NotImplementedError

-     def get_info(self, include_model_info: bool = False, **kwargs) -> dict[str, Any]:
+     def get_info(self, include_model_info: bool = False) -> dict[str, Any]:
          raise NotImplementedError

      def save(self) -> None:
@@ -245,4666 +196,3 @@ class AbstractTrainer(Generic[ModelTypeT]):

      def predict(self, *args, **kwargs) -> Any:
          raise NotImplementedError
-
-
- # TODO: This class will be moved to autogluon.tabular
- class AbstractTabularTrainer(AbstractTrainer[AbstractModel]):
-     """
-     AbstractTabularTrainer contains logic to train a variety of models under a variety of constraints and automatically generate a multi-layer stack ensemble.
-     Beyond the basic functionality, it also has support for model refitting, distillation, pseudo-labelling, unlabeled data, and much more.
-
-     It is not recommended to directly use Trainer. Instead, use Predictor or Learner which internally uses Trainer.
-     This documentation is for developers. Users should avoid this class.
-
-     Due to the complexity of the logic within this class, a text description will not give the full picture.
-     It is recommended to carefully read the code and use a debugger to understand how it works.
-
-     AbstractTabularTrainer makes much fewer assumptions about the problem than Learner and Predictor.
-     It expects these ambiguities to have already been resolved upstream. For example, problem_type, feature_metadata, num_classes, etc.
-
-     Parameters
-     ----------
-     path : str
-         Path to save and load trainer artifacts to disk.
-         Path should end in `/` or `os.path.sep()`.
-     problem_type : str
-         One of ['binary', 'multiclass', 'regression', 'quantile', 'softclass']
-     num_classes : int
-         The number of classes in the problem.
-         If problem_type is in ['regression', 'quantile'], this must be None.
-         If problem_type is 'binary', this must be 2.
-         If problem_type is in ['multiclass', 'softclass'], this must be >= 2.
-     feature_metadata : FeatureMetadata
-         FeatureMetadata for X. Sent to each model during fit.
-     eval_metric : Scorer, default = None
-         Metric to optimize. If None, a default metric is used depending on the problem_type.
-     quantile_levels : list[float] | np.ndarray, default = None
-         # TODO: Add documentation, not documented in Predictor.
-         Only used when problem_type=quantile
-     low_memory : bool, default = True
-         Deprecated parameter, likely to be removed in future versions.
-         If True, caches models to disk separately instead of containing all models within memory.
-         If False, may cause a variety of bugs.
-     k_fold : int, default = 0
-         If <2, then non-bagged mode is used.
-         If >= 2, then bagged mode is used with num_bag_folds == k_fold for each model.
-         Bagged mode changes the way models are trained and ensembled.
-         Bagged mode enables multi-layer stacking and repeated bagging.
-     n_repeats : int, default = 1
-         The maximum repeats of bagging to do when in bagged mode.
-         Larger values take linearly longer to train and infer, but improves quality slightly.
-     sample_weight : str, default = None
-         Column name of the sample weight in X
-     weight_evaluation : bool, default = False
-         If True, the eval_metric is calculated with sample_weight incorporated into the score.
-     save_data : bool, default = True
-         Whether to cache the data (X, y, X_val, y_val) to disk.
-         Required for a variety of advanced post-fit functionality.
-         It is recommended to keep as True.
-     random_state : int, default = 0
-         Random state for data splitting in bagged mode.
-     verbosity : int, default = 2
-         Verbosity levels range from 0 to 4 and control how much information is printed.
-         Higher levels correspond to more detailed print statements (you can set verbosity = 0 to suppress warnings).
-         If using logging, you can alternatively control amount of information printed via `logger.setLevel(L)`,
-         where `L` ranges from 0 to 50 (Note: higher values of `L` correspond to fewer print statements, opposite of verbosity levels).
-     """
-
-     distill_stackname = "distill"  # name of stack-level for distilled student models
-
-     def __init__(
-         self,
-         path: str,
-         *,
-         problem_type: str,
-         num_classes: int | None = None,
-         feature_metadata: FeatureMetadata | None = None,
-         eval_metric: Scorer | None = None,
-         quantile_levels: list[float] | np.ndarray | None = None,
-         low_memory: bool = True,
-         k_fold: int = 0,
-         n_repeats: int = 1,
-         sample_weight: str | None = None,
-         weight_evaluation: bool = False,
-         save_data: bool = False,
-         random_state: int = 0,
-         verbosity: int = 2,
-     ):
-         super().__init__(
-             path=path,
-             low_memory=low_memory,
-             save_data=save_data,
-         )
-         self._validate_num_classes(num_classes=num_classes, problem_type=problem_type)
-         self._validate_quantile_levels(quantile_levels=quantile_levels, problem_type=problem_type)
-         self.problem_type = problem_type
-         self.feature_metadata = feature_metadata
-         self.save_data = save_data
-         self.random_state = (
-             random_state  # Integer value added to the stack level to get the random_state for kfold splits or the train/val split if bagging is disabled
-         )
-         self.verbosity = verbosity
-         self.sample_weight = sample_weight  # TODO: consider redesign where Trainer doesn't need sample_weight column name and weights are separate from X
-         self.weight_evaluation = weight_evaluation
-         if eval_metric is not None:
-             self.eval_metric = eval_metric
-         else:
-             self.eval_metric = infer_eval_metric(problem_type=self.problem_type)
-
-         logger.log(20, f"AutoGluon will gauge predictive performance using evaluation metric: '{self.eval_metric.name}'")
-         if not self.eval_metric.greater_is_better_internal:
-             logger.log(
-                 20,
-                 "\tThis metric's sign has been flipped to adhere to being higher_is_better. "
-                 "The metric score can be multiplied by -1 to get the metric value.",
-             )
-         if not (self.eval_metric.needs_pred or self.eval_metric.needs_quantile):
-             logger.log(
-                 20,
-                 "\tThis metric expects predicted probabilities rather than predicted class labels, "
-                 "so you'll need to use predict_proba() instead of predict()",
-             )
-
-         logger.log(20, "\tTo change this, specify the eval_metric parameter of Predictor()")
-         self.num_classes = num_classes
-         self.quantile_levels = quantile_levels
-         self.feature_prune = False  # will be set to True if feature-pruning is turned on.
-         self.low_memory = low_memory
-         self.bagged_mode = True if k_fold >= 2 else False
-         if self.bagged_mode:
-             self.k_fold = k_fold  # int number of folds to do model bagging, < 2 means disabled
-             self.n_repeats = n_repeats
-         else:
-             self.k_fold = 0
-             self.n_repeats = 1
-
-         self.model_best = None
-
-         self.models = {}  # dict of model name -> model object. A key, value pair only exists if a model is persisted in memory.  # TODO: v0.1 Rename and consider making private
-         self.model_graph = nx.DiGraph()  # Directed Acyclic Graph (DAG) of model interactions. Describes how certain models depend on the predictions of certain other models. Contains numerous metadata regarding each model.
-         self.reset_paths = False
-
-         self._time_limit = None  # Internal float of the total time limit allowed for a given fit call. Used in logging statements.
-         self._time_train_start = None  # Internal timestamp of the time training started for a given fit call. Used in logging statements.
-         self._time_train_start_last = None  # Same as `self._time_train_start` except it is not reset to None after the fit call completes.
-
-         self._num_rows_train = None
-         self._num_cols_train = None
-         self._num_rows_val = None
-         self._num_rows_test = None
-
-         self.is_data_saved = False
-         self._X_saved = False
-         self._y_saved = False
-         self._X_val_saved = False
-         self._y_val_saved = False
-
-         self._groups = None  # custom split indices
-
-         self._regress_preds_asprobas = False  # whether to treat regression predictions as class-probabilities (during distillation)
-
-         self._extra_banned_names = set()  # Names which are banned but are not used by a trained model.
-
-         self._models_failed_to_train_errors = dict()  # dict of model name -> model failure metadata
-
-         # self._exceptions_list = []  # TODO: Keep exceptions list for debugging during benchmarking.
-
-         self.callbacks: list[AbstractCallback] = []
-         self._callback_early_stop = False
-
-     @property
-     def _path_attr(self) -> str:
-         """Path to cached model graph attributes"""
-         return os.path.join(self.path_utils, "attr")
-
-     @property
-     def has_val(self) -> bool:
-         """Whether the trainer uses validation data"""
-         return self._num_rows_val is not None
-
-     @property
-     def num_rows_val_for_calibration(self) -> int:
-         """The number of rows available to optimize model calibration"""
-         if self._num_rows_val is not None:
-             return self._num_rows_val
-         elif self.bagged_mode:
-             assert self._num_rows_train is not None
-             return self._num_rows_train
-         else:
-             return 0
-
-     @property
-     def time_left(self) -> float | None:
-         """
-         Remaining time left in the fit call.
-         None if time_limit was unspecified.
-         """
-         if self._time_train_start is None:
-             return None
-         elif self._time_limit is None:
-             return None
-         time_elapsed = time.time() - self._time_train_start
-         time_left = self._time_limit - time_elapsed
-         return time_left
-
-     @property
-     def logger(self) -> logging.Logger:
-         return logger
-
-     def log(self, level: int, msg, *args, **kwargs):
-         self.logger.log(level, msg, *args, **kwargs)
-
-     def load_X(self):
-         if self._X_saved:
-             path = os.path.join(self.path_data, "X.pkl")
-             return load_pkl.load(path=path)
-         return None
-
-     def load_X_val(self):
-         if self._X_val_saved:
-             path = os.path.join(self.path_data, "X_val.pkl")
-             return load_pkl.load(path=path)
-         return None
-
-     def load_y(self):
-         if self._y_saved:
-             path = os.path.join(self.path_data, "y.pkl")
-             return load_pkl.load(path=path)
-         return None
-
-     def load_y_val(self):
-         if self._y_val_saved:
-             path = os.path.join(self.path_data, "y_val.pkl")
-             return load_pkl.load(path=path)
-         return None
-
-     def load_data(self):
-         X = self.load_X()
-         y = self.load_y()
-         X_val = self.load_X_val()
-         y_val = self.load_y_val()
-
-         return X, y, X_val, y_val
-
-     def save_X(self, X, verbose=True):
-         path = os.path.join(self.path_data, "X.pkl")
-         save_pkl.save(path=path, object=X, verbose=verbose)
-         self._X_saved = True
-
-     def save_X_val(self, X, verbose=True):
-         path = os.path.join(self.path_data, "X_val.pkl")
-         save_pkl.save(path=path, object=X, verbose=verbose)
-         self._X_val_saved = True
-
-     def save_X_test(self, X, verbose=True):
-         path = os.path.join(self.path_data, "X_test.pkl")
-         save_pkl.save(path=path, object=X, verbose=verbose)
-         self._X_test_saved = True
-
-     def save_y(self, y, verbose=True):
-         path = os.path.join(self.path_data, "y.pkl")
-         save_pkl.save(path=path, object=y, verbose=verbose)
-         self._y_saved = True
-
-     def save_y_val(self, y, verbose=True):
-         path = os.path.join(self.path_data, "y_val.pkl")
-         save_pkl.save(path=path, object=y, verbose=verbose)
-         self._y_val_saved = True
-
-     def save_y_test(self, y, verbose=True):
-         path = os.path.join(self.path_data, "y_test.pkl")
-         save_pkl.save(path=path, object=y, verbose=verbose)
-         self._y_test_saved = True
-
-     def get_model_names(
-         self,
-         stack_name: list[str] | str | None = None,
-         level: list[int] | int | None = None,
-         can_infer: bool | None = None,
-         models: list[str] | None = None
-     ) -> list[str]:
-         if models is None:
-             models = list(self.model_graph.nodes)
-         if stack_name is not None:
-             if not isinstance(stack_name, list):
-                 stack_name = [stack_name]
-             node_attributes: dict = self.get_models_attribute_dict(attribute="stack_name", models=models)
-             models = [model_name for model_name in models if node_attributes[model_name] in stack_name]
-         if level is not None:
-             if not isinstance(level, list):
-                 level = [level]
-             node_attributes: dict = self.get_models_attribute_dict(attribute="level", models=models)
-             models = [model_name for model_name in models if node_attributes[model_name] in level]
-         # TODO: can_infer is technically more complicated, if an ancestor can't infer then the model can't infer.
-         if can_infer is not None:
-             node_attributes = self.get_models_attribute_full(attribute="can_infer", models=models, func=min)
-             models = [model for model in models if node_attributes[model] == can_infer]
-         return models
-
-     def get_max_level(self, stack_name: str | None = None, models: list[str] | None = None) -> int:
-         models = self.get_model_names(stack_name=stack_name, models=models)
-         models_attribute_dict = self.get_models_attribute_dict(attribute="level", models=models)
-         if models_attribute_dict:
-             return max(list(models_attribute_dict.values()))
-         else:
-             return -1
-
-     def construct_model_templates(self, hyperparameters: dict, **kwargs) -> tuple[list[AbstractModel], dict]:
-         """Constructs a list of unfit models based on the hyperparameters dict."""
-         raise NotImplementedError
-
-     def construct_model_templates_distillation(self, hyperparameters: dict, **kwargs) -> tuple[list[AbstractModel], dict]:
-         """Constructs a list of unfit models based on the hyperparameters dict for softclass distillation."""
-         raise NotImplementedError
-
-     def get_model_level(self, model_name: str) -> int:
-         return self.get_model_attribute(model=model_name, attribute="level")
-
-     def fit(self, X, y, hyperparameters: dict, X_val=None, y_val=None, **kwargs):
-         raise NotImplementedError
-
-     # TODO: Enable easier re-mapping of trained models -> hyperparameters input (They don't share a key since name can change)
-     def train_multi_levels(
-         self,
-         X,
-         y,
-         hyperparameters: dict,
-         X_val=None,
-         y_val=None,
-         X_test=None,
-         y_test=None,
-         X_unlabeled=None,
-         base_model_names: list[str] | None = None,
-         core_kwargs: dict | None = None,
-         aux_kwargs: dict | None = None,
-         level_start=1,
-         level_end=1,
-         time_limit=None,
-         name_suffix: str | None = None,
-         relative_stack=True,
-         level_time_modifier=0.333,
-         infer_limit=None,
-         infer_limit_batch_size=None,
-         callbacks: list[AbstractCallback] | None = None,
-     ) -> list[str]:
-         """
-         Trains a multi-layer stack ensemble using the input data on the hyperparameters dict input.
-         hyperparameters is used to determine the models used in each stack layer.
-         If continuing a stack ensemble with level_start>1, ensure that base_model_names is set to the appropriate base models that will be used by the level_start level models.
-         Trains both core and aux models.
-         core models are standard models which are fit on the data features. Core models will also use model predictions if base_model_names was specified or if level != 1.
-         aux models are ensemble models which only use the predictions of core models as features. These models never use the original features.
-
-         level_time_modifier : float, default 0.333
-             The amount of extra time given relatively to early stack levels compared to later stack levels.
-             If 0, then all stack levels are given 100%/L of the time, where L is the number of stack levels.
-             If 1, then all stack levels are given 100% of the time, meaning if the first level uses all of the time given to it, the other levels won't train.
-             Time given to a level = remaining_time / remaining_levels * (1 + level_time_modifier), capped by total remaining time.
-
-         Returns a list of the model names that were trained from this method call, in order of fit.
-         """
-         self._fit_setup(time_limit=time_limit, callbacks=callbacks)
-         time_train_start = self._time_train_start
-         assert time_train_start is not None
-
-         if self.callbacks:
-             callback_classes = [c.__class__.__name__ for c in self.callbacks]
-             logger.log(20, f"User-specified callbacks ({len(self.callbacks)}): {callback_classes}")
-
-         hyperparameters = self._process_hyperparameters(hyperparameters=hyperparameters)
-
-         if relative_stack:
-             if level_start != 1:
-                 raise AssertionError(f"level_start must be 1 when `relative_stack=True`. (level_start = {level_start})")
-             level_add = 0
-             if base_model_names:
-                 max_base_model_level = self.get_max_level(models=base_model_names)
-                 level_start = max_base_model_level + 1
-                 level_add = level_start - 1
-                 level_end += level_add
-             if level_start != 1:
-                 hyperparameters_relative = {}
-                 for key in hyperparameters:
-                     if isinstance(key, int):
-                         hyperparameters_relative[key + level_add] = hyperparameters[key]
-                     else:
-                         hyperparameters_relative[key] = hyperparameters[key]
-                 hyperparameters = hyperparameters_relative
-
-         core_kwargs = {} if core_kwargs is None else core_kwargs.copy()
-         aux_kwargs = {} if aux_kwargs is None else aux_kwargs.copy()
-
-         self._callbacks_setup(
-             X=X,
-             y=y,
-             hyperparameters=hyperparameters,
-             X_val=X_val,
-             y_val=y_val,
-             X_unlabeled=X_unlabeled,
-             level_start=level_start,
-             level_end=level_end,
-             time_limit=time_limit,
-             base_model_names=base_model_names,
-             core_kwargs=core_kwargs,
-             aux_kwargs=aux_kwargs,
-             name_suffix=name_suffix,
-             level_time_modifier=level_time_modifier,
-             infer_limit=infer_limit,
-             infer_limit_batch_size=infer_limit_batch_size,
-         )
-         # TODO: Add logic for callbacks to specify that the rest of the trainer logic should be skipped in the case where they are overriding the trainer logic.
-
-         model_names_fit = []
-         if level_start != level_end:
-             logger.log(20, f"AutoGluon will fit {level_end - level_start + 1} stack levels (L{level_start} to L{level_end}) ...")
-         for level in range(level_start, level_end + 1):
-             core_kwargs_level = core_kwargs.copy()
-             aux_kwargs_level = aux_kwargs.copy()
-             full_weighted_ensemble = aux_kwargs_level.pop("fit_full_last_level_weighted_ensemble", True) and (level == level_end) and (level > 1)
-             additional_full_weighted_ensemble = aux_kwargs_level.pop("full_weighted_ensemble_additionally", False) and full_weighted_ensemble
-             if time_limit is not None:
-                 time_train_level_start = time.time()
-                 levels_left = level_end - level + 1
-                 time_left = time_limit - (time_train_level_start - time_train_start)
-                 time_limit_for_level = min(time_left / levels_left * (1 + level_time_modifier), time_left)
-                 time_limit_core = time_limit_for_level
-                 time_limit_aux = max(time_limit_for_level * 0.1, min(time_limit, 360))  # Allows aux to go over time_limit, but only by a small amount
-                 core_kwargs_level["time_limit"] = core_kwargs_level.get("time_limit", time_limit_core)
-                 aux_kwargs_level["time_limit"] = aux_kwargs_level.get("time_limit", time_limit_aux)
-             base_model_names, aux_models = self.stack_new_level(
-                 X=X,
-                 y=y,
-                 X_val=X_val,
-                 y_val=y_val,
-                 X_test=X_test,
-                 y_test=y_test,
-                 X_unlabeled=X_unlabeled,
-                 models=hyperparameters,
-                 level=level,
-                 base_model_names=base_model_names,
-                 core_kwargs=core_kwargs_level,
-                 aux_kwargs=aux_kwargs_level,
-                 name_suffix=name_suffix,
-                 infer_limit=infer_limit,
-                 infer_limit_batch_size=infer_limit_batch_size,
-                 full_weighted_ensemble=full_weighted_ensemble,
-                 additional_full_weighted_ensemble=additional_full_weighted_ensemble,
-             )
-             model_names_fit += base_model_names + aux_models
-         if (self.model_best is None or infer_limit is not None) and len(model_names_fit) != 0:
-             self.model_best = self.get_model_best(infer_limit=infer_limit, infer_limit_as_child=True)
-         self._callbacks_conclude()
-         self._fit_cleanup()
-         self.save()
-         return model_names_fit
-
-     def _fit_setup(self, time_limit: float | None = None, callbacks: list[AbstractCallback] | None = None):
-         """
-         Prepare the trainer state at the start of / prior to a fit call.
-         Should be paired with a `self._fit_cleanup()` at the conclusion of the fit call.
-         """
-         self._time_train_start = time.time()
-         self._time_train_start_last = self._time_train_start
-         self._time_limit = time_limit
-         self.reset_callbacks()
-         if callbacks is not None:
-             assert isinstance(callbacks, list), f"`callbacks` must be a list. Found invalid type: `{type(callbacks)}`."
-             for callback in callbacks:
-                 assert isinstance(
-                     callback, AbstractCallback
-                 ), f"Elements in `callbacks` must be of type AbstractCallback. Found invalid type: `{type(callback)}`."
-         else:
-             callbacks = []
-         self.callbacks = callbacks
-
-     def _fit_cleanup(self):
-         """
-         Cleanup the trainer state after fit call completes.
-         This ensures that future fit calls are not corrupted by prior fit calls.
-         Should be paired with an earlier `self._fit_setup()` call.
-         """
-         self._time_limit = None
-         self._time_train_start = None
-         self.reset_callbacks()
-
-     def _callbacks_setup(self, **kwargs):
-         for callback in self.callbacks:
-             callback.before_trainer_fit(trainer=self, **kwargs)
-
-     def _callbacks_conclude(self):
-         for callback in self.callbacks:
-             callback.after_trainer_fit(trainer=self)
-
-     def reset_callbacks(self):
-         """Deletes callback objects and resets `self._callback_early_stop` to False."""
-         self.callbacks = []
-         self._callback_early_stop = False
-
-     # TODO: Consider better greedy approximation method such as via fitting a weighted ensemble to evaluate the value of a subset.
-     def _filter_base_models_via_infer_limit(
-         self,
-         base_model_names: list[str],
-         infer_limit: float | None,
-         infer_limit_modifier: float = 1.0,
-         as_child: bool = True,
-         verbose: bool = True,
-     ) -> list[str]:
-         """
-         Returns a subset of base_model_names whose combined prediction time for 1 row of data does not exceed infer_limit seconds.
-         With the goal of selecting the best valid subset that is most valuable to stack ensembles who use them as base models,
-         this is a variant of the constrained knapsack problem and is NP-Hard and infeasible to exactly solve even with fewer than 10 models.
-         For practical purposes, this method applies a greedy approximation approach to selecting the subset
-         by simply removing models in reverse order of validation score until the remaining subset is valid.
-
-         Parameters
-         ----------
-         base_model_names: list[str]
-             list of model names. These models must already be added to the trainer.
-         infer_limit: float, optional
-             Inference limit in seconds for 1 row of data. This is compared against values pre-computed during fit for the models.
-         infer_limit_modifier: float, default = 1.0
-             Modifier to multiply infer_limit by.
-             Set to <1.0 to provide headroom for stack models who take the returned subset as base models
-             so that the stack models are less likely to exceed infer_limit.
-         as_child: bool, default = True
-             If True, use the inference time of only 1 child model for bags instead of the overall inference time of the bag.
-             This is useful if the intent is to refit the models, as this will best estimate the inference time of the refit model.
-         verbose: bool, default = True
-             Whether to log the models that are removed.
-
-         Returns
-         -------
-         Returns valid subset of models that satisfy constraints.
-         """
-         if infer_limit is None or not base_model_names:
-             return base_model_names
-
-         base_model_names = base_model_names.copy()
-         num_models_og = len(base_model_names)
-         infer_limit_threshold = infer_limit * infer_limit_modifier  # Add headroom
-
-         if as_child:
-             attribute = "predict_1_child_time"
-         else:
-             attribute = "predict_1_time"
-
-         predict_1_time_full_set = self.get_model_attribute_full(model=base_model_names, attribute=attribute)
-
-         messages_to_log = []
-
-         base_model_names_copy = base_model_names.copy()
-         # Prune models that by themselves have larger inference latency than the infer_limit, as they can never be valid
-         for base_model_name in base_model_names_copy:
-             predict_1_time_full = self.get_model_attribute_full(model=base_model_name, attribute=attribute)
-             if predict_1_time_full >= infer_limit_threshold:
-                 predict_1_time_full_set_old = predict_1_time_full_set
-                 base_model_names.remove(base_model_name)
-                 predict_1_time_full_set = self.get_model_attribute_full(model=base_model_names, attribute=attribute)
-                 if verbose:
-                     predict_1_time_full_set_log, time_unit = convert_time_in_s_to_log_friendly(time_in_sec=predict_1_time_full_set)
-                     predict_1_time_full_set_old_log, time_unit_old = convert_time_in_s_to_log_friendly(time_in_sec=predict_1_time_full_set_old)
-                     messages_to_log.append(
-                         f"\t{round(predict_1_time_full_set_old_log, 3)}{time_unit_old}\t-> {round(predict_1_time_full_set_log, 3)}{time_unit}\t({base_model_name})"
-                     )
-
-         score_val_dict = self.get_models_attribute_dict(attribute="val_score", models=base_model_names)
-         sorted_scores = sorted(score_val_dict.items(), key=lambda x: x[1])
-         i = 0
-         # Prune models by ascending validation score until the remaining subset's combined inference latency satisfies infer_limit
-         while base_model_names and (predict_1_time_full_set >= infer_limit_threshold):
-             # TODO: Incorporate score vs inference speed tradeoff in a smarter way
-             base_model_to_remove = sorted_scores[i][0]
-             predict_1_time_full_set_old = predict_1_time_full_set
-             base_model_names.remove(base_model_to_remove)
-             i += 1
-             predict_1_time_full_set = self.get_model_attribute_full(model=base_model_names, attribute=attribute)
-             if verbose:
-                 predict_1_time_full_set_log, time_unit = convert_time_in_s_to_log_friendly(time_in_sec=predict_1_time_full_set)
-                 predict_1_time_full_set_old_log, time_unit_old = convert_time_in_s_to_log_friendly(time_in_sec=predict_1_time_full_set_old)
-                 messages_to_log.append(
-                     f"\t{round(predict_1_time_full_set_old_log, 3)}{time_unit_old}\t-> {round(predict_1_time_full_set_log, 3)}{time_unit}\t({base_model_to_remove})"
-                 )
-
-         if messages_to_log:
-             infer_limit_threshold_log, time_unit_threshold = convert_time_in_s_to_log_friendly(time_in_sec=infer_limit_threshold)
-             logger.log(
-                 20,
-                 f"Removing {len(messages_to_log)}/{num_models_og} base models to satisfy inference constraint "
-                 f"(constraint={round(infer_limit_threshold_log, 3)}{time_unit_threshold}) ...",
-             )
-             for msg in messages_to_log:
-                 logger.log(20, msg)
-
-         return base_model_names
-
-     def stack_new_level(
-         self,
-         X,
-         y,
-         models: list[AbstractModel] | dict,
-         X_val=None,
-         y_val=None,
-         X_test=None,
-         y_test=None,
-         X_unlabeled=None,
-         level=1,
-         base_model_names: list[str] | None = None,
-         core_kwargs: dict | None = None,
-         aux_kwargs: dict | None = None,
-         name_suffix: str | None = None,
-         infer_limit=None,
-         infer_limit_batch_size=None,
-         full_weighted_ensemble: bool = False,
-         additional_full_weighted_ensemble: bool = False,
-     ) -> tuple[list[str], list[str]]:
-         """
-         Similar to calling self.stack_new_level_core, except auxiliary models will also be trained via a call to self.stack_new_level_aux, with the models trained from self.stack_new_level_core used as base models.
-         """
-         if base_model_names is None:
-             base_model_names = []
-         core_kwargs = {} if core_kwargs is None else core_kwargs.copy()
-         aux_kwargs = {} if aux_kwargs is None else aux_kwargs.copy()
-         if level < 1:
-             raise AssertionError(f"Stack level must be >= 1, but level={level}.")
-         if base_model_names and level == 1:
-             raise AssertionError(f"Stack level 1 models cannot have base models, but base_model_names={base_model_names}.")
-         if name_suffix:
-             core_kwargs["name_suffix"] = core_kwargs.get("name_suffix", "") + name_suffix
-             aux_kwargs["name_suffix"] = aux_kwargs.get("name_suffix", "") + name_suffix
-         core_models = self.stack_new_level_core(
-             X=X,
-             y=y,
-             X_val=X_val,
-             y_val=y_val,
-             X_test=X_test,
-             y_test=y_test,
-             X_unlabeled=X_unlabeled,
-             models=models,
-             level=level,
-             infer_limit=infer_limit,
-             infer_limit_batch_size=infer_limit_batch_size,
-             base_model_names=base_model_names,
-             **core_kwargs,
-         )
-
-         aux_models = []
-         if full_weighted_ensemble:
-             full_aux_kwargs = aux_kwargs.copy()
-             if additional_full_weighted_ensemble:
-                 full_aux_kwargs["name_extra"] = "_ALL"
-             all_base_model_names = self.get_model_names(stack_name="core")  # Fit weighted ensemble on all previously fitted core models
-             aux_models += self._stack_new_level_aux(X_val, y_val, X, y, all_base_model_names, level, infer_limit, infer_limit_batch_size, **full_aux_kwargs)
-
-         if (not full_weighted_ensemble) or additional_full_weighted_ensemble:
-             aux_models += self._stack_new_level_aux(X_val, y_val, X, y, core_models, level, infer_limit, infer_limit_batch_size, **aux_kwargs)
-
-         return core_models, aux_models
-
-     def stack_new_level_core(
-         self,
-         X,
-         y,
-         models: list[AbstractModel] | dict,
-         X_val=None,
-         y_val=None,
-         X_test=None,
-         y_test=None,
-         X_unlabeled=None,
-         level=1,
-         base_model_names: list[str] | None = None,
-         fit_strategy: Literal["sequential", "parallel"] = "sequential",
-         stack_name="core",
-         ag_args=None,
-         ag_args_fit=None,
-         ag_args_ensemble=None,
-         included_model_types=None,
-         excluded_model_types=None,
-         ensemble_type=StackerEnsembleModel,
-         name_suffix: str | None = None,
-         get_models_func=None,
-         refit_full=False,
-         infer_limit=None,
-         infer_limit_batch_size=None,
-         **kwargs,
-     ) -> list[str]:
-         """
-         Trains all models using the data provided.
-         If level > 1, then the models will use base model predictions as additional features.
-         The base models used can be specified via base_model_names.
-         If self.bagged_mode, then models will be trained as StackerEnsembleModels.
-         The data provided in this method should not contain stack features, as they will be automatically generated if necessary.
-         """
-         if self._callback_early_stop:
-             return []
-         if get_models_func is None:
-             get_models_func = self.construct_model_templates
-         if base_model_names is None:
-             base_model_names = []
-         if not self.bagged_mode and level != 1:
-             raise ValueError("Stack Ensembling is not valid for non-bagged mode.")
-
-         base_model_names = self._filter_base_models_via_infer_limit(
-             base_model_names=base_model_names,
-             infer_limit=infer_limit,
-             infer_limit_modifier=0.8,
-         )
-         if ag_args_fit is None:
-             ag_args_fit = {}
-         ag_args_fit = ag_args_fit.copy()
-         if infer_limit_batch_size is not None:
-             ag_args_fit["predict_1_batch_size"] = infer_limit_batch_size
-
-         if isinstance(models, dict):
-             get_models_kwargs = dict(
-                 level=level,
-                 name_suffix=name_suffix,
-                 ag_args=ag_args,
-                 ag_args_fit=ag_args_fit,
-                 included_model_types=included_model_types,
-                 excluded_model_types=excluded_model_types,
-             )
-
-             if self.bagged_mode:
-                 if level == 1:
-                     (base_model_names, base_model_paths, base_model_types) = (None, None, None)
-                 elif level > 1:
-                     base_model_names, base_model_paths, base_model_types = self._get_models_load_info(model_names=base_model_names)
-                     if len(base_model_names) == 0:  # type: ignore
-                         logger.log(20, f"No base models to train on, skipping stack level {level}...")
-                         return []
-                 else:
-                     raise AssertionError(f"Stack level cannot be less than 1! level = {level}")
-
-                 ensemble_kwargs = {
-                     "base_model_names": base_model_names,
-                     "base_model_paths_dict": base_model_paths,
-                     "base_model_types_dict": base_model_types,
-                     "base_model_types_inner_dict": self.get_models_attribute_dict(attribute="type_inner", models=base_model_names),
-                     "base_model_performances_dict": self.get_models_attribute_dict(attribute="val_score", models=base_model_names),
-                     "random_state": level + self.random_state,
-                 }
-                 get_models_kwargs.update(
-                     dict(
-                         ag_args_ensemble=ag_args_ensemble,
-                         ensemble_type=ensemble_type,
-                         ensemble_kwargs=ensemble_kwargs,
-                     )
-                 )
-             models, model_args_fit = get_models_func(hyperparameters=models, **get_models_kwargs)
-             if model_args_fit:
-                 hyperparameter_tune_kwargs = {
-                     model_name: model_args_fit[model_name]["hyperparameter_tune_kwargs"]
-                     for model_name in model_args_fit
-                     if "hyperparameter_tune_kwargs" in model_args_fit[model_name]
-                 }
-                 kwargs["hyperparameter_tune_kwargs"] = hyperparameter_tune_kwargs
-
-         logger.log(10 if ((not refit_full) and DistributedContext.is_distributed_mode()) else 20, f'Fitting {len(models)} L{level} models, fit_strategy="{fit_strategy}" ...')
-
-         X_init = self.get_inputs_to_stacker(X, base_models=base_model_names, fit=True)
-         feature_metadata = self.get_feature_metadata(use_orig_features=True, base_models=base_model_names)
-         if X_val is not None:
-             X_val = self.get_inputs_to_stacker(X_val, base_models=base_model_names, fit=False, use_val_cache=True)
-         if X_test is not None:
-             X_test = self.get_inputs_to_stacker(X_test, base_models=base_model_names, fit=False, use_val_cache=False)
-         compute_score = not refit_full
-         if refit_full and X_val is not None:
-             X_init = pd.concat([X_init, X_val])
-             y = pd.concat([y, y_val])
-             X_val = None
-             y_val = None
-         if X_unlabeled is not None:
-             X_unlabeled = self.get_inputs_to_stacker(X_unlabeled, base_models=base_model_names, fit=False)
-
-         fit_kwargs = dict(
-             num_classes=self.num_classes,
-             feature_metadata=feature_metadata,
-         )
-
-         # FIXME: TODO: v0.1 X_unlabeled isn't cached so it won't be available during refit_full or fit_extra.
-         return self._train_multi(
-             X=X_init,
-             y=y,
-             X_val=X_val,
-             y_val=y_val,
-             X_test=X_test,
-             y_test=y_test,
-             X_unlabeled=X_unlabeled,
-             models=models,
-             level=level,
-             stack_name=stack_name,
-             compute_score=compute_score,
-             fit_kwargs=fit_kwargs,
-             fit_strategy=fit_strategy,
-             **kwargs,
-         )
-
-     def _stack_new_level_aux(self, X_val, y_val, X, y, core_models, level, infer_limit, infer_limit_batch_size, **kwargs):
-         if X_val is None:
-             aux_models = self.stack_new_level_aux(
-                 X=X, y=y, base_model_names=core_models, level=level + 1, infer_limit=infer_limit, infer_limit_batch_size=infer_limit_batch_size, **kwargs
-             )
-         else:
-             aux_models = self.stack_new_level_aux(
-                 X=X_val,
-                 y=y_val,
-                 fit=False,
-                 base_model_names=core_models,
-                 level=level + 1,
-                 infer_limit=infer_limit,
-                 infer_limit_batch_size=infer_limit_batch_size,
-                 **kwargs,
-             )
-         return aux_models
-
-     # TODO: Consider making level be auto-determined based off of max(base_model_levels)+1
-     # TODO: Remove name_suffix, hacked in
-     # TODO: X can be optional because it isn't needed if fit=True
-     def stack_new_level_aux(
-         self,
-         X,
-         y,
-         base_model_names: list[str],
-         level: int | str = "auto",
-         fit=True,
-         stack_name="aux1",
-         time_limit=None,
-         name_suffix: str = None,
-         get_models_func=None,
-         check_if_best=True,
-         infer_limit=None,
-         infer_limit_batch_size=None,
-         use_val_cache=True,
-         fit_weighted_ensemble: bool = True,
-         name_extra: str | None = None,
-         total_resources: dict | None = None,
-     ) -> list[str]:
-         """
-         Trains auxiliary models (currently a single weighted ensemble) using the provided base models.
-         Level must be greater than the level of any of the base models.
-         Auxiliary models never use the original features and only train with the predictions of other models as features.
-         """
-         if self._callback_early_stop:
-             return []
-         if fit_weighted_ensemble is False:
-             # Skip fitting of aux models
-             return []
-
-         base_model_names = self._filter_base_models_via_infer_limit(base_model_names=base_model_names, infer_limit=infer_limit, infer_limit_modifier=0.95)
-
-         if len(base_model_names) == 0:
-             logger.log(20, f"No base models to train on, skipping auxiliary stack level {level}...")
-             return []
-
-         if isinstance(level, str):
-             assert level == "auto", f"level must be 'auto' if str, found: {level}"
-             levels_dict = self.get_models_attribute_dict(attribute="level", models=base_model_names)
-             base_model_level_max = None
-             for k, v in levels_dict.items():
-                 if base_model_level_max is None or v > base_model_level_max:
-                     base_model_level_max = v
-             level = base_model_level_max + 1
-
-         if infer_limit_batch_size is not None:
-             ag_args_fit = dict()
-             ag_args_fit["predict_1_batch_size"] = infer_limit_batch_size
-         else:
-             ag_args_fit = None
-         X_stack_preds = self.get_inputs_to_stacker(X, base_models=base_model_names, fit=fit, use_orig_features=False, use_val_cache=use_val_cache)
-         if self.weight_evaluation:
-             X, w = extract_column(X, self.sample_weight)  # TODO: consider redesign with w as separate arg instead of bundled inside X
-             if w is not None:
-                 X_stack_preds[self.sample_weight] = w.values / w.mean()
-         child_hyperparameters = None
-         if name_extra is not None:
-             child_hyperparameters = {"ag_args": {"name_suffix": name_extra}}
-         return self.generate_weighted_ensemble(
-             X=X_stack_preds,
-             y=y,
-             level=level,
-             base_model_names=base_model_names,
-             k_fold=1,
-             n_repeats=1,
-             ag_args_fit=ag_args_fit,
-             stack_name=stack_name,
-             time_limit=time_limit,
-             name_suffix=name_suffix,
-             get_models_func=get_models_func,
-             check_if_best=check_if_best,
-             child_hyperparameters=child_hyperparameters,
-             total_resources=total_resources,
-         )
-
-     def predict(self, X: pd.DataFrame, model: str = None) -> np.ndarray:
-         if model is None:
-             model = self._get_best()
-         return self._predict_model(X=X, model=model)
-
-     def predict_proba(self, X: pd.DataFrame, model: str = None) -> np.ndarray:
-         if model is None:
-             model = self._get_best()
-         return self._predict_proba_model(X=X, model=model)
-
-     def _get_best(self) -> str:
-         if self.model_best is not None:
-             return self.model_best
-         else:
-             return self.get_model_best()
-
-     # Note: model_pred_proba_dict is mutated in this function to minimize memory usage
-     def get_inputs_to_model(self, model: str | AbstractModel, X: pd.DataFrame, model_pred_proba_dict: dict[str, np.ndarray] = None, fit=False, preprocess_nonadaptive=False):
-         """
-         For output X:
-             If preprocess_nonadaptive=False, call model.predict(X)
-             If preprocess_nonadaptive=True, call model.predict(X, preprocess_nonadaptive=False)
-         """
-         if isinstance(model, str):
-             # TODO: Remove unnecessary load when no stacking
-             model = self.load_model(model)
-         model_level = self.get_model_level(model.name)
-         if model_level > 1 and isinstance(model, StackerEnsembleModel):
-             if fit:
-                 model_pred_proba_dict = None
-             else:
-                 model_set = self.get_minimum_model_set(model)
-                 model_set = [m for m in model_set if m != model.name]  # TODO: Can probably be faster, get this result from graph
-                 model_pred_proba_dict = self.get_model_pred_proba_dict(X=X, models=model_set, model_pred_proba_dict=model_pred_proba_dict)
-             X = model.preprocess(X=X, preprocess_nonadaptive=preprocess_nonadaptive, fit=fit, model_pred_proba_dict=model_pred_proba_dict)
-         elif preprocess_nonadaptive:
-             X = model.preprocess(X=X, preprocess_stateful=False)
-         return X
-
-     def score(self, X: pd.DataFrame, y: np.ndarray, model: str = None, metric: Scorer = None, weights: np.ndarray = None, as_error: bool = False) -> float:
-         if metric is None:
-             metric = self.eval_metric
-         if metric.needs_pred or metric.needs_quantile:
-             y_pred = self.predict(X=X, model=model)
-             y_pred_proba = None
-         else:
-             y_pred = None
-             y_pred_proba = self.predict_proba(X=X, model=model)
-         return compute_metric(
-             y=y,
-             y_pred=y_pred,
-             y_pred_proba=y_pred_proba,
-             metric=metric,
-             weights=weights,
-             weight_evaluation=self.weight_evaluation,
-             as_error=as_error,
-             quantile_levels=self.quantile_levels,
-         )
-
-     def score_with_y_pred_proba(self, y: np.ndarray, y_pred_proba: np.ndarray, metric: Scorer = None, weights: np.ndarray = None, as_error: bool = False) -> float:
-         if metric is None:
-             metric = self.eval_metric
-         if metric.needs_pred or metric.needs_quantile:
-             y_pred = get_pred_from_proba(y_pred_proba=y_pred_proba, problem_type=self.problem_type)
-             y_pred_proba = None
-         else:
-             y_pred = None
-         return compute_metric(
-             y=y,
-             y_pred=y_pred,
-             y_pred_proba=y_pred_proba,
-             metric=metric,
-             weights=weights,
-             weight_evaluation=self.weight_evaluation,
-             as_error=as_error,
-             quantile_levels=self.quantile_levels,
-         )
-
-     def score_with_y_pred(self, y: np.ndarray, y_pred: np.ndarray, weights: np.ndarray = None, metric: Scorer = None, as_error: bool = False) -> float:
-         if metric is None:
-             metric = self.eval_metric
-         return compute_metric(
-             y=y,
-             y_pred=y_pred,
-             y_pred_proba=None,
-             metric=metric,
-             weights=weights,
-             weight_evaluation=self.weight_evaluation,
-             as_error=as_error,
-             quantile_levels=self.quantile_levels,
-         )
-
-     # TODO: Slow if large ensemble with many models, could cache output result to speed up during inference
-     def _construct_model_pred_order(self, models: list[str]) -> list[str]:
-         """
-         Constructs a list of model names in order of inference calls required to infer on all the models.
-
-         Parameters
-         ----------
-         models : list[str]
-             The list of models to construct the prediction order from.
-             If a model has dependencies, the dependency models will be put earlier in the output list.
-             Models explicitly mentioned in the `models` input will be placed as early as possible in the output list.
-             Models earlier in `models` will attempt to be placed earlier in the output list than those later in `models`.
-             It is recommended that earlier elements do not have dependency models that are listed later in `models`.
-
-         Returns
-         -------
-         Returns list of models in inference call order, including dependency models of those specified in the input.
-         """
-         model_set = set()
-         model_order = []
-         for model in models:
-             if model in model_set:
-                 continue
-             min_models_set = set(self.get_minimum_model_set(model))
-             models_to_load = list(min_models_set.difference(model_set))
-             subgraph = nx.subgraph(self.model_graph, models_to_load)
-             model_pred_order = list(nx.lexicographical_topological_sort(subgraph))
-             model_order += [m for m in model_pred_order if m not in model_set]
-             model_set = set(model_order)
-         return model_order
1260
-
1261
- def _construct_model_pred_order_with_pred_dict(self, models: list[str], models_to_ignore: list[str] = None) -> list[str]:
1262
- """
1263
- Constructs a list of model names in order of inference calls required to infer on all the models.
1264
- Unlike `_construct_model_pred_order`, this method's output is in undefined order when multiple models are valid to infer at the same time.
1265
-
1266
- Parameters
1267
- ----------
1268
- models : list[str]
1269
- The list of models to construct the prediction order from.
1270
- If a model has dependencies, the dependency models will be put earlier in the output list.
1271
- models_to_ignore : list[str], optional
1272
- A list of models that have already been computed and can be ignored.
1273
- Models in this list and their dependencies (if not depended on by other models in `models`) will be pruned from the final output.
1274
-
1275
- Returns
1276
- -------
1277
- Returns list of models in inference call order, including dependency models of those specified in the input.
1278
- """
1279
- model_set = set()
1280
- for model in models:
1281
- if model in model_set:
1282
- continue
1283
- min_model_set = set(self.get_minimum_model_set(model))
1284
- model_set = model_set.union(min_model_set)
1285
- if models_to_ignore is not None:
1286
- model_set = model_set.difference(set(models_to_ignore))
1287
- models_to_load = list(model_set)
1288
- subgraph = nx.DiGraph(nx.subgraph(self.model_graph, models_to_load)) # Wrap subgraph in DiGraph to unfreeze it
1289
- # For model in models_to_ignore, remove model node from graph and all ancestors that have no remaining descendants and are not in `models`
1290
- models_to_ignore = [model for model in models_to_load if (model not in models) and (not list(subgraph.successors(model)))]
1291
- while models_to_ignore:
1292
- model = models_to_ignore[0]
1293
- predecessors = list(subgraph.predecessors(model))
1294
- subgraph.remove_node(model)
1295
- models_to_ignore = models_to_ignore[1:]
1296
- for predecessor in predecessors:
1297
- if (predecessor not in models) and (not list(subgraph.successors(predecessor))) and (predecessor not in models_to_ignore):
1298
- models_to_ignore.append(predecessor)
1299
-
1300
- # Get model prediction order
1301
- return list(nx.lexicographical_topological_sort(subgraph))
1302
-
1303
- def get_models_attribute_dict(self, attribute: str, models: list | None = None) -> dict[str, Any]:
1304
- """Returns dictionary of model name -> attribute value for the provided attribute.
1305
- """
1306
- models_attribute_dict = nx.get_node_attributes(self.model_graph, attribute)
1307
- if models is not None:
1308
- model_names = []
1309
- for model in models:
1310
- if not isinstance(model, str):
1311
- model = model.name
1312
- model_names.append(model)
1313
- if attribute == "path":
1314
- models_attribute_dict = {key: os.path.join(*val) for key, val in models_attribute_dict.items() if key in model_names}
1315
- else:
1316
- models_attribute_dict = {key: val for key, val in models_attribute_dict.items() if key in model_names}
1317
- return models_attribute_dict
1318
-
1319
- # TODO: Consider adding persist to disk functionality for pred_proba dictionary to lessen memory burden on large multiclass problems.
1320
- # For datasets with 100+ classes, this function could potentially run the system OOM due to each pred_proba numpy array taking significant amounts of space.
1321
- # This issue already existed in the previous level-based version but only had the minimum required predictions in memory at a time, whereas this has all model predictions in memory.
1322
- # TODO: Add memory optimal topological ordering -> Minimize amount of pred_probas in memory at a time, delete pred probas that are no longer required
1323
- def get_model_pred_proba_dict(
1324
- self,
1325
- X: pd.DataFrame,
1326
- models: list[str],
1327
- model_pred_proba_dict: dict = None,
1328
- model_pred_time_dict: dict = None,
1329
- record_pred_time: bool = False,
1330
- use_val_cache: bool = False,
1331
- ):
1332
- """
1333
- Optimally computes pred_probas (or predictions if regression) for each model in `models`.
1334
- Will compute each necessary model only once and store predictions in a `model_pred_proba_dict` dictionary.
1335
- Note: Mutates model_pred_proba_dict and model_pred_time_dict input if present to minimize memory usage.
1336
-
1337
- Parameters
1338
- ----------
1339
- X : pd.DataFrame
1340
- Input data to predict on.
1341
- models : list[str]
1342
- The list of models to predict with.
1343
- Note that if models have dependency models, their dependencies will also be predicted with and included in the output.
1344
- model_pred_proba_dict : dict, optional
1345
- A dict of predict_probas that could have been computed by a prior call to `get_model_pred_proba_dict` to avoid redundant computations.
1346
- Models already present in model_pred_proba_dict will not be predicted on.
1347
- get_model_pred_proba_dict(X, models=['A', 'B', 'C']) is equivalent to
1348
- get_model_pred_proba_dict(X, models=['C'], model_pred_proba_dict=get_model_pred_proba_dict(X, models=['A', 'B']))
1349
- Note: Mutated in-place to minimize memory usage
1350
- model_pred_time_dict : dict, optional
1351
- If `record_pred_time==True`, this is a dict of model name to marginal time taken in seconds for the prediction of X.
1352
- Must be specified alongside `model_pred_proba_dict` if `record_pred_time=True` and `model_pred_proba_dict != None`.
1353
- Ignored if `record_pred_time=False`.
1354
- Note: Mutated in-place to minimize memory usage
1355
- record_pred_time : bool, default = False
1356
- Whether to store marginal inference times of each model as an extra output `model_pred_time_dict`.
1357
- use_val_cache : bool, default = False
1358
- Whether to fetch cached val prediction probabilities for models instead of predicting on the data.
1359
- Only set to True if X is equal to the validation data and you want to skip live predictions.
1360
-
1361
- Returns
1362
- -------
1363
- If `record_pred_time==True`, outputs a tuple of dicts (model_pred_proba_dict, model_pred_time_dict); otherwise outputs only model_pred_proba_dict.
1364
- """
1365
- if model_pred_proba_dict is None:
1366
- model_pred_proba_dict = {}
1367
- if model_pred_time_dict is None:
1368
- model_pred_time_dict = {}
1369
-
1370
- if use_val_cache:
1371
- _, model_pred_proba_dict = self._update_pred_proba_dict_with_val_cache(model_set=set(models), model_pred_proba_dict=model_pred_proba_dict)
1372
- if not model_pred_proba_dict:
1373
- model_pred_order = self._construct_model_pred_order(models)
1374
- else:
1375
- model_pred_order = self._construct_model_pred_order_with_pred_dict(models, models_to_ignore=list(model_pred_proba_dict.keys()))
1376
- if use_val_cache:
1377
- model_set, model_pred_proba_dict = self._update_pred_proba_dict_with_val_cache(
1378
- model_set=set(model_pred_order), model_pred_proba_dict=model_pred_proba_dict
1379
- )
1380
- model_pred_order = [model for model in model_pred_order if model in model_set]
1381
-
1382
- # Compute model predictions in topological order
1383
- for model_name in model_pred_order:
1384
- if record_pred_time:
1385
- time_start = time.time()
1386
-
1387
- model = self.load_model(model_name=model_name)
1388
- if isinstance(model, StackerEnsembleModel):
1389
- preprocess_kwargs = dict(infer=False, model_pred_proba_dict=model_pred_proba_dict)
1390
- model_pred_proba_dict[model_name] = model.predict_proba(X, **preprocess_kwargs)
1391
- else:
1392
- model_pred_proba_dict[model_name] = model.predict_proba(X)
1393
-
1394
- if record_pred_time:
1395
- time_end = time.time()
1396
- model_pred_time_dict[model_name] = time_end - time_start
1397
-
1398
- if record_pred_time:
1399
- return model_pred_proba_dict, model_pred_time_dict
1400
- else:
1401
- return model_pred_proba_dict
1402
-
1403
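The equivalence stated in the docstring boils down to a compute-once cache keyed by model name. A self-contained sketch of that pattern, with hypothetical predict functions standing in for real models:

def predict_all(X, models, predict_fns, pred_proba_dict=None):
    # Mirrors get_model_pred_proba_dict: models already present in the dict are skipped.
    pred_proba_dict = {} if pred_proba_dict is None else pred_proba_dict
    for name in models:
        if name not in pred_proba_dict:
            pred_proba_dict[name] = predict_fns[name](X)
    return pred_proba_dict

fns = {"A": lambda X: [0.1] * len(X), "B": lambda X: [0.9] * len(X)}
cache = predict_all([1, 2, 3], ["A"], fns)
cache = predict_all([1, 2, 3], ["A", "B"], fns, pred_proba_dict=cache)  # "A" is not recomputed
print(sorted(cache))  # ['A', 'B']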
- def get_model_oof_dict(self, models: list[str]) -> dict:
1404
- """
1405
- Returns a dictionary of out-of-fold prediction probabilities, keyed by model name
1406
- """
1407
- return {model: self.get_model_oof(model) for model in models}
1408
-
1409
- def get_model_pred_dict(self, X: pd.DataFrame, models: list[str], record_pred_time: bool = False, **kwargs):
1410
- """
1411
- Optimally computes predictions for each model in `models`.
1412
- Will compute each necessary model only once and store predictions in a `model_pred_dict` dictionary.
1413
- Note: Mutates model_pred_proba_dict and model_pred_time_dict input if present to minimize memory usage.
1414
-
1415
- Acts as a wrapper to `self.get_model_pred_proba_dict`, converting the output to predictions.
1416
-
1417
- Parameters
1418
- ----------
1419
- X : pd.DataFrame
1420
- Input data to predict on.
1421
- models : list[str]
1422
- The list of models to predict with.
1423
- Note that if models have dependency models, their dependencies will also be predicted with and included in the output.
1424
- record_pred_time : bool, default = False
1425
- Whether to store marginal inference times of each model as an extra output `model_pred_time_dict`.
1426
- **kwargs : dict, optional
1427
- Refer to `self.get_model_pred_proba_dict` for documentation of remaining arguments.
1428
- This method shares identical arguments.
1429
-
1430
- Returns
1431
- -------
1432
- If `record_pred_time==True`, outputs a tuple of dicts (model_pred_dict, model_pred_time_dict); otherwise outputs only model_pred_dict.
1433
- """
1434
- model_pred_proba_dict = self.get_model_pred_proba_dict(X=X, models=models, record_pred_time=record_pred_time, **kwargs)
1435
- if record_pred_time:
1436
- model_pred_proba_dict, model_pred_time_dict = model_pred_proba_dict
1437
- else:
1438
- model_pred_time_dict = None
1439
-
1440
- model_pred_dict = {}
1441
- for m in model_pred_proba_dict:
1442
- # Convert pred_proba to pred
1443
- model_pred_dict[m] = get_pred_from_proba(y_pred_proba=model_pred_proba_dict[m], problem_type=self.problem_type)
1444
-
1445
- if record_pred_time:
1446
- return model_pred_dict, model_pred_time_dict
1447
- else:
1448
- return model_pred_dict
1449
-
1450
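For multiclass problems, the proba-to-prediction conversion performed by get_pred_from_proba is essentially an argmax over the class axis (label decoding aside); a minimal numpy illustration:

import numpy as np

y_pred_proba = np.array([[0.2, 0.7, 0.1],
                         [0.6, 0.3, 0.1]])
y_pred = y_pred_proba.argmax(axis=1)  # index of the most probable class per row
print(y_pred)  # [1 0]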
- def get_model_oof(self, model: str, use_refit_parent: bool = False) -> np.ndarray:
1451
- """
1452
- Gets the out-of-fold prediction probabilities for a bagged ensemble model.
1453
-
1454
- Parameters
1455
- ----------
1456
- model : str
1457
- Name of the model to get OOF.
1458
- use_refit_parent : bool, default = False
1459
- If True and the model is a refit model, will instead return the parent model's OOF.
1460
- If False and the model is a refit model, an exception will be raised.
1461
-
1462
- Returns
1463
- -------
1464
- np.ndarray
1465
- model OOF prediction probabilities (if classification) or predictions (if regression)
1466
- """
1467
- if use_refit_parent and self.get_model_attribute(model=model, attribute="refit_full", default=False):
1468
- model = self.get_model_attribute(model=model, attribute="refit_full_parent")
1469
- model_type = self.get_model_attribute(model=model, attribute="type")
1470
- if issubclass(model_type, BaggedEnsembleModel):
1471
- model_path = self.get_model_attribute(model=model, attribute="path")
1472
- return model_type.load_oof(path=os.path.join(self.path, model_path))
1473
- else:
1474
- raise AssertionError(f"Model {model} must be a BaggedEnsembleModel to return oof_pred_proba")
1475
-
1476
- def get_model_learning_curves(self, model: str) -> dict:
1477
- model_type = self.get_model_attribute(model=model, attribute="type")
1478
- model_path = self.get_model_attribute(model=model, attribute="path")
1479
- return model_type.load_learning_curves(path=os.path.join(self.path, model_path))
1480
-
1481
- def _update_pred_proba_dict_with_val_cache(self, model_set: set, model_pred_proba_dict):
1482
- """For each model in model_set, check if y_pred_proba_val is cached to disk. If so, load and add it to model_pred_proba_dict"""
1483
- for model in model_set:
1484
- y_pred_proba = self.get_model_attribute(model, attribute="cached_y_pred_proba_val", default=None)
1485
- if isinstance(y_pred_proba, bool):
1486
- if y_pred_proba:
1487
- try:
1488
- y_pred_proba = self._load_model_y_pred_proba_val(model)
1489
- except FileNotFoundError:
1490
- y_pred_proba = None
1491
- else:
1492
- y_pred_proba = None
1493
- if y_pred_proba is not None:
1494
- model_pred_proba_dict[model] = y_pred_proba
1495
- model_set = model_set.difference(set(model_pred_proba_dict.keys()))
1496
- return model_set, model_pred_proba_dict
1497
-
1498
- def get_inputs_to_stacker(
1499
- self,
1500
- X: pd.DataFrame,
1501
- *,
1502
- model: str | None = None,
1503
- base_models: list[str] | None = None,
1504
- model_pred_proba_dict: Optional[dict] = None,
1505
- fit: bool = False,
1506
- use_orig_features: bool = True,
1507
- use_val_cache: bool = False,
1508
- ) -> pd.DataFrame:
1509
- """
1510
- Returns the valid X input for a stacker model with base models equal to `base_models`.
1511
- Pairs with `feature_metadata = self.get_feature_metadata(...)`. The contents of the returned `X` should reflect `feature_metadata`.
1512
-
1513
- Parameters
1514
- ----------
1515
- X : pd.DataFrame
1516
- Input data to augment.
1517
- model : str, default = None
1518
- The model to derive `base_models` from.
1519
- Cannot be specified alongside `base_models`.
1520
- base_models : list[str], default = None
1521
- The list of base models to augment X with.
1522
- Base models will add their prediction probabilities as extra features to X.
1523
- Cannot be specified alongside `model`.
1524
- model_pred_proba_dict : dict, optional
1525
- A dict of predict_probas that could have been computed by a prior call to `get_model_pred_proba_dict` to avoid redundant computations.
1526
- Models already present in model_pred_proba_dict will not be predicted on.
1527
- Note: Mutated in-place to minimize memory usage
1528
- fit : bool, default = False
1529
- If True, X represents the training data and the models will return their out-of-fold prediction probabilities.
1530
- If False, X represents validation or test data and the models will predict directly on X to generate their prediction probabilities.
1531
- use_orig_features : bool, default = True
1532
- If True, the output DataFrame will include X's original features in addition to the new stack features.
1533
- If False, the output DataFrame will only contain the new stack features.
1534
- use_val_cache : bool, default = False
1535
- Whether to fetch cached val prediction probabilities for models instead of predicting on the data.
1536
- Only set to True if X is equal to the validation data and you want to skip live predictions.
1537
-
1538
- Returns
1539
- -------
1540
- X : DataFrame, an updated DataFrame with the additional stack features from `base_models`.
1541
- """
1542
- if model is not None and base_models is not None:
1543
- raise AssertionError("Only one of `model`, `base_models` is allowed to be set.")
1544
-
1545
- if model is not None and base_models is None:
1546
- base_models = self.get_base_model_names(model)
1547
- if not base_models:
1548
- return X
1549
- if fit:
1550
- model_pred_proba_dict = self.get_model_oof_dict(models=base_models)
1551
- else:
1552
- model_pred_proba_dict = self.get_model_pred_proba_dict(
1553
- X=X, models=base_models, model_pred_proba_dict=model_pred_proba_dict, use_val_cache=use_val_cache
1554
- )
1555
- pred_proba_list = [model_pred_proba_dict[model] for model in base_models]
1556
- stack_column_names, _ = self._get_stack_column_names(models=base_models)
1557
- X_stacker = convert_pred_probas_to_df(pred_proba_list=pred_proba_list, problem_type=self.problem_type, columns=stack_column_names, index=X.index)
1558
- if use_orig_features:
1559
- X = pd.concat([X_stacker, X], axis=1)
1560
- else:
1561
- X = X_stacker
1562
- return X
1563
-
1564
- def get_feature_metadata(self, use_orig_features: bool = True, model: str | None = None, base_models: list[str] | None = None) -> FeatureMetadata:
1565
- """
1566
- Returns the FeatureMetadata input to a `model.fit` call.
1567
- Pairs with `X = self.get_inputs_to_stacker(...)`. The returned FeatureMetadata should reflect the contents of `X`.
1568
-
1569
- Parameters
1570
- ----------
1571
- use_orig_features : bool, default = True
1572
- If True, will include the original features in the FeatureMetadata.
1573
- If False, will only include the stack features in the FeatureMetadata.
1574
- model : str, default = None
1575
- If specified, it must be an already existing model.
1576
- `base_models` will be set to the base models of `model`.
1577
- base_models : list[str], default = None
1578
- If specified, will add the stack features of the `base_models` to FeatureMetadata.
1579
-
1580
- Returns
1581
- -------
1582
- FeatureMetadata
1583
- The FeatureMetadata that should be passed into a `model.fit` call.
1584
- """
1585
- if model is not None and base_models is not None:
1586
- raise AssertionError("Only one of `model`, `base_models` is allowed to be set.")
1587
- if model is not None and base_models is None:
1588
- base_models = self.get_base_model_names(model)
1589
-
1590
- feature_metadata = None
1591
- if use_orig_features:
1592
- feature_metadata = self.feature_metadata
1593
- if base_models:
1594
- stack_column_names, _ = self._get_stack_column_names(models=base_models)
1595
- stacker_type_map_raw = {column: R_FLOAT for column in stack_column_names}
1596
- stacker_type_group_map_special = {S_STACK: stack_column_names}
1597
- stacker_feature_metadata = FeatureMetadata(type_map_raw=stacker_type_map_raw, type_group_map_special=stacker_type_group_map_special)
1598
- if feature_metadata is not None:
1599
- feature_metadata = feature_metadata.join_metadata(stacker_feature_metadata)
1600
- else:
1601
- feature_metadata = stacker_feature_metadata
1602
- if feature_metadata is None:
1603
- feature_metadata = FeatureMetadata(type_map_raw={})
1604
- return feature_metadata
1605
-
1606
- def _get_stack_column_names(self, models: list[str]) -> tuple[list[str], int]:
1607
- """
1608
- Get the stack column names generated when the provided models are used as base models in a stack ensemble.
1609
- Additionally output the number of columns per model as an int.
1610
- """
1611
- if self.problem_type in [MULTICLASS, SOFTCLASS]:
1612
- stack_column_names = [stack_column_prefix + "_" + str(cls) for stack_column_prefix in models for cls in range(self.num_classes)]
1613
- num_columns_per_model = self.num_classes
1614
- elif self.problem_type == QUANTILE:
1615
- stack_column_names = [stack_column_prefix + "_" + str(q) for stack_column_prefix in models for q in self.quantile_levels]
1616
- num_columns_per_model = len(self.quantile_levels)
1617
- else:
1618
- stack_column_names = models
1619
- num_columns_per_model = 1
1620
- return stack_column_names, num_columns_per_model
1621
-
1622
- # You must have previously called fit() with cache_data=True
1623
- # Fits _FULL versions of specified models, but does NOT link them (_FULL stackers will still use normal models as input)
1624
- def refit_single_full(
1625
- self,
1626
- X=None,
1627
- y=None,
1628
- X_val=None,
1629
- y_val=None,
1630
- X_unlabeled=None,
1631
- models=None,
1632
- fit_strategy: Literal["sequential", "parallel"] = "sequential",
1633
- **kwargs,
1634
- ) -> list[str]:
1635
- if fit_strategy == "parallel":
1636
- logger.log(30, f"Note: refit_full does not yet support fit_strategy='parallel', switching to 'sequential'...")
1637
- fit_strategy = "sequential"
1638
- if X is None:
1639
- X = self.load_X()
1640
- if X_val is None:
1641
- X_val = self.load_X_val()
1642
- if y is None:
1643
- y = self.load_y()
1644
- if y_val is None:
1645
- y_val = self.load_y_val()
1646
-
1647
- if models is None:
1648
- models = self.get_model_names()
1649
-
1650
- model_levels = dict()
1651
- ignore_models = []
1652
- ignore_stack_names = [REFIT_FULL_NAME]
1653
- for stack_name in ignore_stack_names:
1654
- ignore_models += self.get_model_names(stack_name=stack_name) # get_model_names returns [] if stack_name does not exist
1655
- models = [model for model in models if model not in ignore_models]
1656
- for model in models:
1657
- model_level = self.get_model_level(model)
1658
- if model_level not in model_levels:
1659
- model_levels[model_level] = []
1660
- model_levels[model_level].append(model)
1661
-
1662
- levels = sorted(model_levels.keys())
1663
- models_trained_full = []
1664
- model_refit_map = {} # FIXME: is this even used, remove?
1665
-
1666
- if fit_strategy == "sequential":
1667
- for level in levels:
1668
- models_level = model_levels[level]
1669
- for model in models_level:
1670
- model_name, models_trained = _detached_refit_single_full(
1671
- _self=self,
1672
- model=model,
1673
- X=X,
1674
- y=y,
1675
- X_val=X_val,
1676
- y_val=y_val,
1677
- X_unlabeled=X_unlabeled,
1678
- level=level,
1679
- kwargs=kwargs,
1680
- fit_strategy=fit_strategy,
1681
- )
1682
- if len(models_trained) == 1:
1683
- model_refit_map[model_name] = models_trained[0]
1684
- for model_trained in models_trained:
1685
- self._update_model_attr(
1686
- model_trained,
1687
- refit_full=True,
1688
- refit_full_parent=model_name,
1689
- refit_full_parent_val_score=self.get_model_attribute(model_name, "val_score"),
1690
- )
1691
- models_trained_full += models_trained
1692
- elif fit_strategy == "parallel":
1693
- # -- Parallel refit
1694
- ray = try_import_ray()
1695
-
1696
- # FIXME: Need a common utility class for initializing ray so we don't duplicate code
1697
- if not ray.is_initialized():
1698
- ray.init(log_to_driver=False, logging_level=logging.ERROR)
1699
-
1700
- distributed_manager = ParallelFitManager(
1701
- mode="refit",
1702
- func=_remote_refit_single_full,
1703
- func_kwargs=dict(fit_strategy=fit_strategy),
1704
- func_put_kwargs=dict(
1705
- _self=self,
1706
- X=X,
1707
- y=y,
1708
- X_val=X_val,
1709
- y_val=y_val,
1710
- X_unlabeled=X_unlabeled,
1711
- kwargs=kwargs,
1712
- ),
1713
- # TODO: check if this is available in the kwargs
1714
- num_cpus=kwargs.get("total_resources", {}).get("num_cpus", 1),
1715
- num_gpus=kwargs.get("total_resources", {}).get("num_gpus", 0),
1716
- get_model_attribute_func=self.get_model_attribute,
1717
- X=X,
1718
- y=y,
1719
- )
1720
-
1721
- for level in levels:
1722
- models_trained_full_level = []
1723
- distributed_manager.job_kwargs["level"] = level
1724
- models_level = model_levels[level]
1725
-
1726
- logger.log(20, f"Scheduling distributed model-workers for refitting {len(models_level)} L{level} models...")
1727
- unfinished_job_refs = distributed_manager.schedule_jobs(models_to_fit=models_level)
1728
-
1729
- while unfinished_job_refs:
1730
- finished, unfinished_job_refs = ray.wait(unfinished_job_refs, num_returns=1)
1731
- refit_full_parent, model_trained, model_path, model_type = ray.get(finished[0])
1732
-
1733
- self._add_model(
1734
- model_type.load(path=os.path.join(self.path, model_path), reset_paths=self.reset_paths),
1735
- stack_name=REFIT_FULL_NAME,
1736
- level=level,
1737
- _is_refit=True
1738
- )
1739
- model_refit_map[refit_full_parent] = model_trained
1740
- self._update_model_attr(
1741
- model_trained,
1742
- refit_full=True,
1743
- refit_full_parent=refit_full_parent,
1744
- refit_full_parent_val_score=self.get_model_attribute(refit_full_parent, "val_score"),
1745
- )
1746
- models_trained_full_level.append(model_trained)
1747
-
1748
- logger.log(20,f"Finished refit model for {refit_full_parent}")
1749
- unfinished_job_refs += distributed_manager.schedule_jobs()
1750
-
1751
- logger.log(20, f"Finished distributed refitting for {len(models_trained_full_level)} L{level} models.")
1752
- models_trained_full += models_trained_full_level
1753
- distributed_manager.clean_job_state(unfinished_job_refs=unfinished_job_refs)
1754
-
1755
- distributed_manager.clean_up_ray()
1756
- else:
1757
- raise ValueError(f"Invalid value for fit_strategy: '{fit_strategy}'")
1758
-
1759
- keys_to_del = []
1760
- for model in model_refit_map.keys():
1761
- if model_refit_map[model] not in models_trained_full:
1762
- keys_to_del.append(model)
1763
- for key in keys_to_del:
1764
- del model_refit_map[key]
1765
- self.save() # TODO: This could be more efficient by passing in arg to not save if called by refit_ensemble_full since it saves anyways later.
1766
- return models_trained_full
1767
-
1768
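The level bucketing in refit_single_full ensures base models are refit before the stackers that consume them. A minimal sketch of that grouping, with invented model names and levels:

model_levels = {}
for model, level in [("LightGBM_L1", 1), ("CatBoost_L1", 1), ("WeightedEnsemble_L2", 2)]:
    model_levels.setdefault(level, []).append(model)
for level in sorted(model_levels):  # lowest stack level first
    print(level, model_levels[level])
# 1 ['LightGBM_L1', 'CatBoost_L1']
# 2 ['WeightedEnsemble_L2']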
- # Fits _FULL models and links them in the stack so _FULL models only use other _FULL models as input during stacking
1769
- # If model is specified, will fit all _FULL models that are ancestors of the provided model, automatically linking them.
1770
- # If no model is specified, all models are refit and linked appropriately.
1771
- def refit_ensemble_full(self, model: str | list[str] = "all", **kwargs) -> dict:
1772
- if model == "all":
1773
- ensemble_set = self.get_model_names()
1774
- elif isinstance(model, list):
1775
- ensemble_set = self.get_minimum_models_set(model)
1776
- else:
1777
- if model == "best":
1778
- model = self.get_model_best()
1779
- ensemble_set = self.get_minimum_model_set(model)
1780
- existing_models = self.get_model_names()
1781
- ensemble_set_valid = []
1782
- model_refit_map = self.model_refit_map()
1783
- for model in ensemble_set:
1784
- if model in model_refit_map and model_refit_map[model] in existing_models:
1785
- logger.log(20, f"Model '{model}' already has a refit _FULL model: '{model_refit_map[model]}', skipping refit...")
1786
- else:
1787
- ensemble_set_valid.append(model)
1788
- if ensemble_set_valid:
1789
- models_trained_full = self.refit_single_full(models=ensemble_set_valid, **kwargs)
1790
- else:
1791
- models_trained_full = []
1792
-
1793
- model_refit_map = self.model_refit_map()
1794
- for model_full in models_trained_full:
1795
- # TODO: Consider moving base model info to a separate pkl file so that it can be edited without having to load/save the model again
1796
- # Downside: Slower inference speed when models are not persisted in memory prior.
1797
- model_loaded = self.load_model(model_full)
1798
- if isinstance(model_loaded, StackerEnsembleModel):
1799
- for stack_column_prefix in model_loaded.stack_column_prefix_lst:
1800
- base_model = model_loaded.stack_column_prefix_to_model_map[stack_column_prefix]
1801
- new_base_model = model_refit_map[base_model]
1802
- new_base_model_type = self.get_model_attribute(model=new_base_model, attribute="type")
1803
- new_base_model_path = self.get_model_attribute(model=new_base_model, attribute="path")
1804
-
1805
- model_loaded.base_model_paths_dict[new_base_model] = new_base_model_path
1806
- model_loaded.base_model_types_dict[new_base_model] = new_base_model_type
1807
- model_loaded.base_model_names.append(new_base_model)
1808
- model_loaded.stack_column_prefix_to_model_map[stack_column_prefix] = new_base_model
1809
-
1810
- model_loaded.save() # TODO: Avoid this!
1811
-
1812
- # Remove old edges and add new edges
1813
- edges_to_remove = list(self.model_graph.in_edges(model_loaded.name))
1814
- self.model_graph.remove_edges_from(edges_to_remove)
1815
- if isinstance(model_loaded, StackerEnsembleModel):
1816
- for stack_column_prefix in model_loaded.stack_column_prefix_lst:
1817
- base_model_name = model_loaded.stack_column_prefix_to_model_map[stack_column_prefix]
1818
- self.model_graph.add_edge(base_model_name, model_loaded.name)
1819
-
1820
- self.save()
1821
- return self.model_refit_map()
1822
-
1823
- def get_refit_full_parent(self, model: str) -> str:
1824
- """Get refit full model's parent. If model does not have a parent, return `model`."""
1825
- return self.get_model_attribute(model=model, attribute="refit_full_parent", default=model)
1826
-
1827
- def get_model_best(
1828
- self,
1829
- can_infer: bool | None = None,
1830
- allow_full: bool = True,
1831
- infer_limit: float | None = None,
1832
- infer_limit_as_child: bool = False
1833
- ) -> str:
1834
- """
1835
- Returns the name of the model with the best validation score that satisfies all specified constraints.
1836
- If no model satisfies the constraints, an AssertionError will be raised.
1837
-
1838
- Parameters
1839
- ----------
1840
- can_infer: bool, default = None
1841
- If True, only consider models that can infer.
1842
- If False, only consider models that can't infer.
1843
- If None, consider all models.
1844
- allow_full: bool, default = True
1845
- If True, consider all models.
1846
- If False, disallow refit_full models.
1847
- infer_limit: float, default = None
1848
- The maximum time in seconds per sample that a model is allowed to take during inference.
1849
- If None, consider all models.
1850
- If specified, consider only models that have a lower predict time per sample than `infer_limit`.
1851
- infer_limit_as_child: bool, default = False
1852
- If True, use the predict time per sample of the (theoretical) refit version of the model.
1853
- If the model is already refit, the predict time per sample is unchanged.
1854
- If False, use the predict time per sample of the model.
1855
-
1856
- Returns
1857
- -------
1858
- model: str
1859
- The string name of the model with the best metric score that satisfies all constraints.
1860
- """
1861
- models = self.get_model_names(can_infer=can_infer)
1862
- if not models:
1863
- raise AssertionError("Trainer has no fit models that can infer.")
1864
- models_full = self.get_models_attribute_dict(models=models, attribute="refit_full_parent")
1865
- if not allow_full:
1866
- models = [model for model in models if model not in models_full]
1867
-
1868
- predict_1_time_attribute = None
1869
- if infer_limit is not None:
1870
- if infer_limit_as_child:
1871
- predict_1_time_attribute = "predict_1_child_time"
1872
- else:
1873
- predict_1_time_attribute = "predict_1_time"
1874
- models_predict_1_time = self.get_models_attribute_full(models=models, attribute=predict_1_time_attribute)
1875
- models_og = copy.deepcopy(models)
1876
- for model_key in models_predict_1_time:
1877
- if models_predict_1_time[model_key] is None or models_predict_1_time[model_key] > infer_limit:
1878
- models.remove(model_key)
1879
- if models_og and not models:
1880
- # get the fastest model
1881
- models_predict_time_list = [models_predict_1_time[m] for m in models_og]
1882
- min_time = np.array(models_predict_time_list).min()
1883
- infer_limit_new = min_time * 1.2 # Give 20% leeway
1884
- logger.log(30, f"WARNING: Impossible to satisfy infer_limit constraint. Relaxing constraint from {infer_limit} to {infer_limit_new} ...")
1885
- models = models_og
1886
- for model_key in models_predict_1_time:
1887
- if models_predict_1_time[model_key] > infer_limit_new:
1888
- models.remove(model_key)
1889
- if not models:
1890
- raise AssertionError(
1891
- f"Trainer has no fit models that can infer while satisfying the constraints: (infer_limit={infer_limit}, allow_full={allow_full})."
1892
- )
1893
- model_performances = self.get_models_attribute_dict(models=models, attribute="val_score")
1894
-
1895
- predict_time_attr = predict_1_time_attribute if predict_1_time_attribute is not None else "predict_time"
1896
- models_predict_time = self.get_models_attribute_full(models=models, attribute=predict_time_attr)
1897
-
1898
- perfs = [(m, model_performances[m], models_predict_time[m]) for m in models if model_performances[m] is not None]
1899
- if not perfs:
1900
- models = [m for m in models if m in models_full]
1901
- perfs = [(m, self.get_model_attribute(model=m, attribute="refit_full_parent_val_score"), models_predict_time[m]) for m in models]
1902
- if not perfs:
1903
- raise AssertionError("No fit models that can infer exist with a validation score to choose the best model.")
1904
- elif not allow_full:
1905
- raise AssertionError(
1906
- "No fit models that can infer exist with a validation score to choose the best model, but refit_full models exist. Set `allow_full=True` to get the best refit_full model."
1907
- )
1908
- return max(perfs, key=lambda i: (i[1], -i[2]))[0]
1909
-
1910
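The final line of get_model_best maximizes validation score and breaks ties by lower predict time via the (score, -predict_time) key; with invented values:

perfs = [("LightGBM_L1", 0.92, 0.5), ("CatBoost_L1", 0.92, 0.2), ("KNN_L1", 0.90, 0.1)]
best = max(perfs, key=lambda i: (i[1], -i[2]))[0]
print(best)  # 'CatBoost_L1' -- tied on score with LightGBM_L1, but faster to predict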
- def save_model(self, model, reduce_memory=True):
1911
- # TODO: In future perhaps give option for the reduce_memory_size arguments, perhaps trainer level variables specified by user?
1912
- if reduce_memory:
1913
- model.reduce_memory_size(remove_fit=True, remove_info=False, requires_save=True)
1914
- if self.low_memory:
1915
- model.save()
1916
- else:
1917
- self.models[model.name] = model
1918
-
1919
- def save(self) -> None:
1920
- models = self.models
1921
- if self.low_memory:
1922
- self.models = {}
1923
- save_pkl.save(path=os.path.join(self.path, self.trainer_file_name), object=self)
1924
- if self.low_memory:
1925
- self.models = models
1926
-
1927
- def compile(self, model_names="all", with_ancestors=False, compiler_configs=None) -> list[str]:
1928
- """
1929
- Compile a list of models for accelerated prediction.
1930
-
1931
- Parameters
1932
- ----------
1933
- model_names : str or list
1934
- A list of model names for model compilation. Alternatively, this can be 'all' or 'best'.
1935
- with_ancestors : bool, default = False
- If True, all ancestor models of the requested models are compiled as well.
- compiler_configs : dict, default = None
1936
- Model specific compiler options.
1937
- This can be useful to specify the compiler backend for a specific model,
1938
- e.g. {"RandomForest": {"compiler": "onnx"}}
1939
- """
1940
- if model_names == "all":
1941
- model_names = self.get_model_names(can_infer=True)
1942
- elif model_names == "best":
1943
- if self.model_best is not None:
1944
- model_names = [self.model_best]
1945
- else:
1946
- model_names = [self.get_model_best(can_infer=True)]
1947
- if not isinstance(model_names, list):
1948
- raise ValueError(f"model_names must be a list of model names. Invalid value: {model_names}")
1949
- if with_ancestors:
1950
- model_names = self.get_minimum_models_set(model_names)
1951
-
1952
- logger.log(20, f"Compiling {len(model_names)} Models ...")
1953
- total_compile_time = 0
1954
-
1955
- model_names_to_compile = []
1956
- model_names_to_configs_dict = dict()
1957
- for model_name in model_names:
1958
- model_type_inner = self.get_model_attribute(model_name, "type_inner")
1959
- # Get model specific compiler options
1960
- # Compiler options can be keyed by either the model's type or its name as a string
1961
- if model_name in compiler_configs:
1962
- config = compiler_configs[model_name]
1963
- elif model_type_inner in compiler_configs:
1964
- config = compiler_configs[model_type_inner]
1965
- else:
1966
- config = None
1967
- if config is not None:
1968
- model_names_to_compile.append(model_name)
1969
- model_names_to_configs_dict[model_name] = config
1970
- else:
1971
- logger.log(20, f"Skipping compilation for {model_name} ... (No config specified)")
1972
- for model_name in model_names_to_compile:
1973
- model = self.load_model(model_name)
1974
- config = model_names_to_configs_dict[model_name]
1975
-
1976
- # Check if already compiled, or if can't compile due to missing dependencies,
1977
- # or if model hasn't implemented compiling.
1978
- if "compiler" in config and model.get_compiler_name() == config["compiler"]:
1979
- logger.log(20, f'Skipping compilation for {model_name} ... (Already compiled with "{model.get_compiler_name()}" backend)')
1980
- elif model.can_compile(compiler_configs=config):
1981
- logger.log(20, f"Compiling model: {model.name} ... Config = {config}")
1982
- compile_start_time = time.time()
1983
- model.compile(compiler_configs=config)
1984
- compile_end_time = time.time()
1985
- model.compile_time = compile_end_time - compile_start_time
1986
- compile_type = model.get_compiler_name()
1987
- total_compile_time += model.compile_time
1988
-
1989
- # Update model_graph in order to put compile_time into leaderboard,
1990
- # since models are saved right after training.
1991
- self.model_graph.nodes[model.name]["compile_time"] = model.compile_time
1992
- self.save_model(model, reduce_memory=False)
1993
- logger.log(20, f'\tCompiled model with "{compile_type}" backend ...')
1994
- logger.log(20, f"\t{round(model.compile_time, 2)}s\t = Compile runtime")
1995
- else:
1996
- logger.log(20, f"Skipping compilation for {model.name} ... (Unable to compile with the provided config: {config})")
1997
- logger.log(20, f"Finished compiling models, total runtime = {round(total_compile_time, 2)}s.")
1998
- self.save()
1999
- return model_names
2000
-
2001
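The config lookup in compile tries the model's name first, then its inner type, matching the docstring example. A sketch with strings standing in for both kinds of key:

compiler_configs = {"RandomForest": {"compiler": "onnx"}}

def resolve_config(model_name, model_type_inner):
    # Name takes precedence over inner type; None means "skip compilation".
    if model_name in compiler_configs:
        return compiler_configs[model_name]
    if model_type_inner in compiler_configs:
        return compiler_configs[model_type_inner]
    return None

print(resolve_config("RandomForest_BAG_L1", "RandomForest"))  # {'compiler': 'onnx'}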
- def persist(self, model_names="all", with_ancestors=False, max_memory=None) -> list[str]:
2002
- if model_names == "all":
2003
- model_names = self.get_model_names()
2004
- elif model_names == "best":
2005
- if self.model_best is not None:
2006
- model_names = [self.model_best]
2007
- else:
2008
- model_names = [self.get_model_best(can_infer=True)]
2009
- if not isinstance(model_names, list):
2010
- raise ValueError(f"model_names must be a list of model names. Invalid value: {model_names}")
2011
- if with_ancestors:
2012
- model_names = self.get_minimum_models_set(model_names)
2013
- model_names_already_persisted = [model_name for model_name in model_names if model_name in self.models]
2014
- if model_names_already_persisted:
2015
- logger.log(
2016
- 30,
2017
- f"The following {len(model_names_already_persisted)} models were already persisted and will be ignored in the model loading process: {model_names_already_persisted}",
2018
- )
2019
- model_names = [model_name for model_name in model_names if model_name not in model_names_already_persisted]
2020
- if not model_names:
2021
- logger.log(30, f"No valid unpersisted models were specified to be persisted, so no change in model persistence was performed.")
2022
- return []
2023
- if max_memory is not None:
2024
-
2025
- @disable_if_lite_mode(ret=True)
2026
- def _check_memory():
2027
- info = self.get_models_info(model_names)
2028
- model_mem_size_map = {model: info[model]["memory_size"] for model in model_names}
2029
- for model in model_mem_size_map:
2030
- if "children_info" in info[model]:
2031
- for child in info[model]["children_info"].values():
2032
- model_mem_size_map[model] += child["memory_size"]
2033
- total_mem_required = sum(model_mem_size_map.values())
2034
- available_mem = ResourceManager.get_available_virtual_mem()
2035
- memory_proportion = total_mem_required / available_mem
2036
- if memory_proportion > max_memory:
2037
- logger.log(
2038
- 30,
2039
- f"Models will not be persisted in memory as they are expected to require {round(memory_proportion * 100, 2)}% of memory, which is greater than the specified max_memory limit of {round(max_memory*100, 2)}%.",
2040
- )
2041
- logger.log(
2042
- 30,
2043
- f"\tModels will be loaded on-demand from disk to maintain safe memory usage, increasing inference latency. If inference latency is a concern, try to use smaller models or increase the value of max_memory.",
2044
- )
2045
- return False
2046
- else:
2047
- logger.log(20, f"Persisting {len(model_names)} models in memory. Models will require {round(memory_proportion*100, 2)}% of memory.")
2048
- return True
2049
-
2050
- if not _check_memory():
2051
- return []
2052
-
2053
- models = []
2054
- for model_name in model_names:
2055
- model = self.load_model(model_name)
2056
- self.models[model.name] = model
2057
- models.append(model)
2058
-
2059
- for model in models:
2060
- # TODO: Move this to model code
2061
- if isinstance(model, BaggedEnsembleModel):
2062
- for fold, fold_model in enumerate(model.models):
2063
- if isinstance(fold_model, str):
2064
- model.models[fold] = model.load_child(fold_model)
2065
- return model_names
2066
-
2067
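The max_memory guard compares the total model size (children included) against available virtual memory; a sketch of the arithmetic with made-up sizes:

model_mem_size_map = {"LightGBM_L1": 2.0e9, "CatBoost_L1": 1.5e9}  # bytes, children included
available_mem = 16.0e9
max_memory = 0.4

memory_proportion = sum(model_mem_size_map.values()) / available_mem
print(memory_proportion <= max_memory)  # True -> models are persisted in memory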
- def unpersist(self, model_names="all") -> list:
2068
- if model_names == "all":
2069
- model_names = list(self.models.keys())
2070
- if not isinstance(model_names, list):
2071
- raise ValueError(f"model_names must be a list of model names. Invalid value: {model_names}")
2072
- unpersisted_models = []
2073
- for model in model_names:
2074
- if model in self.models:
2075
- self.models.pop(model)
2076
- unpersisted_models.append(model)
2077
- if unpersisted_models:
2078
- logger.log(20, f"Unpersisted {len(unpersisted_models)} models: {unpersisted_models}")
2079
- else:
2080
- logger.log(30, f"No valid persisted models were specified to be unpersisted, so no change in model persistence was performed.")
2081
- return unpersisted_models
2082
-
2083
- def generate_weighted_ensemble(
2084
- self,
2085
- X,
2086
- y,
2087
- level,
2088
- base_model_names,
2089
- k_fold=1,
2090
- n_repeats=1,
2091
- stack_name=None,
2092
- hyperparameters=None,
2093
- ag_args_fit=None,
2094
- time_limit=None,
2095
- name_suffix: str | None = None,
2096
- save_bag_folds=None,
2097
- check_if_best=True,
2098
- child_hyperparameters=None,
2099
- get_models_func=None,
2100
- total_resources: dict | None = None,
2101
- ) -> list[str]:
2102
- if get_models_func is None:
2103
- get_models_func = self.construct_model_templates
2104
- if len(base_model_names) == 0:
2105
- logger.log(20, "No base models to train on, skipping weighted ensemble...")
2106
- return []
2107
-
2108
- if child_hyperparameters is None:
2109
- child_hyperparameters = {}
2110
-
2111
- if save_bag_folds is None:
2112
- can_infer_dict = self.get_models_attribute_dict("can_infer", models=base_model_names)
2113
- if False in can_infer_dict.values():
2114
- save_bag_folds = False
2115
- else:
2116
- save_bag_folds = True
2117
-
2118
- feature_metadata = self.get_feature_metadata(use_orig_features=False, base_models=base_model_names)
2119
-
2120
- base_model_paths_dict = self.get_models_attribute_dict(attribute="path", models=base_model_names)
2121
- base_model_paths_dict = {key: os.path.join(self.path, val) for key, val in base_model_paths_dict.items()}
2122
- weighted_ensemble_model, _ = get_models_func(
2123
- hyperparameters={
2124
- "default": {
2125
- "ENS_WEIGHTED": [child_hyperparameters],
2126
- }
2127
- },
2128
- ensemble_type=WeightedEnsembleModel,
2129
- ensemble_kwargs=dict(
2130
- base_model_names=base_model_names,
2131
- base_model_paths_dict=base_model_paths_dict,
2132
- base_model_types_dict=self.get_models_attribute_dict(attribute="type", models=base_model_names),
2133
- base_model_types_inner_dict=self.get_models_attribute_dict(attribute="type_inner", models=base_model_names),
2134
- base_model_performances_dict=self.get_models_attribute_dict(attribute="val_score", models=base_model_names),
2135
- hyperparameters=hyperparameters,
2136
- random_state=level + self.random_state,
2137
- ),
2138
- ag_args={"name_bag_suffix": ""},
2139
- ag_args_fit=ag_args_fit,
2140
- ag_args_ensemble={"save_bag_folds": save_bag_folds},
2141
- name_suffix=name_suffix,
2142
- level=level,
2143
- )
2144
- weighted_ensemble_model = weighted_ensemble_model[0]
2145
- w = None
2146
- if self.weight_evaluation:
2147
- X, w = extract_column(X, self.sample_weight)
2148
- models = self._train_multi(
2149
- X=X,
2150
- y=y,
2151
- X_val=None,
2152
- y_val=None,
2153
- models=[weighted_ensemble_model],
2154
- k_fold=k_fold,
2155
- n_repeats=n_repeats,
2156
- hyperparameter_tune_kwargs=None,
2157
- stack_name=stack_name,
2158
- level=level,
2159
- time_limit=time_limit,
2160
- ens_sample_weight=w,
2161
- fit_kwargs=dict(feature_metadata=feature_metadata, num_classes=self.num_classes, groups=None), # FIXME: Is this the right way to do this?
2162
- total_resources=total_resources,
2163
- )
2164
- for weighted_ensemble_model_name in models:
2165
- if check_if_best and weighted_ensemble_model_name in self.get_model_names():
2166
- if self.model_best is None:
2167
- self.model_best = weighted_ensemble_model_name
2168
- else:
2169
- best_score = self.get_model_attribute(self.model_best, "val_score")
2170
- cur_score = self.get_model_attribute(weighted_ensemble_model_name, "val_score")
2171
- if best_score is not None and cur_score > best_score:
2172
- # new best model
2173
- self.model_best = weighted_ensemble_model_name
2174
- return models
2175
-
2176
- def _train_single(
2177
- self,
2178
- X: pd.DataFrame,
2179
- y: pd.Series,
2180
- model: AbstractModel,
2181
- X_val: pd.DataFrame | None = None,
2182
- y_val: pd.Series | None = None,
2183
- X_test: pd.DataFrame | None = None,
2184
- y_test: pd.Series | None = None,
2185
- total_resources: dict | None = None,
2186
- **model_fit_kwargs,
2187
- ) -> AbstractModel:
2188
- """
2189
- Trains model but does not add the trained model to this Trainer.
2190
- Returns trained model object.
2191
- """
2192
- model = model.fit(X=X, y=y, X_val=X_val, y_val=y_val, X_test=X_test, y_test=y_test, total_resources=total_resources, **model_fit_kwargs)
2193
- return model
2194
-
2195
- def _train_and_save(
2196
- self,
2197
- X: pd.DataFrame,
2198
- y: pd.Series,
2199
- model: AbstractModel,
2200
- X_val: pd.DataFrame | None = None,
2201
- y_val: pd.Series | None = None,
2202
- X_test: pd.DataFrame | None = None,
2203
- y_test: pd.Series | None = None,
2204
- X_pseudo: pd.DataFrame | None = None,
2205
- y_pseudo: pd.DataFrame | None = None,
2206
- time_limit: float | None = None,
2207
- stack_name: str = "core",
2208
- level: int = 1,
2209
- compute_score: bool = True,
2210
- total_resources: dict | None = None,
2211
- errors: Literal["ignore", "raise"] = "ignore",
2212
- errors_ignore: list | None = None,
2213
- errors_raise: list | None = None,
2214
- is_ray_worker: bool = False,
2215
- **model_fit_kwargs,
2216
- ) -> list[str]:
2217
- """
2218
- Trains model and saves it to disk, returning a list with a single element: The name of the model, or no elements if training failed.
2219
- If the model name is returned:
2220
- The model can be accessed via self.load_model(model.name).
2221
- The model will have metadata information stored in self.model_graph.
2222
- The model's name will be appended to self.models_level[stack_name][level]
2223
- The model will be accessible and usable through any Trainer function that takes as input 'model' or 'model_name'.
2224
- Note: self._train_and_save should not be used outside of self._train_single_full
2225
-
2226
- Parameters
2227
- ----------
2228
- errors: Literal["ignore", "raise"], default = "ignore"
2229
- Determines how model fit exceptions are handled.
2230
- If "ignore", will ignore all model exceptions during fit. If an exception occurs, an empty list is returned.
2231
- If "raise", will raise the model exception if it occurs.
2232
- Can be overwritten by `errors_ignore` and `errors_raise`.
2233
- errors_ignore: list[str], optional
2234
- The exception types specified in `errors_ignore` will be treated as if `errors="ignore"`.
2235
- errors_raise: list[str], optional
2236
- The exception types specified in `errors_raise` will be treated as if `errors="raise"`.
2237
-
2238
- """
2239
- fit_start_time = time.time()
2240
- model_names_trained = []
2241
- y_pred_proba_val = None
2242
-
2243
- is_distributed_mode = DistributedContext.is_distributed_mode() or is_ray_worker
2244
-
2245
- fit_log_message = f"Fitting model: {model.name} ..."
2246
- if time_limit is not None:
2247
- time_left_total = time_limit
2248
- not_enough_time = False
2249
- if time_limit <= 0:
2250
- not_enough_time = True
2251
- elif self._time_limit is not None and self._time_train_start is not None:
2252
- time_left_total = self._time_limit - (fit_start_time - self._time_train_start)
2253
- # If only a very small amount of time remains, skip training
2254
- min_time_required = min(self._time_limit * 0.01, 10)
2255
- if (time_left_total < min_time_required) and (time_limit < min_time_required):
2256
- not_enough_time = True
2257
- if not_enough_time:
2258
- skip_msg = f"Skipping {model.name} due to lack of time remaining."
2259
- not_enough_time_exception = InsufficientTime(skip_msg)
2260
- if self._check_raise_exception(exception=not_enough_time_exception, errors=errors, errors_ignore=errors_ignore, errors_raise=errors_raise):
2261
- raise not_enough_time_exception
2262
- else:
2263
- logger.log(15, skip_msg)
2264
- return []
2265
- fit_log_message += f" Training model for up to {time_limit:.2f}s of the {time_left_total:.2f}s of remaining time."
2266
- logger.log(10 if is_distributed_mode else 20, fit_log_message)
2267
-
2268
- if isinstance(model, BaggedEnsembleModel) and not compute_score:
2269
- # Do not perform OOF predictions when we don't compute a score.
2270
- model_fit_kwargs["_skip_oof"] = True
2271
-
2272
- model_fit_kwargs = dict(
2273
- model=model,
2274
- X_val=X_val,
2275
- y_val=y_val,
2276
- X_test=X_test,
2277
- y_test=y_test,
2278
- time_limit=time_limit,
2279
- total_resources=total_resources,
2280
- **model_fit_kwargs,
2281
- )
2282
-
2283
- # If the model is neither bagged nor stacked, pseudolabeled data needs to be incorporated at this level.
2284
- # Bagged models perform validation at the fit level, whereas single models do it separately; hence this
2285
- # if statement is required.
2286
- if not isinstance(model, BaggedEnsembleModel) and X_pseudo is not None and y_pseudo is not None and X_pseudo.columns.equals(X.columns):
2287
- assert_pseudo_column_match(X=X, X_pseudo=X_pseudo)
2288
- X_w_pseudo = pd.concat([X, X_pseudo])
2289
- y_w_pseudo = pd.concat([y, y_pseudo])
2290
- logger.log(15, f"{len(X_pseudo)} extra rows of pseudolabeled data added to training set for {model.name}")
2291
- model_fit_kwargs["X"] = X_w_pseudo
2292
- model_fit_kwargs["y"] = y_w_pseudo
2293
- else:
2294
- model_fit_kwargs["X"] = X
2295
- model_fit_kwargs["y"] = y
2296
- if level > 1:
2297
- if X_pseudo is not None and y_pseudo is not None:
2298
- logger.log(15, f"Dropping pseudo in stacking layer due to missing out-of-fold predictions")
2299
- else:
2300
- model_fit_kwargs["X_pseudo"] = X_pseudo
2301
- model_fit_kwargs["y_pseudo"] = y_pseudo
2302
-
2303
- exception = None
2304
- try:
2305
- model = self._train_single(**model_fit_kwargs)
2306
-
2307
- fit_end_time = time.time()
2308
- if self.weight_evaluation:
2309
- w = model_fit_kwargs.get("sample_weight", None)
2310
- w_val = model_fit_kwargs.get("sample_weight_val", None)
2311
- else:
2312
- w = None
2313
- w_val = None
2314
- if not compute_score:
2315
- score = None
2316
- model.predict_time = None
2317
- elif X_val is not None and y_val is not None:
2318
- y_pred_proba_val = model.predict_proba(X_val, record_time=True)
2319
- score = model.score_with_y_pred_proba(y=y_val, y_pred_proba=y_pred_proba_val, sample_weight=w_val)
2320
- elif isinstance(model, BaggedEnsembleModel):
2321
- if model.is_valid_oof() or isinstance(model, WeightedEnsembleModel):
2322
- score = model.score_with_oof(y=y, sample_weight=w)
2323
- else:
2324
- score = None
2325
- else:
2326
- score = None
2327
- pred_end_time = time.time()
2328
- if model.fit_time is None:
2329
- model.fit_time = fit_end_time - fit_start_time
2330
- if model.predict_time is None and score is not None:
2331
- model.predict_time = pred_end_time - fit_end_time
2332
- model.val_score = score
2333
- # TODO: Add recursive=True to avoid repeatedly loading models each time this is called for bagged ensembles (especially during repeated bagging)
2334
- self.save_model(model=model)
2335
- except Exception as exc:
2336
- exception = exc # required to reference exc outside of `except` statement
2337
- del_model = True
2338
- if isinstance(exception, TimeLimitExceeded):
2339
- logger.log(20, f"\tTime limit exceeded... Skipping {model.name}.")
2340
- elif isinstance(exception, NotEnoughMemoryError):
2341
- logger.warning(f"\tNot enough memory to train {model.name}... Skipping this model.")
2342
- elif isinstance(exception, NoStackFeatures):
2343
- logger.warning(f"\tNo stack features to train {model.name}... Skipping this model. {exception}")
2344
- elif isinstance(exception, NotValidStacker):
2345
- logger.warning(f"\tStacking disabled for {model.name}... Skipping this model. {exception}")
2346
- elif isinstance(exception, NoValidFeatures):
2347
- logger.warning(f"\tNo valid features to train {model.name}... Skipping this model.")
2348
- elif isinstance(exception, NoGPUError):
2349
- logger.warning(f"\tNo GPUs available to train {model.name}... Skipping this model.")
2350
- elif isinstance(exception, NotEnoughCudaMemoryError):
2351
- logger.warning(f"\tNot enough CUDA memory available to train {model.name}... Skipping this model.")
2352
- elif isinstance(exception, ImportError):
2353
- logger.error(f"\tWarning: Exception caused {model.name} to fail during training (ImportError)... Skipping this model.")
2354
- logger.error(f"\t\t{exception}")
2355
- del_model = False
2356
- if self.verbosity > 2:
2357
- logger.exception("Detailed Traceback:")
2358
- else: # all other exceptions
2359
- logger.error(f"\tWarning: Exception caused {model.name} to fail during training... Skipping this model.")
2360
- logger.error(f"\t\t{exception}")
2361
- if self.verbosity > 0:
2362
- logger.exception("Detailed Traceback:")
2363
- crash_time = time.time()
2364
- total_time = crash_time - fit_start_time
2365
- tb = traceback.format_exc()
2366
- model_info = self.get_model_info(model=model)
2367
- self._models_failed_to_train_errors[model.name] = dict(
2368
- exc_type=exception.__class__.__name__,
2369
- exc_str=str(exception),
2370
- exc_traceback=tb,
2371
- model_info=model_info,
2372
- total_time=total_time,
2373
- )
2374
-
2375
- if del_model:
2376
- del model
2377
- else:
2378
- self._add_model(model=model, stack_name=stack_name, level=level, y_pred_proba_val=y_pred_proba_val, is_ray_worker=is_ray_worker)
2379
- model_names_trained.append(model.name)
2380
- if self.low_memory:
2381
- del model
2382
- if exception is not None:
2383
- if self._check_raise_exception(exception=exception, errors=errors, errors_ignore=errors_ignore, errors_raise=errors_raise):
2384
- raise exception
2385
- return model_names_trained
2386
-
2387
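The errors / errors_ignore / errors_raise contract described in the docstring can be summarized as: the explicit lists override the blanket policy. A plausible sketch of that resolution (the real _check_raise_exception is defined elsewhere in the trainer; the exception names are illustrative):

def should_raise(exception, errors="ignore", errors_ignore=None, errors_raise=None):
    name = exception.__class__.__name__
    if errors_raise and name in errors_raise:
        return True  # explicitly escalated
    if errors_ignore and name in errors_ignore:
        return False  # explicitly suppressed
    return errors == "raise"

print(should_raise(ValueError("boom"), errors="ignore", errors_raise=["ValueError"]))  # True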
- # FIXME: v1.0 Move to AbstractModel for most fields
2388
- def _get_model_metadata(self, model: AbstractModel, stack_name: str = "core", level: int = 1) -> dict[str, Any]:
2389
- """
2390
- Returns the model metadata used to initialize a node in the DAG (self.model_graph).
2391
- """
2392
- if isinstance(model, BaggedEnsembleModel):
2393
- type_inner = model._child_type
2394
- else:
2395
- type_inner = type(model)
2396
- num_children = len(model.models) if hasattr(model, "models") else 1
2397
- predict_child_time = model.predict_time / num_children if model.predict_time is not None else None
2398
- predict_1_child_time = model.predict_1_time / num_children if model.predict_1_time is not None else None
2399
- fit_metadata = model.get_fit_metadata()
2400
-
2401
- model_param_aux = getattr(model, "_params_aux_child", model.params_aux)
2402
- model_metadata = dict(
2403
- fit_time=model.fit_time,
2404
- compile_time=model.compile_time,
2405
- predict_time=model.predict_time,
2406
- predict_1_time=model.predict_1_time,
2407
- predict_child_time=predict_child_time,
2408
- predict_1_child_time=predict_1_child_time,
2409
- predict_n_time_per_row=model.predict_n_time_per_row,
2410
- predict_n_size=model.predict_n_size,
2411
- val_score=model.val_score,
2412
- eval_metric=model.eval_metric.name,
2413
- stopping_metric=model.stopping_metric.name,
2414
- path=os.path.relpath(model.path, self.path).split(os.sep), # model's relative path to trainer
2415
- type=type(model), # Outer type, can be BaggedEnsemble, StackEnsemble (Type that is able to load the model)
2416
- type_inner=type_inner, # Inner type, if Ensemble then it is the type of the inner model (May not be able to load with this type)
2417
- can_infer=model.can_infer(),
2418
- can_fit=model.can_fit(),
2419
- is_valid=model.is_valid(),
2420
- stack_name=stack_name,
2421
- level=level,
2422
- num_children=num_children,
2423
- fit_num_cpus=model.fit_num_cpus,
2424
- fit_num_gpus=model.fit_num_gpus,
2425
- fit_num_cpus_child=model.fit_num_cpus_child,
2426
- fit_num_gpus_child=model.fit_num_gpus_child,
2427
- refit_full_requires_gpu=(model.fit_num_gpus_child is not None) and (model.fit_num_gpus_child >= 1) and model._user_params.get("refit_folds", False),
2428
- **fit_metadata,
2429
- )
2430
- return model_metadata
2431
-
2432
- def _add_model(self, model: AbstractModel, stack_name: str = "core", level: int = 1, y_pred_proba_val=None, _is_refit=False, is_distributed_main=False, is_ray_worker: bool = False) -> bool:
2433
- """
2434
- Registers the fit model in the Trainer object. Stores information such as model performance, save path, model type, and more.
2435
- To use a model in Trainer, self._add_model must be called.
2436
- If self.low_memory, then the model object will be deleted after this call. Use Trainer directly to leverage the model further.
2437
-
2438
- Parameters
2439
- ----------
2440
- model : AbstractModel
2441
- Model which has been fit. This model will be registered to the Trainer.
2442
- stack_name : str, default 'core'
2443
- Stack name to assign the model to. This is used for advanced functionality.
2444
- level : int, default 1
2445
- Stack level of the stack name to assign the model to. This is used for advanced functionality.
2446
- The model's name is appended to self.models_level[stack_name][level]
2447
- The model's base_models (if it has any) must all be a lower level than the model.
2448
- is_distributed_main: bool, default = False
2449
- If True, the main process in distributed training is calling this function.
2450
- This is used to avoid redundant logging in distributed training.
2451
-
2452
- Returns
2453
- -------
2454
- boolean, True if model was registered, False if model was found to be invalid and not registered.
2455
- """
2456
- if model.val_score is not None and np.isnan(model.val_score):
2457
- logger.warning(
2458
- f"WARNING: {model.name} has a val_score of {model.val_score} (NaN)! This should never happen. The model will not be saved to avoid instability."
2459
- )
2460
- return False
2461
- # TODO: Add to HPO
2462
-
2463
- node_attributes = self._get_model_metadata(model=model, stack_name=stack_name, level=level)
2464
- if y_pred_proba_val is not None:
2465
- # Cache y_pred_proba_val for later reuse to avoid redundant predict calls
2466
- self._save_model_y_pred_proba_val(model=model.name, y_pred_proba_val=y_pred_proba_val)
2467
- node_attributes["cached_y_pred_proba_val"] = True
2468
-
2469
- self.model_graph.add_node(
2470
- model.name,
2471
- **node_attributes,
2472
- )
2473
- if isinstance(model, StackerEnsembleModel):
2474
- prior_models = self.get_model_names()
2475
- # TODO: raise exception if no base models and level != 1?
2476
- for stack_column_prefix in model.stack_column_prefix_lst:
2477
- base_model_name = model.stack_column_prefix_to_model_map[stack_column_prefix]
2478
- if base_model_name not in prior_models:
2479
- raise AssertionError(
2480
- f"Model '{model.name}' depends on model '{base_model_name}', but '{base_model_name}' is not registered as a trained model! Valid models: {prior_models}"
2481
- )
2482
- elif level <= self.model_graph.nodes[base_model_name]["level"]:
2483
- raise AssertionError(
2484
- f"Model '{model.name}' depends on model '{base_model_name}', but '{base_model_name}' is not in a lower stack level. ('{model.name}' level: {level}, '{base_model_name}' level: {self.model_graph.nodes[base_model_name]['level']})"
2485
- )
2486
- self.model_graph.add_edge(base_model_name, model.name)
2487
- self._log_model_stats(model, _is_refit=_is_refit, is_distributed_main=is_distributed_main, is_ray_worker=is_ray_worker)
2488
- if self.low_memory:
2489
- del model
2490
- return True
2491
-
2492
- def _path_attr_model(self, model: str):
2493
- """Returns directory where attributes are cached"""
2494
- return os.path.join(self._path_attr, model)
2495
-
2496
- def _path_to_model_attr(self, model: str, attribute: str):
2497
- """Returns pkl file path for a cached model attribute"""
2498
- return os.path.join(self._path_attr_model(model), f"{attribute}.pkl")
2499
-
2500
- def _save_model_y_pred_proba_val(self, model: str, y_pred_proba_val):
2501
- """Cache y_pred_proba_val for later reuse to avoid redundant predict calls"""
2502
- save_pkl.save(path=self._path_to_model_attr(model=model, attribute="y_pred_proba_val"), object=y_pred_proba_val)
2503
-
2504
- def _load_model_y_pred_proba_val(self, model: str):
2505
- """Load cached y_pred_proba_val for a given model"""
2506
- return load_pkl.load(path=self._path_to_model_attr(model=model, attribute="y_pred_proba_val"))
2507
-
2508
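The cached-attribute helpers round-trip through the same pkl savers/loaders used elsewhere in the trainer. A sketch of the round trip, assuming autogluon.core is installed; the path below merely illustrates the _path_attr/<model>/<attribute>.pkl layout:

import os

from autogluon.core.utils.loaders import load_pkl
from autogluon.core.utils.savers import save_pkl

path = os.path.join("ag_attrs", "LightGBM_L1", "y_pred_proba_val.pkl")
save_pkl.save(path=path, object=[0.1, 0.9])
print(load_pkl.load(path=path))  # [0.1, 0.9]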
- # TODO: Once Python min-version is 3.8, can refactor to use positional-only argument for model
2509
- # https://peps.python.org/pep-0570/#empowering-library-authors
2510
- # Currently this method cannot accept the attribute key 'model' without making usage ugly.
2511
- def _update_model_attr(self, model: str, **attributes):
2512
- """Updates model node in graph with the input attributes dictionary"""
2513
- if model not in self.model_graph:
2514
- raise AssertionError(f'"{model}" is not a key in self.model_graph, cannot add attributes: {attributes}')
2515
- self.model_graph.nodes[model].update(attributes)
2516
-
2517
- def _log_model_stats(self, model, _is_refit=False, is_distributed_main=False, is_ray_worker: bool = False):
2518
- """Logs model fit time, val score, predict time, and predict_1_time"""
2519
- model = self.load_model(model)
2520
- print_weights = model._get_tags().get("print_weights", False)
2521
-
2522
- is_log_during_distributed_fit = DistributedContext.is_distributed_mode() and (not is_distributed_main)
2523
- if is_ray_worker:
2524
- is_log_during_distributed_fit = True
2525
- log_level = 10 if is_log_during_distributed_fit else 20
2526
-
2527
- if print_weights:
2528
- model_weights = model._get_model_weights()
2529
- model_weights = {k: round(v, 3) for k, v in model_weights.items()}
2530
- msg_weights = ""
2531
- is_first = True
2532
- for key, value in sorted(model_weights.items(), key=lambda x: x[1], reverse=True):
2533
- if not is_first:
2534
- msg_weights += ", "
2535
- msg_weights += f"'{key}': {value}"
2536
- is_first = False
2537
- logger.log(log_level, f"\tEnsemble Weights: {{{msg_weights}}}")
2538
- if model.val_score is not None:
2539
- if model.eval_metric.name != self.eval_metric.name:
2540
- logger.log(log_level, "\tNote: model has different eval_metric than default.")
2541
- if not model.eval_metric.greater_is_better_internal:
2542
- sign_str = "-"
2543
- else:
2544
- sign_str = ""
2545
- logger.log(log_level, f"\t{round(model.val_score, 4)}\t = Validation score ({sign_str}{model.eval_metric.name})")
2546
- if model.fit_time is not None:
2547
- logger.log(log_level, f"\t{round(model.fit_time, 2)}s\t = Training runtime")
2548
- if model.predict_time is not None:
2549
- logger.log(log_level, f"\t{round(model.predict_time, 2)}s\t = Validation runtime")
2550
- predict_n_time_per_row = self.get_model_attribute_full(model=model.name, attribute="predict_n_time_per_row")
2551
- predict_n_size = self.get_model_attribute_full(model=model.name, attribute="predict_n_size", func=min)
2552
- if predict_n_time_per_row is not None and predict_n_size is not None:
2553
- logger.log(
2554
- 15,
2555
- f"\t{round(1/(predict_n_time_per_row if predict_n_time_per_row else np.finfo(np.float16).eps), 1)}"
2556
- f"\t = Inference throughput (rows/s | {int(predict_n_size)} batch size)",
2557
- )
2558
- if model.predict_1_time is not None:
2559
- fit_metadata = model.get_fit_metadata()
2560
- predict_1_batch_size = fit_metadata.get("predict_1_batch_size", None)
2561
- assert predict_1_batch_size is not None, "predict_1_batch_size cannot be None if predict_1_time is not None"
2562
-
2563
- if _is_refit:
2564
- predict_1_time = self.get_model_attribute(model=model.name, attribute="predict_1_child_time")
2565
- predict_1_time_full = self.get_model_attribute_full(model=model.name, attribute="predict_1_child_time")
2566
- else:
2567
- predict_1_time = model.predict_1_time
2568
- predict_1_time_full = self.get_model_attribute_full(model=model.name, attribute="predict_1_time")
2569
-
2570
- predict_1_time_log, time_unit = convert_time_in_s_to_log_friendly(time_in_sec=predict_1_time)
2571
- logger.log(log_level, f"\t{round(predict_1_time_log, 3)}{time_unit}\t = Validation runtime (1 row | {predict_1_batch_size} batch size | MARGINAL)")
2572
-
2573
- predict_1_time_full_log, time_unit = convert_time_in_s_to_log_friendly(time_in_sec=predict_1_time_full)
2574
- logger.log(log_level, f"\t{round(predict_1_time_full_log, 3)}{time_unit}\t = Validation runtime (1 row | {predict_1_batch_size} batch size)")
2575
-
2576
- if not _is_refit:
2577
- predict_1_time_child = self.get_model_attribute(model=model.name, attribute="predict_1_child_time")
2578
- predict_1_time_child_log, time_unit = convert_time_in_s_to_log_friendly(time_in_sec=predict_1_time_child)
2579
- logger.log(
2580
- log_level,
2581
- f"\t{round(predict_1_time_child_log, 3)}{time_unit}\t = Validation runtime (1 row | {predict_1_batch_size} batch size | REFIT | MARGINAL)",
2582
- )
2583
-
2584
- predict_1_time_full_child = self.get_model_attribute_full(model=model.name, attribute="predict_1_child_time")
2585
- predict_1_time_full_child_log, time_unit = convert_time_in_s_to_log_friendly(time_in_sec=predict_1_time_full_child)
2586
- logger.log(
2587
- log_level, f"\t{round(predict_1_time_full_child_log, 3)}{time_unit}\t = Validation runtime (1 row | {predict_1_batch_size} batch size | REFIT)"
2588
- )
2589
-
2590
- # TODO: Split this to avoid confusion, HPO should go elsewhere?
2591
- def _train_single_full(
2592
- self,
2593
- X,
2594
- y,
2595
- model: AbstractModel,
2596
- X_unlabeled=None,
2597
- X_val=None,
2598
- y_val=None,
2599
- X_test=None,
2600
- y_test=None,
2601
- X_pseudo=None,
2602
- y_pseudo=None,
2603
- feature_prune=False,
2604
- hyperparameter_tune_kwargs=None,
2605
- stack_name="core",
2606
- k_fold=None,
2607
- k_fold_start=0,
2608
- k_fold_end=None,
2609
- n_repeats=None,
2610
- n_repeat_start=0,
2611
- level=1,
2612
- time_limit=None,
2613
- fit_kwargs=None,
2614
- compute_score=True,
2615
- total_resources: dict | None = None,
2616
- errors: Literal["ignore", "raise"] = "ignore",
2617
- errors_ignore: list | None = None,
2618
- errors_raise: list | None = None,
2619
- is_ray_worker: bool = False,
2620
- **kwargs,
2621
- ) -> list[str]:
2622
- """
2623
- Trains a model, with the potential to train multiple versions of this model with hyperparameter tuning and feature pruning.
2624
- Returns a list of successfully trained and saved model names.
2625
- Models trained from this method will be accessible in this Trainer.
2626
-
2627
- Parameters
2628
- ----------
2629
- errors: Literal["ignore", "raise"], default = "ignore"
2630
- Determines how model fit exceptions are handled.
2631
- If "ignore", will ignore all model exceptions during fit. If an exception occurs, an empty list is returned.
2632
- If "raise", will raise the model exception if it occurs.
2633
- Can be overwritten by `errors_ignore` and `errors_raise`.
2634
- errors_ignore: list[str], optional
2635
- The exception types specified in `errors_ignore` will be treated as if `errors="ignore"`.
2636
- errors_raise: list[str], optional
2637
- The exception types specified in `errors_raise` will be treated as if `errors="raise"`.
2638
- """
2639
- if self._callback_early_stop:
2640
- return []
2641
- check_callbacks = k_fold_start == 0 and n_repeat_start == 0 and not is_ray_worker
2642
- skip_model = False
2643
- if self.callbacks and check_callbacks:
2644
- skip_model, time_limit = self._callbacks_before_fit(
2645
- model=model,
2646
- time_limit=time_limit,
2647
- stack_name=stack_name,
2648
- level=level,
2649
- )
2650
- if self._callback_early_stop or skip_model:
2651
- return []
2652
-
2653
- model_fit_kwargs = self._get_model_fit_kwargs(
2654
- X=X, X_val=X_val, time_limit=time_limit, k_fold=k_fold, fit_kwargs=fit_kwargs, ens_sample_weight=kwargs.get("ens_sample_weight", None)
2655
- )
2656
- exception = None
2657
- if hyperparameter_tune_kwargs:
2658
- if n_repeat_start != 0:
2659
- raise ValueError(f"n_repeat_start must be 0 to hyperparameter_tune, value = {n_repeat_start}")
2660
- elif k_fold_start != 0:
2661
- raise ValueError(f"k_fold_start must be 0 to hyperparameter_tune, value = {k_fold_start}")
2662
- # hpo_models (dict): keys = model_names, values = model_paths
2663
- fit_log_message = f"Hyperparameter tuning model: {model.name} ..."
2664
- if time_limit is not None:
2665
- if time_limit <= 0:
2666
- logger.log(15, f"Skipping {model.name} due to lack of time remaining.")
2667
- return []
2668
- fit_start_time = time.time()
2669
- if self._time_limit is not None and self._time_train_start is not None:
2670
- time_left_total = self._time_limit - (fit_start_time - self._time_train_start)
2671
- else:
2672
- time_left_total = time_limit
2673
- fit_log_message += f" Tuning model for up to {round(time_limit, 2)}s of the {round(time_left_total, 2)}s of remaining time."
2674
- logger.log(20, fit_log_message)
2675
- try:
2676
- if isinstance(model, BaggedEnsembleModel):
2677
- bagged_model_fit_kwargs = self._get_bagged_model_fit_kwargs(
2678
- k_fold=k_fold, k_fold_start=k_fold_start, k_fold_end=k_fold_end, n_repeats=n_repeats, n_repeat_start=n_repeat_start
2679
- )
2680
- model_fit_kwargs.update(bagged_model_fit_kwargs)
2681
- hpo_models, hpo_results = model.hyperparameter_tune(
2682
- X=X,
2683
- y=y,
2684
- model=model,
2685
- X_val=X_val,
2686
- y_val=y_val,
2687
- X_unlabeled=X_unlabeled,
2688
- stack_name=stack_name,
2689
- level=level,
2690
- compute_score=compute_score,
2691
- hyperparameter_tune_kwargs=hyperparameter_tune_kwargs,
2692
- total_resources=total_resources,
2693
- **model_fit_kwargs,
2694
- )
2695
- else:
2696
- hpo_models, hpo_results = model.hyperparameter_tune(
2697
- X=X,
2698
- y=y,
2699
- X_val=X_val,
2700
- y_val=y_val,
2701
- hyperparameter_tune_kwargs=hyperparameter_tune_kwargs,
2702
- total_resources=total_resources,
2703
- **model_fit_kwargs,
2704
- )
2705
- if len(hpo_models) == 0:
2706
- logger.warning(f"No model was trained during hyperparameter tuning for {model.name}... Skipping this model.")
2707
- except Exception as exc:
2708
- exception = exc # required to provide exc outside of `except` statement
2709
- if isinstance(exception, NoStackFeatures):
2710
- logger.warning(f"\tNo stack features to train {model.name}... Skipping this model. {exception}")
2711
- elif isinstance(exception, NotValidStacker):
2712
- logger.warning(f"\tStacking disabled for {model.name}... Skipping this model. {exception}")
2713
- elif isinstance(exception, NoValidFeatures):
2714
- logger.warning(f"\tNo valid features to train {model.name}... Skipping this model.")
2715
- else:
2716
- logger.exception(f"Warning: Exception caused {model.name} to fail during hyperparameter tuning... Skipping this model.")
2717
- logger.warning(exception)
2718
- del model
2719
- model_names_trained = []
2720
- else:
2721
- # Commented out because it takes too much space (>>5 GB if run for an hour on a small-medium sized dataset)
2722
- # self.hpo_results[model.name] = hpo_results
2723
- model_names_trained = []
2724
- self._extra_banned_names.add(model.name)
2725
- for model_hpo_name, model_info in hpo_models.items():
2726
- model_hpo = self.load_model(model_hpo_name, path=os.path.relpath(model_info["path"], self.path), model_type=type(model))
2727
- logger.log(20, f"Fitted model: {model_hpo.name} ...")
2728
- if self._add_model(model=model_hpo, stack_name=stack_name, level=level):
2729
- model_names_trained.append(model_hpo.name)
2730
- else:
2731
- model_fit_kwargs.update(dict(X_pseudo=X_pseudo, y_pseudo=y_pseudo))
2732
- if isinstance(model, BaggedEnsembleModel):
2733
- bagged_model_fit_kwargs = self._get_bagged_model_fit_kwargs(
2734
- k_fold=k_fold, k_fold_start=k_fold_start, k_fold_end=k_fold_end, n_repeats=n_repeats, n_repeat_start=n_repeat_start
2735
- )
2736
- model_fit_kwargs.update(bagged_model_fit_kwargs)
2737
- model_names_trained = self._train_and_save(
2738
- X=X,
2739
- y=y,
2740
- model=model,
2741
- X_val=X_val,
2742
- y_val=y_val,
2743
- X_test=X_test,
2744
- y_test=y_test,
2745
- X_unlabeled=X_unlabeled,
2746
- stack_name=stack_name,
2747
- level=level,
2748
- compute_score=compute_score,
2749
- total_resources=total_resources,
2750
- errors=errors,
2751
- errors_ignore=errors_ignore,
2752
- errors_raise=errors_raise,
2753
- is_ray_worker=is_ray_worker,
2754
- **model_fit_kwargs,
2755
- )
2756
- if self.callbacks and check_callbacks:
2757
- self._callbacks_after_fit(model_names=model_names_trained, stack_name=stack_name, level=level)
2758
- self.save()
2759
- if exception is not None:
2760
- if self._check_raise_exception(exception=exception, errors=errors, errors_ignore=errors_ignore, errors_raise=errors_raise):
2761
- raise exception
2762
- return model_names_trained
2763
-
2764
- # TODO: Move to a utility function outside of AbstractTabularTrainer
2765
- @staticmethod
2766
- def _check_raise_exception(
2767
- exception: Exception,
2768
- errors: Literal["ignore", "raise"] = "ignore",
2769
- errors_ignore: list | None = None,
2770
- errors_raise: list | None = None,
2771
- ) -> bool:
2772
- """
2773
- Check if an exception should be raised based on the provided error handling logic.
2774
-
2775
- Parameters
2776
- ----------
2777
- exception: Exception
2778
- The exception to check
2779
- errors: Literal["ignore", "raise"], default = "ignore"
2780
- Determines how exceptions are handled.
2781
- If "ignore", will return False.
2782
- If "raise", will return True.
2783
- Can be overwritten by `errors_ignore` and `errors_raise`.
2784
- errors_ignore: list[str], optional
2785
- The exception types specified in `errors_ignore` will be treated as if `errors="ignore"`.
2786
- errors_raise: list[str], optional
2787
- The exception types specified in `errors_raise` will be treated as if `errors="raise"`.
2788
-
2789
- Returns
2790
- -------
2791
- raise_exception: bool
2792
- If True, indicates that the exception should be raised based on the provided error handling rules.
2793
- """
2794
- raise_exception = None
2795
- if errors_raise is not None:
2796
- for err_type in errors_raise:
2797
- if isinstance(exception, err_type):
2798
- raise_exception = True
2799
- break
2800
- if errors_ignore is not None and raise_exception is None:
2801
- for err_type in errors_ignore:
2802
- if isinstance(exception, err_type):
2803
- raise_exception = False
2804
- break
2805
- if raise_exception is None:
2806
- if errors == "ignore":
2807
- raise_exception = False
2808
- elif errors == "raise":
2809
- raise_exception = True
2810
- else:
2811
- raise ValueError(f"Invalid `errors` value: {errors} (valid values: ['ignore', 'raise'])")
2812
- return raise_exception
2813
-
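The resolution order above is: `errors_raise` first, then `errors_ignore`, then the blanket `errors` mode. A self-contained sketch of that precedence, using a builtin exception as a stand-in for the autogluon exception types:

    def check_raise_exception(exception, errors="ignore",
                              errors_ignore=None, errors_raise=None) -> bool:
        # Same precedence as the staticmethod above:
        # errors_raise > errors_ignore > blanket errors mode.
        # (The real method also validates the `errors` value.)
        if errors_raise and any(isinstance(exception, t) for t in errors_raise):
            return True
        if errors_ignore and any(isinstance(exception, t) for t in errors_ignore):
            return False
        return errors == "raise"

    # A per-type raise rule overrides the blanket "ignore" mode:
    assert check_raise_exception(TimeoutError(), errors="ignore",
                                 errors_raise=[TimeoutError]) is True
    # With no per-type rule, the blanket mode decides:
    assert check_raise_exception(TimeoutError(), errors="ignore") is False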
2814
- def _callbacks_before_fit(
2815
- self,
2816
- *,
2817
- model: AbstractModel,
2818
- time_limit: float | None,
2819
- stack_name: str,
2820
- level: int,
2821
- ):
2822
- skip_model = False
2823
- ts = time.time()
2824
- for callback in self.callbacks:
2825
- callback_early_stop, callback_skip_model = callback.before_model_fit(
2826
- trainer=self,
2827
- model=model,
2828
- time_limit=time_limit,
2829
- stack_name=stack_name,
2830
- level=level,
2831
- )
2832
- if callback_early_stop:
2833
- self._callback_early_stop = True
2834
- if callback_skip_model:
2835
- skip_model = True
2836
- if time_limit is not None:
2837
- te = time.time()
2838
- time_limit -= te - ts
2839
- ts = te
2840
- return skip_model, time_limit
2841
-
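Each callback can veto a single model (`skip_model`) or halt all remaining training (`early_stop`), and the time spent inside callbacks is deducted from the model's time budget. A minimal callback satisfying the `before_model_fit`/`after_model_fit` contract implied by these call sites (illustrative only; the real base class is `AbstractCallback`):

    class SkipWhenLowOnTimeCallback:
        """Illustrative callback: skip any model once under 10s of budget."""

        def before_model_fit(self, trainer, model, time_limit, stack_name, level):
            early_stop = False  # True would stop all remaining training
            skip_model = time_limit is not None and time_limit < 10
            return early_stop, skip_model

        def after_model_fit(self, trainer, model_names, stack_name, level):
            return False  # no early stop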
2842
- def _callbacks_after_fit(
2843
- self,
2844
- *,
2845
- model_names: list[str],
2846
- stack_name: str,
2847
- level: int,
2848
- ):
2849
- for callback in self.callbacks:
2850
- callback_early_stop = callback.after_model_fit(
2851
- self,
2852
- model_names=model_names,
2853
- stack_name=stack_name,
2854
- level=level,
2855
- )
2856
- if callback_early_stop:
2857
- self._callback_early_stop = True
2858
-
2859
- # TODO: How to deal with models that fail during this? They have trained valid models before, but should we still use those models or remove the entire model? Currently we still use models.
2860
- # TODO: Time allowance can be made better by only using time taken during final model training and not during HPO and feature pruning.
2861
- # TODO: Time allowance not accurate if running from fit_continue
2862
- # TODO: Remove level and stack_name arguments, can get them automatically
2863
- # TODO: Make sure that pretraining on X_unlabeled only happens 1 time rather than every fold of bagging. (Do during pretrain API work?)
2864
- def _train_multi_repeats(self, X, y, models: list, n_repeats, n_repeat_start=1, time_limit=None, time_limit_total_level=None, **kwargs) -> list[str]:
2865
- """
2866
- Fits bagged ensemble models with additional folds and/or bagged repeats.
2867
- Models must have already been fit prior to entering this method.
2868
- This method should only be called in self._train_multi
2869
- Returns a list of successfully trained and saved model names.
2870
- """
2871
- if time_limit_total_level is None:
2872
- time_limit_total_level = time_limit
2873
- models_valid = models
2874
- models_valid_next = []
2875
- repeats_completed = 0
2876
- time_start = time.time()
2877
- for n in range(n_repeat_start, n_repeats):
2878
- if not models_valid:
2879
- break # No models to repeat
2880
- if time_limit is not None:
2881
- time_start_repeat = time.time()
2882
- time_left = time_limit - (time_start_repeat - time_start)
2883
- if n == n_repeat_start:
2884
- time_required = time_limit_total_level * 0.575 # Require slightly over 50% to be safe
2885
- else:
2886
- time_required = (time_start_repeat - time_start) / repeats_completed * (0.575 / 0.425)
2887
- if time_left < time_required:
2888
- logger.log(15, "Not enough time left to finish repeated k-fold bagging, stopping early ...")
2889
- break
2890
- logger.log(20, f"Repeating k-fold bagging: {n+1}/{n_repeats}")
2891
- for i, model in enumerate(models_valid):
2892
- if self._callback_early_stop:
2893
- break
2894
- if not self.get_model_attribute(model=model, attribute="can_fit"):
2895
- if isinstance(model, str):
2896
- models_valid_next.append(model)
2897
- else:
2898
- models_valid_next.append(model.name)
2899
- continue
2900
-
2901
- if isinstance(model, str):
2902
- model = self.load_model(model)
2903
- if not isinstance(model, BaggedEnsembleModel):
2904
- raise AssertionError(
2905
- f"{model.name} must inherit from BaggedEnsembleModel to perform repeated k-fold bagging. Model type: {type(model).__name__}"
2906
- )
2907
- if time_limit is None:
2908
- time_left = None
2909
- else:
2910
- time_start_model = time.time()
2911
- time_left = time_limit - (time_start_model - time_start)
2912
-
2913
- models_valid_next += self._train_single_full(
2914
- X=X, y=y, model=model, k_fold_start=0, k_fold_end=None, n_repeats=n + 1, n_repeat_start=n, time_limit=time_left, **kwargs
2915
- )
2916
- models_valid = copy.deepcopy(models_valid_next)
2917
- models_valid_next = []
2918
- repeats_completed += 1
2919
- logger.log(20, f"Completed {n_repeat_start + repeats_completed}/{n_repeats} k-fold bagging repeats ...")
2920
- return models_valid
2921
-
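The early-stopping heuristic in this loop reserves just over half the level's budget (factor 0.575) before starting the first extra repeat, and afterwards extrapolates from the observed per-repeat cost with a 0.575/0.425 safety ratio. A worked sketch with assumed numbers:

    time_limit_total_level = 1000.0  # assumed budget for this stack level (s)
    elapsed = 300.0                  # assumed time already spent on repeats (s)
    repeats_completed = 2

    time_left = time_limit_total_level - elapsed                    # 700.0
    # First extra repeat: require slightly over half the level budget.
    required_first = time_limit_total_level * 0.575                 # 575.0
    # Later repeats: observed cost per repeat, scaled by a safety ratio.
    required_later = elapsed / repeats_completed * (0.575 / 0.425)  # ~202.9
    print(time_left >= required_later)  # True -> keep repeating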
2922
- def _train_multi_initial(
2923
- self, X, y, models: list[AbstractModel], k_fold, n_repeats, hyperparameter_tune_kwargs=None, time_limit=None, feature_prune_kwargs=None, **kwargs
2924
- ):
2925
- """
2926
- Fits models that have not previously been fit.
2927
- This method should only be called in self._train_multi
2928
- Returns a list of successfully trained and saved model names.
2929
- """
2930
- multi_fold_time_start = time.time()
2931
- fit_args = dict(
2932
- X=X,
2933
- y=y,
2934
- k_fold=k_fold,
2935
- )
2936
- fit_args.update(kwargs)
2937
-
2938
- hpo_enabled = False
2939
- if hyperparameter_tune_kwargs:
2940
- for key in hyperparameter_tune_kwargs:
2941
- if hyperparameter_tune_kwargs[key] is not None:
2942
- hpo_enabled = True
2943
- break
2944
-
2945
- hpo_time_ratio = 0.9
2946
- if hpo_enabled:
2947
- time_split = True
2948
- else:
2949
- time_split = False
2950
- k_fold_start = 0
2951
- bagged = k_fold > 0
2952
- if not bagged:
2953
- time_ratio = hpo_time_ratio if hpo_enabled else 1
2954
- models = self._train_multi_fold(
2955
- models=models,
2956
- hyperparameter_tune_kwargs=hyperparameter_tune_kwargs,
2957
- time_limit=time_limit,
2958
- time_split=time_split,
2959
- time_ratio=time_ratio,
2960
- **fit_args,
2961
- )
2962
- else:
2963
- time_ratio = hpo_time_ratio if hpo_enabled else 1
2964
- models = self._train_multi_fold(
2965
- models=models,
2966
- hyperparameter_tune_kwargs=hyperparameter_tune_kwargs,
2967
- k_fold_start=0,
2968
- k_fold_end=k_fold,
2969
- n_repeats=n_repeats,
2970
- n_repeat_start=0,
2971
- time_limit=time_limit,
2972
- time_split=time_split,
2973
- time_ratio=time_ratio,
2974
- **fit_args,
2975
- )
2976
-
2977
- multi_fold_time_elapsed = time.time() - multi_fold_time_start
2978
- if time_limit is not None:
2979
- time_limit = time_limit - multi_fold_time_elapsed
2980
-
2981
- if feature_prune_kwargs is not None and len(models) > 0:
2982
- feature_prune_time_start = time.time()
2983
- model_fit_kwargs = self._get_model_fit_kwargs(
2984
- X=X,
2985
- X_val=kwargs.get("X_val", None),
2986
- time_limit=None,
2987
- k_fold=k_fold,
2988
- fit_kwargs=kwargs.get("fit_kwargs", {}),
2989
- ens_sample_weight=kwargs.get("ens_sample_weight"),
2990
- )
2991
- model_fit_kwargs.update(dict(X=X, y=y, X_val=kwargs.get("X_val", None), y_val=kwargs.get("y_val", None)))
2992
- if bagged:
2993
- bagged_model_fit_kwargs = self._get_bagged_model_fit_kwargs(
2994
- k_fold=k_fold, k_fold_start=k_fold_start, k_fold_end=k_fold, n_repeats=n_repeats, n_repeat_start=0
2995
- )
2996
- model_fit_kwargs.update(bagged_model_fit_kwargs)
2997
-
2998
- # FIXME: v1.3: X.columns incorrectly includes sample_weight column
2999
- # FIXME: v1.3: Move sample_weight logic into fit_stack_core level methods, currently we are editing X too many times in self._get_model_fit_kwargs
3000
- candidate_features = self._proxy_model_feature_prune(
3001
- time_limit=time_limit,
3002
- layer_fit_time=multi_fold_time_elapsed,
3003
- level=kwargs["level"],
3004
- features=X.columns.tolist(),
3005
- model_fit_kwargs=model_fit_kwargs,
3006
- **feature_prune_kwargs,
3007
- )
3008
- if time_limit is not None:
3009
- time_limit = time_limit - (time.time() - feature_prune_time_start)
3010
-
3011
- fit_args["X"] = X[candidate_features]
3012
- fit_args["X_val"] = kwargs["X_val"][candidate_features] if isinstance(kwargs.get("X_val", None), pd.DataFrame) else kwargs.get("X_val", None)
3013
-
3014
- if len(candidate_features) < len(X.columns):
3015
- unfit_models = []
3016
- original_prune_map = {}
3017
- for model in models:
3018
- unfit_model = self.load_model(model).convert_to_template()
3019
- unfit_model.rename(f"{unfit_model.name}_Prune")
3020
- unfit_models.append(unfit_model)
3021
- original_prune_map[unfit_model.name] = model
3022
- pruned_models = self._train_multi_fold(
3023
- models=unfit_models,
3024
- hyperparameter_tune_kwargs=None,
3025
- k_fold_start=k_fold_start,
3026
- k_fold_end=k_fold,
3027
- n_repeats=n_repeats,
3028
- n_repeat_start=0,
3029
- time_limit=time_limit,
3030
- **fit_args,
3031
- )
3032
- force_prune = feature_prune_kwargs.get("force_prune", False)
3033
- models = self._retain_better_pruned_models(pruned_models=pruned_models, original_prune_map=original_prune_map, force_prune=force_prune)
3034
- return models
3035
-
3036
- # TODO: Ban KNN from being a Stacker model outside of aux. Will need to ensemble select on all stack layers ensemble selector to make it work
3037
- # TODO: Robert dataset, LightGBM is super good but RF and KNN take all the time away from it on 1h despite being much worse
3038
- # TODO: Add time_limit_per_model
3039
- # TODO: Rename for v0.1
3040
- def _train_multi_fold(
3041
- self,
3042
- X: pd.DataFrame,
3043
- y: pd.Series,
3044
- models: list[AbstractModel],
3045
- time_limit: float | None = None,
3046
- time_split: bool = False,
3047
- time_ratio: float = 1,
3048
- hyperparameter_tune_kwargs: dict | None = None,
3049
- fit_strategy: Literal["sequential", "parallel"] = "sequential",
3050
- **kwargs,
3051
- ) -> list[str]:
3052
- """
3053
- Trains and saves a list of models, sequentially or in parallel depending on fit_strategy.
3054
- This method should only be called in self._train_multi_initial
3055
- Returns a list of trained model names.
3056
- """
3057
- time_start = time.time()
3058
- if time_limit is not None:
3059
- time_limit = time_limit * time_ratio
3060
- if time_limit is not None and len(models) > 0:
3061
- time_limit_model_split = time_limit / len(models)
3062
- else:
3063
- time_limit_model_split = time_limit
3064
-
3065
- if fit_strategy == "parallel" and hyperparameter_tune_kwargs is not None and hyperparameter_tune_kwargs:
3066
- for k, v in hyperparameter_tune_kwargs.items():
3067
- if v is not None and (not isinstance(v, dict) or len(v) != 0):
3068
- logger.log(
3069
- 30,
3070
- f"WARNING: fit_strategy='parallel', but `hyperparameter_tune_kwargs` is specified for model '{k}' with value {v}. "
3071
- f"Hyperparameter tuning does not yet support `parallel` fit_strategy. "
3072
- f"Falling back to fit_strategy='sequential' ... "
3073
- )
3074
- fit_strategy = "sequential"
3075
- break
3076
- if fit_strategy == "parallel":
3077
- num_cpus = kwargs.get("total_resources", {}).get("num_cpus", "auto")
3078
- if isinstance(num_cpus, str) and num_cpus == "auto":
3079
- num_cpus = get_resource_manager().get_cpu_count_psutil()
3080
- if num_cpus < 12:
3081
- force_parallel = os.environ.get("AG_FORCE_PARALLEL", "False") == "True"
3082
- if not force_parallel:
3083
- logger.log(
3084
- 30,
3085
- f"Note: fit_strategy='parallel', but `num_cpus={num_cpus}`. "
3086
- f"Running parallel mode with fewer than 12 CPUs is not recommended and has been disabled. "
3087
- f'You can override this by specifying `os.environ["AG_FORCE_PARALLEL"] = "True"`. '
3088
- f"Falling back to fit_strategy='sequential' ..."
3089
- )
3090
- fit_strategy = "sequential"
3091
- if fit_strategy == "parallel":
3092
- num_gpus = kwargs.get("total_resources", {}).get("num_gpus", 0)
3093
- if isinstance(num_gpus, str) and num_gpus == "auto":
3094
- num_gpus = get_resource_manager().get_gpu_count()
3095
- if isinstance(num_gpus, (float, int)) and num_gpus > 0:
3096
- logger.log(
3097
- 30,
3098
- f"WARNING: fit_strategy='parallel', but `num_gpus={num_gpus}` is specified. "
3099
- f"GPU is not yet supported for `parallel` fit_strategy. To enable parallel, ensure you specify `num_gpus=0` in the fit call. "
3100
- f"Falling back to fit_strategy='sequential' ... "
3101
- )
3102
- fit_strategy = "sequential"
3103
- if fit_strategy == "parallel":
3104
- try:
3105
- try_import_ray()
3106
- except Exception as e:
3107
- logger.log(
3108
- 30,
3109
- f"WARNING: Exception encountered when trying to import ray (fit_strategy='parallel'). "
3110
- f"ray is required for 'parallel' fit_strategy. Falling back to fit_strategy='sequential' ... "
3111
- f"\n\tException details: {e.__class__.__name__}: {e}"
3112
- )
3113
- fit_strategy = "sequential"
3114
-
3115
- if fit_strategy == "sequential":
3116
- models_valid = []
3117
- for model in models:
3118
- if self._callback_early_stop:
3119
- return models_valid
3120
-
3121
- models_valid += _detached_train_multi_fold(
3122
- _self=self,
3123
- model=model,
3124
- X=X,
3125
- y=y,
3126
- time_start=time_start,
3127
- time_split=time_split,
3128
- time_limit=time_limit,
3129
- time_limit_model_split=time_limit_model_split,
3130
- hyperparameter_tune_kwargs=hyperparameter_tune_kwargs,
3131
- is_ray_worker=False,
3132
- kwargs=kwargs,
3133
- )
3134
- elif fit_strategy == "parallel":
3135
- models_valid = self._train_multi_fold_parallel(
3136
- X=X,
3137
- y=y,
3138
- models=models,
3139
- time_start=time_start,
3140
- time_limit_model_split=time_limit_model_split,
3141
- time_limit=time_limit,
3142
- time_split=time_split,
3143
- hyperparameter_tune_kwargs=hyperparameter_tune_kwargs,
3144
- **kwargs,
3145
- )
3146
- else:
3147
- raise ValueError(f"Invalid value for fit_strategy: '{fit_strategy}'")
3148
- return models_valid
3149
-
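The parallel path is disabled on machines with fewer than 12 CPUs unless explicitly forced. The escape hatch named in the warning above is an environment variable, set before the fit call:

    import os

    # Opt back in to fit_strategy="parallel" on a small machine.
    os.environ["AG_FORCE_PARALLEL"] = "True"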
3150
- def _train_multi_fold_parallel(
3151
- self,
3152
- X: pd.DataFrame,
3153
- y: pd.Series,
3154
- models: list[AbstractModel],
3155
- time_start: float,
3156
- time_limit_model_split: float | None,
3157
- time_limit: float | None = None,
3158
- time_split: bool = False,
3159
- hyperparameter_tune_kwargs: dict | None = None,
3160
- **kwargs,
3161
- ) -> list[str]:
3162
- # -- Parallel or Distributed training
3163
- ray = try_import_ray()
3164
-
3165
- # FIXME: Need a common utility class for initializing ray so we don't duplicate code
3166
- if not ray.is_initialized():
3167
- ray.init(log_to_driver=False, logging_level=logging.ERROR)
3168
-
3169
- models_valid = []
3170
-
3171
- if time_limit is not None:
3172
- # Give models less than the full time limit to account for overheads (predict, cache, ray, etc.)
3173
- time_limit_models = time_limit * 0.9
3174
- else:
3175
- time_limit_models = None
3176
-
3177
- logger.log(20, "Scheduling parallel model-workers for training...")
3178
- distributed_manager = ParallelFitManager(
3179
- mode="fit",
3180
- X=X, # FIXME: REMOVE
3181
- y=y, # FIXME: REMOVE
3182
- func=_remote_train_multi_fold,
3183
- func_kwargs=dict(
3184
- time_split=time_split,
3185
- time_limit_model_split=time_limit_model_split,
3186
- time_limit=time_limit_models,
3187
- time_start=time_start,
3188
- errors="raise",
3189
- ),
3190
- func_put_kwargs=dict(
3191
- _self=self,
3192
- X=X,
3193
- y=y,
3194
- hyperparameter_tune_kwargs=hyperparameter_tune_kwargs,
3195
- kwargs=kwargs,
3196
- ),
3197
- num_cpus=kwargs.get("total_resources", {}).get("num_cpus", 1),
3198
- num_gpus=kwargs.get("total_resources", {}).get("num_gpus", 0),
3199
- num_splits=kwargs.get("k_fold", 1) * kwargs.get("n_repeats", 1),
3200
- problem_type=self.problem_type, # FIXME: Should this be passed here?
3201
- num_classes=self.num_classes, # FIXME: Should this be passed here?
3202
- )
3203
- jobs_finished = 0
3204
- jobs_total = len(models)
3205
-
3206
- ordered_model_names = [m.name for m in models] # Use to ensure same model order is returned
3207
- expected_model_names = set(ordered_model_names)
3208
- unfinished_job_refs = distributed_manager.schedule_jobs(models_to_fit=models)
3209
-
3210
- timeout = None
3211
-
3212
- if time_limit is not None:
3213
- # allow between 5 and 60 seconds of overhead before force-killing jobs, to give some leniency to jobs with extra overhead.
3214
- time_overhead = min(max(time_limit * 0.01, 5), 60)
3215
- min_time_required_base = min(self._time_limit * 0.01, 10) # This is checked in the worker thread, will skip if not satisfied
3216
- # If time remaining is less than min_time_required, avoid scheduling new jobs and only wait for existing ones to finish.
3217
- min_time_required = min_time_required_base * 1.5 + 1 # Add 50% buffer and 1 second to account for ray overhead
3218
- else:
3219
- time_overhead = None
3220
- min_time_required = None
3221
-
3222
- can_schedule_jobs = True
3223
- while unfinished_job_refs:
3224
- if time_limit is not None:
3225
- time_left = time_limit - (time.time() - time_start)
3226
- timeout = int(time_left + time_overhead) # include overhead.
3227
- if timeout <= 0:
3228
- logger.log(20, "Ran into timeout while waiting for model training to finish. Stopping now.")
3229
- break
3230
- finished, unfinished_job_refs = ray.wait(unfinished_job_refs, num_returns=1, timeout=timeout)
3231
-
3232
- if not finished:
3233
- logger.log(20, "Ran into timeout while waiting for model training to finish. Stopping now.")
3234
- break
3235
-
3236
- distributed_manager.deallocate_resources(job_ref=finished[0])
3237
- model_name, model_path, model_type, exc, model_failure_info = ray.get(finished[0])
3238
- assert model_name in expected_model_names, (f"Unexpected model name returned during parallel fit: {model_name}\n"
3239
- f"Valid Names: {expected_model_names}\n"
3240
- f"This should never happen. Please create a GitHub Issue.")
3241
- jobs_finished += 1
3242
-
3243
- if exc is not None or model_path is None:
3244
- if exc is None:
3245
- if model_failure_info is not None:
3246
- exc_type = model_failure_info["exc_type"]
3247
- exc_str = model_failure_info["exc_str"]
3248
- else:
3249
- exc_type = None
3250
- exc_str = None
3251
- else:
3252
- exc_type = exc.__class__
3253
- exc_str = str(exc)
3254
- if exc_type is not None:
3255
- extra_log = f": {exc_type.__name__}: {exc_str}"
3256
- else:
3257
- extra_log = ""
3258
- if exc_type is not None and issubclass(exc_type, InsufficientTime):
3259
- logger.log(20, exc_str)
3260
- else:
3261
- logger.log(20, f"Skipping {model_name if isinstance(model_name, str) else model_name.name} due to exception{extra_log}")
3262
- if model_failure_info is not None:
3263
- self._models_failed_to_train_errors[model_name] = model_failure_info
3264
- else:
3265
- logger.log(20, f"Fitted {model_name}:")
3266
-
3267
- # TODO: figure out a way to avoid calling _add_model in the worker-process to save overhead time.
3268
- # - Right now, we need to call it within _add_model to be able to pass the model path to the main process without changing
3269
- # the return signature of _train_single_full. This can be a lot of work to change.
3270
- # TODO: determine if y_pred_proba_val was cached in the worker-process. Right now, we re-do predictions for holdout data.
3271
- # Self object is not permanently mutated during worker execution, so we need to add model to the "main" self (again).
3272
- # This is the synchronization point between the distributed and main processes.
3273
- if self._add_model(
3274
- model_type.load(path=os.path.join(self.path, model_path), reset_paths=self.reset_paths),
3275
- stack_name=kwargs["stack_name"],
3276
- level=kwargs["level"]
3277
- ):
3278
- jobs_running = len(unfinished_job_refs)
3279
- if can_schedule_jobs:
3280
- remaining_task_word = "pending"
3281
- else:
3282
- remaining_task_word = "skipped"
3283
- parallel_status_log = (
3284
- f"\tJobs: {jobs_running} running, "
3285
- f"{jobs_total - (jobs_finished + jobs_running)} {remaining_task_word}, "
3286
- f"{jobs_finished}/{jobs_total} finished"
3287
- )
3288
- if time_limit is not None:
3289
- time_left = time_limit - (time.time() - time_start)
3290
- parallel_status_log += f" | {time_left:.0f}s remaining"
3291
- logger.log(20, parallel_status_log)
3292
- models_valid.append(model_name)
3293
- else:
3294
- logger.log(40, f"Failed to add {model_name} to model graph. This should never happen. Please create a GitHub issue.")
3295
-
3296
- if not unfinished_job_refs and not distributed_manager.models_to_schedule:
3297
- # Completed all jobs
3298
- break
3299
-
3300
- # TODO: look into what this does / how this works for distributed training
3301
- if self._callback_early_stop:
3302
- logger.log(20, "Callback triggered in parallel setting. Stopping model training and cancelling remaining jobs.")
3303
- break
3304
-
3305
- # Stop due to time limit after adding model
3306
- if time_limit is not None:
3307
- time_elapsed = time.time() - time_start
3308
- time_left = time_limit - time_elapsed
3309
- time_left_models = time_limit_models - time_elapsed
3310
- if (time_left + time_overhead) <= 0:
3311
- logger.log(20, "Time limit reached for this stacking layer. Stopping model training and cancelling remaining jobs.")
3312
- break
3313
- elif time_left_models < min_time_required:
3314
- if can_schedule_jobs:
3315
- if len(distributed_manager.models_to_schedule) > 0:
3316
- logger.log(
3317
- 20,
3318
- f"Low on time, skipping {len(distributed_manager.models_to_schedule)} "
3319
- f"pending jobs and waiting for running jobs to finish... ({time_left:.0f}s remaining time)"
3320
- )
3321
- can_schedule_jobs = False
3322
-
3323
- if can_schedule_jobs:
3324
- # Re-schedule jobs
3325
- unfinished_job_refs += distributed_manager.schedule_jobs()
3326
-
3327
- distributed_manager.clean_up_ray(unfinished_job_refs=unfinished_job_refs)
3328
- logger.log(20, "Finished all parallel work for this stacking layer.")
3329
-
3330
- models_valid = set(models_valid)
3331
- models_valid = [m for m in ordered_model_names if m in models_valid] # maintain original order
3332
-
3333
- return models_valid
3334
-
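The scheduling loop above follows the standard `ray.wait` pattern: block for one finished job at a time, harvest its result, then decide whether to top up with more work. A stripped-down sketch of that pattern, assuming ray is installed (the remote payload is a placeholder for model fitting):

    import time
    import ray

    @ray.remote
    def fit_one(name: str) -> str:
        time.sleep(0.1)  # stand-in for fitting a model
        return name

    ray.init(ignore_reinit_error=True, log_to_driver=False)
    pending = [fit_one.remote(n) for n in ["m1", "m2", "m3"]]
    finished = []
    while pending:
        done, pending = ray.wait(pending, num_returns=1, timeout=30)
        if not done:
            break  # timed out: stop waiting, like the loop above
        finished.append(ray.get(done[0]))
    ray.shutdown()
    print(finished)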
3335
- def _train_multi(
3336
- self,
3337
- X,
3338
- y,
3339
- models: list[AbstractModel],
3340
- hyperparameter_tune_kwargs=None,
3341
- feature_prune_kwargs=None,
3342
- k_fold=None,
3343
- n_repeats=None,
3344
- n_repeat_start=0,
3345
- time_limit=None,
3346
- delay_bag_sets: bool = False,
3347
- **kwargs,
3348
- ) -> list[str]:
3349
- """
3350
- Train a list of models using the same data.
3351
- Assumes that input data has already been processed in the form the models will receive as input (including stack feature generation).
3352
- Trained models are available in the trainer object.
3353
- Note: Consider using public APIs instead of this.
3354
- Returns a list of trained model names.
3355
- """
3356
- time_limit_total_level = time_limit
3357
- if k_fold is None:
3358
- k_fold = self.k_fold
3359
- if n_repeats is None:
3360
- n_repeats = self.n_repeats
3361
- if (k_fold == 0) and (n_repeats != 1):
3362
- raise ValueError(f"n_repeats must be 1 when k_fold is 0, values: ({n_repeats}, {k_fold})")
3363
- if (time_limit is None and feature_prune_kwargs is None) or (not delay_bag_sets):
3364
- n_repeats_initial = n_repeats
3365
- else:
3366
- n_repeats_initial = 1
3367
- if n_repeat_start == 0:
3368
- time_start = time.time()
3369
- model_names_trained = self._train_multi_initial(
3370
- X=X,
3371
- y=y,
3372
- models=models,
3373
- k_fold=k_fold,
3374
- n_repeats=n_repeats_initial,
3375
- hyperparameter_tune_kwargs=hyperparameter_tune_kwargs,
3376
- feature_prune_kwargs=feature_prune_kwargs,
3377
- time_limit=time_limit,
3378
- **kwargs,
3379
- )
3380
- n_repeat_start = n_repeats_initial
3381
- if time_limit is not None:
3382
- time_limit = time_limit - (time.time() - time_start)
3383
- else:
3384
- model_names_trained = models
3385
- if (n_repeats > 1) and (n_repeat_start < n_repeats):
3386
- model_names_trained = self._train_multi_repeats(
3387
- X=X,
3388
- y=y,
3389
- models=model_names_trained,
3390
- k_fold=k_fold,
3391
- n_repeats=n_repeats,
3392
- n_repeat_start=n_repeat_start,
3393
- time_limit=time_limit,
3394
- time_limit_total_level=time_limit_total_level,
3395
- **kwargs,
3396
- )
3397
- return model_names_trained
3398
-
3399
- def _train_multi_and_ensemble(
3400
- self,
3401
- X,
3402
- y,
3403
- X_val,
3404
- y_val,
3405
- X_test=None,
3406
- y_test=None,
3407
- hyperparameters: dict | None = None,
3408
- X_unlabeled=None,
3409
- num_stack_levels=0,
3410
- time_limit=None,
3411
- groups=None,
3412
- **kwargs,
3413
- ) -> list[str]:
3414
- """Identical to self.train_multi_levels, but also saves the data to disk. This should only ever be called once."""
3415
- if time_limit is not None and time_limit <= 0:
3416
- raise AssertionError(f"Not enough time left to train models. Consider specifying a larger time_limit. Time remaining: {round(time_limit, 2)}s")
3417
- if self.save_data and not self.is_data_saved:
3418
- self.save_X(X)
3419
- self.save_y(y)
3420
- if X_val is not None:
3421
- self.save_X_val(X_val)
3422
- if y_val is not None:
3423
- self.save_y_val(y_val)
3424
- if X_test is not None:
3425
- self.save_X_test(X_test)
3426
- if y_test is not None:
3427
- self.save_y_test(y_test)
3428
- self.is_data_saved = True
3429
- if self._groups is None:
3430
- self._groups = groups
3431
- self._num_rows_train = len(X)
3432
- if X_val is not None:
3433
- self._num_rows_val = len(X_val)
3434
- if X_test is not None:
3435
- self._num_rows_test = len(X_test)
3436
- self._num_cols_train = len(list(X.columns))
3437
- model_names_fit = self.train_multi_levels(
3438
- X,
3439
- y,
3440
- hyperparameters=hyperparameters,
3441
- X_val=X_val,
3442
- y_val=y_val,
3443
- X_test=X_test,
3444
- y_test=y_test,
3445
- X_unlabeled=X_unlabeled,
3446
- level_start=1,
3447
- level_end=num_stack_levels + 1,
3448
- time_limit=time_limit,
3449
- **kwargs,
3450
- )
3451
- if len(self.get_model_names()) == 0:
3452
- # TODO v1.0: Add toggle to raise exception if no models trained
3453
- logger.log(30, "Warning: AutoGluon did not successfully train any models")
3454
- return model_names_fit
3455
-
3456
- def _predict_model(self, X: pd.DataFrame, model: str, model_pred_proba_dict: dict | None = None) -> np.ndarray:
3457
- y_pred_proba = self._predict_proba_model(X=X, model=model, model_pred_proba_dict=model_pred_proba_dict)
3458
- return get_pred_from_proba(y_pred_proba=y_pred_proba, problem_type=self.problem_type)
3459
-
3460
- def _predict_proba_model(self, X: pd.DataFrame, model: str, model_pred_proba_dict: dict | None = None) -> np.ndarray:
3461
- model_pred_proba_dict = self.get_model_pred_proba_dict(X=X, models=[model], model_pred_proba_dict=model_pred_proba_dict)
3462
- if not isinstance(model, str):
3463
- model = model.name
3464
- return model_pred_proba_dict[model]
3465
-
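`_predict_model` is simply `_predict_proba_model` followed by a proba-to-label conversion. For multiclass problems that conversion is essentially an argmax over class columns; a self-contained stand-in (not the real `get_pred_from_proba`, which also covers binary, regression, and quantile cases):

    import numpy as np

    def pred_from_proba_multiclass(y_pred_proba: np.ndarray) -> np.ndarray:
        # Argmax over class columns; multiclass branch only.
        return np.argmax(y_pred_proba, axis=1)

    proba = np.array([[0.2, 0.8], [0.9, 0.1]])
    print(pred_from_proba_multiclass(proba))  # [1 0]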
3466
- def _proxy_model_feature_prune(
3467
- self, model_fit_kwargs: dict, time_limit: float, layer_fit_time: float, level: int, features: list[str], **feature_prune_kwargs: dict
3468
- ) -> list[str]:
3469
- """
3470
- Uses the best LightGBM-based base learner of this layer to perform time-aware, permutation-importance-based feature pruning.
3471
- If all LightGBM models fail, falls back to the model that achieved the highest validation score. Feature pruning is budgeted the smaller of the
3472
- remaining layer time limit and k (default = 2) times the time it took to fit this layer's base learners. Note that feature pruning can
3473
- exit earlier based on arguments in feature_prune_kwargs. The method returns the list of feature names that survived the pruning procedure.
3474
-
3475
- Parameters
3476
- ----------
3477
- feature_prune_kwargs : dict
3478
- Feature pruning kwarg arguments. Should contain arguments passed to FeatureSelector.select_features. One can optionally attach the following
3479
- additional kwargs that are consumed at this level: 'proxy_model_class' to use a model of a particular type with the highest validation score as the
3480
- proxy model, 'feature_prune_time_limit' to manually specify how long we should perform the feature pruning procedure for, 'k' to specify how long
3481
- we should perform feature pruning for if 'feature_prune_time_limit' has not been set (feature selection time budget is set to k * layer_fit_time),
3482
- and 'raise_exception' to signify that AutoGluon should throw an exception if feature pruning errors out.
3483
- time_limit : float
3484
- Time limit left within the current stack layer in seconds. Feature pruning should never take more than this time.
3485
- layer_fit_time : float
3486
- How long it took to fit all the models in this layer once. Used to calculate how long to feature prune for.
3487
- level : int
3488
- Level of this stack layer.
3489
- features: list[str]
3490
- The list of feature names in the inputted dataset.
3491
-
3492
- Returns
3493
- -------
3494
- candidate_features : list[str]
3495
- Feature names that survived the pruning procedure.
3496
- """
3497
- k = feature_prune_kwargs.pop("k", 2)
3498
- proxy_model_class = feature_prune_kwargs.pop("proxy_model_class", self._get_default_proxy_model_class())
3499
- feature_prune_time_limit = feature_prune_kwargs.pop("feature_prune_time_limit", None)
3500
- raise_exception_on_fail = feature_prune_kwargs.pop("raise_exception", False)
3501
-
3502
- proxy_model = self._get_feature_prune_proxy_model(proxy_model_class=proxy_model_class, level=level)
3503
- if proxy_model is None:
3504
- return features
3505
-
3506
- if feature_prune_time_limit is not None:
3507
- feature_prune_time_limit = min(max(time_limit - layer_fit_time, 0), feature_prune_time_limit)
3508
- elif time_limit is not None:
3509
- feature_prune_time_limit = min(max(time_limit - layer_fit_time, 0), max(k * layer_fit_time, 0.05 * time_limit))
3510
- else:
3511
- feature_prune_time_limit = max(k * layer_fit_time, 300)
3512
-
3513
- if feature_prune_time_limit < 2 * proxy_model.fit_time:
3514
- logger.warning(
3515
- f"Insufficient time to train even a single feature pruning model (remaining: {feature_prune_time_limit}, "
3516
- f"needed: {proxy_model.fit_time}). Skipping feature pruning."
3517
- )
3518
- return features
3519
- selector = FeatureSelector(
3520
- model=proxy_model, time_limit=feature_prune_time_limit, raise_exception=raise_exception_on_fail, problem_type=self.problem_type
3521
- )
3522
- candidate_features = selector.select_features(**feature_prune_kwargs, **model_fit_kwargs)
3523
- return candidate_features
3524
-
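The budget resolution above has three branches, depending on which of `feature_prune_time_limit` and `time_limit` are set. A compact standalone mirror of the same logic, with assumed inputs:

    def feature_prune_budget(time_limit, layer_fit_time, k=2.0,
                             feature_prune_time_limit=None):
        # Mirrors the three branches above (standalone rewrite for illustration).
        if feature_prune_time_limit is not None:
            return min(max(time_limit - layer_fit_time, 0), feature_prune_time_limit)
        if time_limit is not None:
            return min(max(time_limit - layer_fit_time, 0),
                       max(k * layer_fit_time, 0.05 * time_limit))
        return max(k * layer_fit_time, 300)

    # 1h layer budget, 10min layer fit time -> 20min pruning budget
    print(feature_prune_budget(time_limit=3600.0, layer_fit_time=600.0))  # 1200.0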
3525
- def _get_default_proxy_model_class(self):
3526
- return None
3527
-
3528
- def _retain_better_pruned_models(self, pruned_models: list[str], original_prune_map: dict, force_prune: bool = False) -> list[str]:
3529
- """
3530
- Compares models fit on the pruned set of features with their counterparts fit on the full set of features.
3531
- Takes the model that achieved the higher validation score and deletes the other from self.model_graph.
3532
-
3533
- Parameters
3534
- ----------
3535
- pruned_models : list[str]
3536
- A list of pruned model names.
3537
- original_prune_map : dict
3538
- A dictionary mapping the names of models fitted on pruned features to the names of models fitted on original features.
3539
- force_prune : bool, default = False
3540
- If set to true, force all base learners to work with the pruned set of features.
3541
-
3542
- Returns
3543
- -------
3544
- models : list[str]
3545
- A list of model names.
3546
- """
3547
- models = []
3548
- for pruned_model in pruned_models:
3549
- original_model = original_prune_map[pruned_model]
3550
- leaderboard = self.leaderboard()
3551
- original_score = leaderboard[leaderboard["model"] == original_model]["score_val"].item()
3552
- pruned_score = leaderboard[leaderboard["model"] == pruned_model]["score_val"].item()
3553
- score_str = f"({round(pruned_score, 4)} vs {round(original_score, 4)})"
3554
- if force_prune:
3555
- logger.log(30, f"Pruned score vs original score is {score_str}. Replacing original model since force_prune=True...")
3556
- self.delete_models(models_to_delete=original_model, dry_run=False)
3557
- models.append(pruned_model)
3558
- elif pruned_score > original_score:
3559
- logger.log(30, f"Model trained with feature pruning score is better than original model's score {score_str}. Replacing original model...")
3560
- self.delete_models(models_to_delete=original_model, dry_run=False)
3561
- models.append(pruned_model)
3562
- else:
3563
- logger.log(30, f"Model trained with feature pruning score is not better than original model's score {score_str}. Keeping original model...")
3564
- self.delete_models(models_to_delete=pruned_model, dry_run=False)
3565
- models.append(original_model)
3566
- return models
3567
-
3568
- # TODO: Enable raw=True for bagged models when X=None
3569
- # This is non-trivial to implement for multi-layer stacking ensembles on the OOF data.
3570
- # TODO: Consider limiting X to 10k rows here instead of inside the model call
3571
- def get_feature_importance(self, model=None, X=None, y=None, raw=True, **kwargs) -> pd.DataFrame:
3572
- if model is None:
3573
- model = self.model_best
3574
- model: AbstractModel = self.load_model(model)
3575
- if X is None and model.val_score is None:
3576
- raise AssertionError(
3577
- f"Model {model.name} is not valid for generating feature importances on the original training data because no validation data was used during training. Please specify new test data to compute feature importances."
3578
- )
3579
-
3580
- if X is None:
3581
- if isinstance(model, WeightedEnsembleModel):
3582
- if self.bagged_mode:
3583
- if raw:
3584
- raise AssertionError(
3585
- "`feature_stage='transformed'` feature importance on the original training data is not yet supported when bagging is enabled. Please specify new test data to compute feature importances."
3586
- )
3587
- X = None
3588
- is_oof = True
3589
- else:
3590
- if raw:
3591
- X = self.load_X_val()
3592
- else:
3593
- X = None
3594
- is_oof = False
3595
- elif isinstance(model, BaggedEnsembleModel):
3596
- if raw:
3597
- raise AssertionError(
3598
- "`feature_stage='transformed'` feature importance on the original training data is not yet supported when bagging is enabled. Please specify new test data to compute feature importances."
3599
- )
3600
- X = self.load_X()
3601
- X = self.get_inputs_to_model(model=model, X=X, fit=True)
3602
- is_oof = True
3603
- else:
3604
- X = self.load_X_val()
3605
- if not raw:
3606
- X = self.get_inputs_to_model(model=model, X=X, fit=False)
3607
- is_oof = False
3608
- else:
3609
- is_oof = False
3610
- if not raw:
3611
- X = self.get_inputs_to_model(model=model, X=X, fit=False)
3612
-
3613
- if y is None and X is not None:
3614
- if is_oof:
3615
- y = self.load_y()
3616
- else:
3617
- y = self.load_y_val()
3618
-
3619
- if raw:
3620
- return self._get_feature_importance_raw(X=X, y=y, model=model, **kwargs)
3621
- else:
3622
- if is_oof:
3623
- kwargs["is_oof"] = is_oof
3624
- return model.compute_feature_importance(X=X, y=y, **kwargs)
3625
-
3626
- # TODO: Can get feature importances of all children of model at no extra cost, requires scoring the values after predict_proba on each model
3627
- # Could solve by adding a self.score_all() function which takes model as input and also returns scores of all children models.
3628
- # This would be best solved after adding graph representation, it lives most naturally in AbstractModel
3629
- # TODO: Can skip features which were pruned on all models that model depends on (Complex to implement, requires graph representation)
3630
- # TODO: Note that raw importance will not equal non-raw importance for bagged models, even if raw features are identical to the model features.
3631
- # This is because for non-raw, we do an optimization where each fold model calls .compute_feature_importance(), and then the feature importances are averaged across the folds.
3632
- # This is different from raw, where the predictions of the folds are averaged and then feature importance is computed.
3633
- # Consider aligning these methods so they produce the same result.
3634
- # The output of this function is identical to non-raw when model is level 1 and non-bagged
3635
- def _get_feature_importance_raw(self, X, y, model, eval_metric=None, **kwargs) -> pd.DataFrame:
3636
- if eval_metric is None:
3637
- eval_metric = self.eval_metric
3638
- if model is None:
3639
- model = self._get_best()
3640
- if eval_metric.needs_pred:
3641
- predict_func = self.predict
3642
- else:
3643
- predict_func = self.predict_proba
3644
- model: AbstractModel = self.load_model(model)
3645
- predict_func_kwargs = dict(model=model)
3646
- return compute_permutation_feature_importance(
3647
- X=X,
3648
- y=y,
3649
- predict_func=predict_func,
3650
- predict_func_kwargs=predict_func_kwargs,
3651
- eval_metric=eval_metric,
3652
- quantile_levels=self.quantile_levels,
3653
- **kwargs,
3654
- )
3655
-
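`compute_permutation_feature_importance` measures how much the eval metric degrades when one column is shuffled. A toy illustration of the underlying idea (not the autogluon implementation, which adds shuffle repeats, subsampling, and time limits):

    import numpy as np
    import pandas as pd

    def toy_permutation_importance(predict, X: pd.DataFrame, y, score) -> dict:
        base = score(y, predict(X))
        rng = np.random.default_rng(0)
        out = {}
        for col in X.columns:
            X_shuffled = X.copy()
            X_shuffled[col] = rng.permutation(X_shuffled[col].to_numpy())
            out[col] = base - score(y, predict(X_shuffled))  # drop in score
        return out

    X = pd.DataFrame({"a": [0, 1, 2, 3], "b": [1, 1, 0, 0]})
    y = np.array([0, 1, 2, 3])
    predict = lambda df: df["a"].to_numpy()
    score = lambda y_true, y_pred: -np.mean((y_true - y_pred) ** 2)  # neg MSE
    print(toy_permutation_importance(predict, X, y, score))  # "a" high, "b" ~0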
3656
- def _get_models_load_info(self, model_names):
3657
- model_names = copy.deepcopy(model_names)
3658
- model_paths = self.get_models_attribute_dict(attribute="path", models=model_names)
3659
- model_types = self.get_models_attribute_dict(attribute="type", models=model_names)
3660
- return model_names, model_paths, model_types
3661
-
3662
- def get_model_attribute_full(self, model: str | list[str], attribute: str, func=sum) -> float | int:
3663
- """
3664
- Aggregates the attribute value (via `func`, default sum) across all models that the provided model depends on, including itself.
3665
- For instance, this function can return the expected total predict_time of a model.
3666
- attribute is the name of the desired attribute to be aggregated,
3667
- or a dictionary of model name -> attribute value if the attribute is not present in the graph.
3668
- """
3669
- if isinstance(model, list):
3670
- base_model_set = self.get_minimum_models_set(model)
3671
- else:
3672
- base_model_set = self.get_minimum_model_set(model)
3673
- if isinstance(attribute, dict):
3674
- is_dict = True
3675
- else:
3676
- is_dict = False
3677
- if len(base_model_set) == 1:
3678
- if is_dict:
3679
- return attribute[model]
3680
- else:
3681
- return self.model_graph.nodes[base_model_set[0]][attribute]
3682
- # attribute_full = 0
3683
- attribute_lst = []
3684
- for base_model in base_model_set:
3685
- if is_dict:
3686
- attribute_base_model = attribute[base_model]
3687
- else:
3688
- attribute_base_model = self.model_graph.nodes[base_model][attribute]
3689
- if attribute_base_model is None:
3690
- return None
3691
- attribute_lst.append(attribute_base_model)
3692
- # attribute_full += attribute_base_model
3693
- if attribute_lst:
3694
- attribute_full = func(attribute_lst)
3695
- else:
3696
- attribute_full = 0
3697
- return attribute_full
3698
-
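In graph terms this folds `func` over the model's ancestor set in `model_graph`. A small sketch with networkx (which this module imports as `nx`), using assumed model names and timings:

    import networkx as nx

    g = nx.DiGraph()
    g.add_node("LightGBM_L1", predict_time=0.10)
    g.add_node("CatBoost_L1", predict_time=0.30)
    g.add_node("WeightedEnsemble_L2", predict_time=0.01)
    g.add_edge("LightGBM_L1", "WeightedEnsemble_L2")
    g.add_edge("CatBoost_L1", "WeightedEnsemble_L2")

    # Minimum model set: the model plus everything it depends on.
    model_set = nx.ancestors(g, "WeightedEnsemble_L2") | {"WeightedEnsemble_L2"}
    total_predict_time = sum(g.nodes[m]["predict_time"] for m in model_set)
    print(round(total_predict_time, 2))  # 0.41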
3699
- def get_models_attribute_full(self, models: list[str], attribute: str, func=sum):
3700
- """
3701
- For each model in models, returns the output of self.get_model_attribute_full mapped to a dict.
3702
- """
3703
- d = dict()
3704
- for model in models:
3705
- d[model] = self.get_model_attribute_full(model=model, attribute=attribute, func=func)
3706
- return d
3707
-
3708
- # Gets the minimum set of models that the provided models depend on, including themselves
3709
- # Returns a list of model names
3710
- def get_minimum_models_set(self, models: list) -> list:
3711
- models_set = set()
3712
- for model in models:
3713
- models_set = models_set.union(self.get_minimum_model_set(model))
3714
- return list(models_set)
3715
-
3716
- # Gets the set of base models used directly by the provided model
3717
- # Returns a list of model names
3718
- def get_base_model_names(self, model) -> list:
3719
- if not isinstance(model, str):
3720
- model = model.name
3721
- base_model_set = list(self.model_graph.predecessors(model))
3722
- return base_model_set
3723
-
3724
- def model_refit_map(self, inverse=False) -> dict[str, str]:
3725
- """
3726
- Returns dict of parent model -> refit model
3727
-
3728
- If inverse=True, return dict of refit model -> parent model
3729
- """
3730
- model_refit_map = self.get_models_attribute_dict(attribute="refit_full_parent")
3731
- if not inverse:
3732
- model_refit_map = {parent: refit for refit, parent in model_refit_map.items()}
3733
- return model_refit_map
3734
-
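Each refit model carries a `refit_full_parent` attribute naming its bagged parent; by default the method flips that refit -> parent mapping into parent -> refit. Illustration with assumed model names:

    refit_full_parent = {"LightGBM_FULL": "LightGBM_BAG_L1"}  # refit -> parent
    model_refit_map = {parent: refit for refit, parent in refit_full_parent.items()}
    print(model_refit_map)  # {'LightGBM_BAG_L1': 'LightGBM_FULL'}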
3735
- def model_exists(self, model: str) -> bool:
3736
- return model in self.get_model_names()
3737
-
3738
- def _flatten_model_info(self, model_info: dict) -> dict:
3739
- """
3740
- Flattens the model_info nested dictionary into a shallow dictionary to convert to a pandas DataFrame row.
3741
-
3742
- Parameters
3743
- ----------
3744
- model_info: dict
3745
- A nested dictionary of model metadata information
3746
-
3747
- Returns
3748
- -------
3749
- A flattened dictionary of model info.
3750
- """
3751
- model_info_keys = [
3752
- "num_features",
3753
- "model_type",
3754
- "hyperparameters",
3755
- "hyperparameters_fit",
3756
- "ag_args_fit",
3757
- "features",
3758
- "is_initialized",
3759
- "is_fit",
3760
- "is_valid",
3761
- "can_infer",
3762
- ]
3763
- model_info_flat = {k: v for k, v in model_info.items() if k in model_info_keys}
3764
-
3765
- custom_info = {}
3766
- bagged_info = model_info.get("bagged_info", {})
3767
- custom_info["num_models"] = bagged_info.get("num_child_models", 1)
3768
- custom_info["memory_size"] = bagged_info.get("max_memory_size", model_info["memory_size"])
3769
- custom_info["memory_size_min"] = bagged_info.get("min_memory_size", model_info["memory_size"])
3770
- custom_info["compile_time"] = bagged_info.get("compile_time", model_info["compile_time"])
3771
- custom_info["child_model_type"] = bagged_info.get("child_model_type", None)
3772
- custom_info["child_hyperparameters"] = bagged_info.get("child_hyperparameters", None)
3773
- custom_info["child_hyperparameters_fit"] = bagged_info.get("child_hyperparameters_fit", None)
3774
- custom_info["child_ag_args_fit"] = bagged_info.get("child_ag_args_fit", None)
3775
-
3776
- model_info_keys = [
3777
- "num_models",
3778
- "memory_size",
3779
- "memory_size_min",
3780
- "compile_time",
3781
- "child_model_type",
3782
- "child_hyperparameters",
3783
- "child_hyperparameters_fit",
3784
- "child_ag_args_fit",
3785
- ]
3786
- for key in model_info_keys:
3787
- model_info_flat[key] = custom_info[key]
3788
- return model_info_flat
3789
-
3790
- def leaderboard(self, extra_info=False, refit_full: bool | None = None, set_refit_score_to_parent: bool = False):
3791
- model_names = self.get_model_names()
3792
- models_full_dict = self.get_models_attribute_dict(models=model_names, attribute="refit_full_parent")
3793
- if refit_full is not None:
3794
- if refit_full:
3795
- model_names = [model for model in model_names if model in models_full_dict]
3796
- else:
3797
- model_names = [model for model in model_names if model not in models_full_dict]
3798
- score_val = []
3799
- eval_metric = []
3800
- stopping_metric = []
3801
- fit_time_marginal = []
3802
- pred_time_val_marginal = []
3803
- stack_level = []
3804
- fit_time = []
3805
- pred_time_val = []
3806
- can_infer = []
3807
- fit_order = list(range(1, len(model_names) + 1))
3808
- score_val_dict = self.get_models_attribute_dict("val_score")
3809
- eval_metric_dict = self.get_models_attribute_dict("eval_metric")
3810
- stopping_metric_dict = self.get_models_attribute_dict("stopping_metric")
3811
- fit_time_marginal_dict = self.get_models_attribute_dict("fit_time")
3812
- predict_time_marginal_dict = self.get_models_attribute_dict("predict_time")
3813
- fit_time_dict = self.get_models_attribute_full(attribute="fit_time", models=model_names, func=sum)
3814
- pred_time_val_dict = self.get_models_attribute_full(attribute="predict_time", models=model_names, func=sum)
3815
- can_infer_dict = self.get_models_attribute_full(attribute="can_infer", models=model_names, func=min)
3816
- for model_name in model_names:
3817
- if set_refit_score_to_parent and (model_name in models_full_dict):
3818
- if models_full_dict[model_name] not in score_val_dict:
3819
- raise AssertionError(
3820
- f"Model parent is missing from leaderboard when `set_refit_score_to_parent=True`, "
3821
- f"this is invalid. The parent model may have been deleted. "
3822
- f"(model='{model_name}', parent='{models_full_dict[model_name]}')"
3823
- )
3824
- score_val.append(score_val_dict[models_full_dict[model_name]])
3825
- else:
3826
- score_val.append(score_val_dict[model_name])
3827
- eval_metric.append(eval_metric_dict[model_name])
3828
- stopping_metric.append(stopping_metric_dict[model_name])
3829
- fit_time_marginal.append(fit_time_marginal_dict[model_name])
3830
- fit_time.append(fit_time_dict[model_name])
3831
- pred_time_val_marginal.append(predict_time_marginal_dict[model_name])
3832
- pred_time_val.append(pred_time_val_dict[model_name])
3833
- stack_level.append(self.get_model_level(model_name))
3834
- can_infer.append(can_infer_dict[model_name])
3835
-
3836
- model_info_dict = defaultdict(list)
3837
- extra_info_dict = dict()
3838
- if extra_info:
3839
- # TODO: feature_metadata
3840
- # TODO: disk size
3841
- # TODO: load time
3842
- # TODO: Add persist_if_mem_safe() function to persist in memory all models if reasonable memory size (or a specific model+ancestors)
3843
- # TODO: Add is_persisted() function to check which models are persisted in memory
3844
- # TODO: package_dependencies, package_dependencies_full
3845
-
3846
- info = self.get_info(include_model_info=True)
3847
- model_info = info["model_info"]
3848
- custom_model_info = {}
3849
- for model_name in model_info:
3850
- custom_info = {}
3851
- bagged_info = model_info[model_name].get("bagged_info", {})
3852
- custom_info["num_models"] = bagged_info.get("num_child_models", 1)
3853
- custom_info["memory_size"] = bagged_info.get("max_memory_size", model_info[model_name]["memory_size"])
3854
- custom_info["memory_size_min"] = bagged_info.get("min_memory_size", model_info[model_name]["memory_size"])
3855
- custom_info["compile_time"] = bagged_info.get("compile_time", model_info[model_name]["compile_time"])
3856
- custom_info["child_model_type"] = bagged_info.get("child_model_type", None)
3857
- custom_info["child_hyperparameters"] = bagged_info.get("child_hyperparameters", None)
3858
- custom_info["child_hyperparameters_fit"] = bagged_info.get("child_hyperparameters_fit", None)
3859
- custom_info["child_ag_args_fit"] = bagged_info.get("child_ag_args_fit", None)
3860
- custom_model_info[model_name] = custom_info
3861
-
3862
- model_info_keys = ["num_features", "model_type", "hyperparameters", "hyperparameters_fit", "ag_args_fit", "features"]
3863
- model_info_sum_keys = []
3864
- for key in model_info_keys:
3865
- model_info_dict[key] = [model_info[model_name][key] for model_name in model_names]
3866
- if key in model_info_sum_keys:
3867
- key_dict = {model_name: model_info[model_name][key] for model_name in model_names}
3868
- model_info_dict[key + "_full"] = [self.get_model_attribute_full(model=model_name, attribute=key_dict) for model_name in model_names]
3869
-
3870
- model_info_keys = [
3871
- "num_models",
3872
- "memory_size",
3873
- "memory_size_min",
3874
- "compile_time",
3875
- "child_model_type",
3876
- "child_hyperparameters",
3877
- "child_hyperparameters_fit",
3878
- "child_ag_args_fit",
3879
- ]
3880
- model_info_full_keys = {
3881
- "memory_size": [("memory_size_w_ancestors", sum)],
3882
- "memory_size_min": [("memory_size_min_w_ancestors", max)],
3883
- "num_models": [("num_models_w_ancestors", sum)],
3884
- }
3885
- for key in model_info_keys:
3886
- model_info_dict[key] = [custom_model_info[model_name][key] for model_name in model_names]
3887
- if key in model_info_full_keys:
3888
- key_dict = {model_name: custom_model_info[model_name][key] for model_name in model_names}
3889
- for column_name, func in model_info_full_keys[key]:
3890
- model_info_dict[column_name] = [
3891
- self.get_model_attribute_full(model=model_name, attribute=key_dict, func=func) for model_name in model_names
3892
- ]
3893
-
3894
- ancestors = [list(nx.dag.ancestors(self.model_graph, model_name)) for model_name in model_names]
3895
- descendants = [list(nx.dag.descendants(self.model_graph, model_name)) for model_name in model_names]
3896
-
3897
- model_info_dict["num_ancestors"] = [len(ancestor_lst) for ancestor_lst in ancestors]
3898
- model_info_dict["num_descendants"] = [len(descendant_lst) for descendant_lst in descendants]
3899
- model_info_dict["ancestors"] = ancestors
3900
- model_info_dict["descendants"] = descendants
3901
-
3902
- extra_info_dict = {
3903
- "stopping_metric": stopping_metric,
3904
- }
3905
-
3906
- df = pd.DataFrame(
3907
- data={
3908
- "model": model_names,
3909
- "score_val": score_val,
3910
- "eval_metric": eval_metric,
3911
- "pred_time_val": pred_time_val,
3912
- "fit_time": fit_time,
3913
- "pred_time_val_marginal": pred_time_val_marginal,
3914
- "fit_time_marginal": fit_time_marginal,
3915
- "stack_level": stack_level,
3916
- "can_infer": can_infer,
3917
- "fit_order": fit_order,
3918
- **extra_info_dict,
3919
- **model_info_dict,
3920
- }
3921
- )
3922
- df_sorted = df.sort_values(by=["score_val", "pred_time_val", "model"], ascending=[False, True, False]).reset_index(drop=True)
3923
-
3924
- df_columns_lst = df_sorted.columns.tolist()
3925
- explicit_order = [
3926
- "model",
3927
- "score_val",
3928
- "eval_metric",
3929
- "pred_time_val",
3930
- "fit_time",
3931
- "pred_time_val_marginal",
3932
- "fit_time_marginal",
3933
- "stack_level",
3934
- "can_infer",
3935
- "fit_order",
3936
- "num_features",
3937
- "num_models",
3938
- "num_models_w_ancestors",
3939
- "memory_size",
3940
- "memory_size_w_ancestors",
3941
- "memory_size_min",
3942
- "memory_size_min_w_ancestors",
3943
- "num_ancestors",
3944
- "num_descendants",
3945
- "model_type",
3946
- "child_model_type",
3947
- ]
3948
- explicit_order = [column for column in explicit_order if column in df_columns_lst]
3949
- df_columns_other = [column for column in df_columns_lst if column not in explicit_order]
3950
- df_columns_new = explicit_order + df_columns_other
3951
- df_sorted = df_sorted[df_columns_new]
3952
-
3953
- return df_sorted
3954
-
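The final ordering above is plain pandas: best validation score first, ties broken by faster inference, then by model name. A minimal sketch with toy values (not AutoGluon output):

    import pandas as pd

    toy = pd.DataFrame({
        "model": ["WeightedEnsemble_L2", "LightGBM", "CatBoost"],
        "score_val": [0.92, 0.90, 0.90],
        "pred_time_val": [0.31, 0.05, 0.09],
    })
    # Same sort used by leaderboard(): score descending, inference time ascending.
    toy_sorted = toy.sort_values(
        by=["score_val", "pred_time_val", "model"], ascending=[False, True, False]
    ).reset_index(drop=True)
    # -> WeightedEnsemble_L2, then LightGBM (faster), then CatBoost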
3955
- def model_failures(self) -> pd.DataFrame:
3956
- """
3957
- [Advanced] Get the model failures that occurred during the fitting of this predictor, in the form of a pandas DataFrame.
3958
-
3959
- This is useful for in-depth debugging of model failures and identifying bugs.
3960
-
3961
- Returns
3962
- -------
3963
- model_failures_df: pd.DataFrame
3964
- A DataFrame of model failures. Each row corresponds to a model failure, and columns correspond to meta information about that model.
3965
- """
3966
- model_infos = dict()
3967
- for i, (model_name, model_info) in enumerate(self._models_failed_to_train_errors.items()):
3968
- model_info = copy.deepcopy(model_info)
3969
- model_info_inner = model_info["model_info"]
3970
-
3971
- model_info_inner = self._flatten_model_info(model_info_inner)
3972
-
3973
- valid_keys = [
3974
- "exc_type",
3975
- "exc_str",
3976
- "exc_traceback",
3977
- "total_time",
3978
- ]
3979
- valid_keys_inner = [
3980
- "model_type",
3981
- "hyperparameters",
3982
- "hyperparameters_fit",
3983
- "is_initialized",
3984
- "is_fit",
3985
- "is_valid",
3986
- "can_infer",
3987
- "num_features",
3988
- "memory_size",
3989
- "num_models",
3990
- "child_model_type",
3991
- "child_hyperparameters",
3992
- "child_hyperparameters_fit",
3993
- ]
3994
- model_info_out = {k: v for k, v in model_info.items() if k in valid_keys}
3995
- model_info_inner_out = {k: v for k, v in model_info_inner.items() if k in valid_keys_inner}
3996
-
3997
- model_info_out.update(model_info_inner_out)
3998
- model_info_out["model"] = model_name
3999
- model_info_out["exc_order"] = i + 1
4000
-
4001
- model_infos[model_name] = model_info_out
4002
-
4003
- df = pd.DataFrame(
4004
- data=model_infos,
4005
- ).T
4006
-
4007
- explicit_order = [
4008
- "model",
4009
- "exc_type",
4010
- "total_time",
4011
- "model_type",
4012
- "child_model_type",
4013
- "is_initialized",
4014
- "is_fit",
4015
- "is_valid",
4016
- "can_infer",
4017
- "num_features",
4018
- "num_models",
4019
- "memory_size",
4020
- "hyperparameters",
4021
- "hyperparameters_fit",
4022
- "child_hyperparameters",
4023
- "child_hyperparameters_fit",
4024
- "exc_str",
4025
- "exc_traceback",
4026
- "exc_order",
4027
- ]
4028
-
4029
- df_columns_lst = list(df.columns)
4030
- explicit_order = [column for column in explicit_order if column in df_columns_lst]
4031
- df_columns_other = [column for column in df_columns_lst if column not in explicit_order]
4032
- df_columns_new = explicit_order + df_columns_other
4033
- df_sorted = df[df_columns_new]
4034
- df_sorted = df_sorted.reset_index(drop=True)
4035
-
4036
- return df_sorted
4037
-
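The transpose at the end turns one failure record per model into one DataFrame row per model. A small sketch with fabricated placeholder records:

    import pandas as pd

    records = {
        "LightGBM": {"model": "LightGBM", "exc_type": "MemoryError", "exc_order": 1},
        "CatBoost": {"model": "CatBoost", "exc_type": "TimeLimitExceeded", "exc_order": 2},
    }
    # Dict keys become columns, so transpose to get one row per failed model.
    df = pd.DataFrame(data=records).T.reset_index(drop=True)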
4038
- def get_info(self, include_model_info=False, include_model_failures=True) -> dict:
4039
- num_models_trained = len(self.get_model_names())
4040
- if self.model_best is not None:
4041
- best_model = self.model_best
4042
- else:
4043
- try:
4044
- best_model = self.get_model_best()
4045
- except AssertionError:
4046
- best_model = None
4047
- if best_model is not None:
4048
- best_model_score_val = self.get_model_attribute(model=best_model, attribute="val_score")
4049
- best_model_stack_level = self.get_model_level(best_model)
4050
- else:
4051
- best_model_score_val = None
4052
- best_model_stack_level = None
4053
- # fit_time = None
4054
- num_bag_folds = self.k_fold
4055
- max_core_stack_level = self.get_max_level("core")
4056
- max_stack_level = self.get_max_level()
4057
-
4058
- problem_type = self.problem_type
4059
- eval_metric = self.eval_metric.name
4060
- time_train_start = self._time_train_start_last
4061
- num_rows_train = self._num_rows_train
4062
- num_cols_train = self._num_cols_train
4063
- num_rows_val = self._num_rows_val
4064
- num_rows_test = self._num_rows_test
4065
- num_classes = self.num_classes
4066
- # TODO:
4067
- # Disk size of models
4068
- # Raw feature count
4069
- # HPO time
4070
- # Bag time
4071
- # Feature prune time
4072
- # Exception count / models failed count
4073
- # True model count (models * kfold)
4074
- # AutoGluon version fit on
4075
- # Max memory usage
4076
- # CPU count used / GPU count used
4077
-
4078
- info = {
4079
- "time_train_start": time_train_start,
4080
- "num_rows_train": num_rows_train,
4081
- "num_cols_train": num_cols_train,
4082
- "num_rows_val": num_rows_val,
4083
- "num_rows_test": num_rows_test,
4084
- "num_classes": num_classes,
4085
- "problem_type": problem_type,
4086
- "eval_metric": eval_metric,
4087
- "best_model": best_model,
4088
- "best_model_score_val": best_model_score_val,
4089
- "best_model_stack_level": best_model_stack_level,
4090
- "num_models_trained": num_models_trained,
4091
- "num_bag_folds": num_bag_folds,
4092
- "max_stack_level": max_stack_level,
4093
- "max_core_stack_level": max_core_stack_level,
4094
- }
4095
-
4096
- if include_model_info:
4097
- info["model_info"] = self.get_models_info()
4098
- if include_model_failures:
4099
- info["model_info_failures"] = copy.deepcopy(self._models_failed_to_train_errors)
4100
-
4101
- return info
4102
-
4103
- def reduce_memory_size(
4104
- self, remove_data=True, remove_fit_stack=False, remove_fit=True, remove_info=False, requires_save=True, reduce_children=False, **kwargs
4105
- ):
4106
- if remove_data and self.is_data_saved:
4107
- data_files = [
4108
- os.path.join(self.path_data, "X.pkl"),
4109
- os.path.join(self.path_data, "X_val.pkl"),
4110
- os.path.join(self.path_data, "y.pkl"),
4111
- os.path.join(self.path_data, "y_val.pkl"),
4112
- ]
4113
- for data_file in data_files:
4114
- try:
4115
- os.remove(data_file)
4116
- except FileNotFoundError:
4117
- pass
4118
- if requires_save:
4119
- self.is_data_saved = False
4120
- try:
4121
- os.rmdir(self.path_data)
4122
- except OSError:
4123
- pass
4124
- shutil.rmtree(path=Path(self._path_attr), ignore_errors=True)
4125
- try:
4126
- os.rmdir(self.path_utils)
4127
- except OSError:
4128
- pass
4129
- if remove_info and requires_save:
4130
- # Remove model failure info artifacts
4131
- self._models_failed_to_train_errors = dict()
4132
- models = self.get_model_names()
4133
- for model in models:
4134
- model = self.load_model(model)
4135
- model.reduce_memory_size(
4136
- remove_fit_stack=remove_fit_stack,
4137
- remove_fit=remove_fit,
4138
- remove_info=remove_info,
4139
- requires_save=requires_save,
4140
- reduce_children=reduce_children,
4141
- **kwargs,
4142
- )
4143
- if requires_save:
4144
- self.save_model(model, reduce_memory=False)
4145
- if requires_save:
4146
- self.save()
4147
-
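Deletion here is deliberately best-effort: every removal is wrapped so that an already-missing file or a non-empty directory never aborts the cleanup. The same pattern in isolation (hypothetical helper):

    import os

    def best_effort_cleanup(data_dir: str, files: list[str]) -> None:
        for f in files:
            try:
                os.remove(os.path.join(data_dir, f))
            except FileNotFoundError:
                pass  # already deleted; nothing to do
        try:
            os.rmdir(data_dir)  # succeeds only once the directory is empty
        except OSError:
            pass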
4148
- # TODO: Also enable deletion of models which didn't succeed in training (files may still be persisted)
4149
- # This includes the original HPO fold for stacking
4150
- # Deletes specified models from trainer and from disk (if delete_from_disk=True).
4151
- def delete_models(self, models_to_keep=None, models_to_delete=None, allow_delete_cascade=False, delete_from_disk=True, dry_run=True):
4152
- if models_to_keep is not None and models_to_delete is not None:
4153
- raise ValueError("Exactly one of [models_to_keep, models_to_delete] must be set.")
4154
- if models_to_keep is not None:
4155
- if not isinstance(models_to_keep, list):
4156
- models_to_keep = [models_to_keep]
4157
- minimum_model_set = set()
4158
- for model in models_to_keep:
4159
- minimum_model_set.update(self.get_minimum_model_set(model))
4160
- minimum_model_set = list(minimum_model_set)
4161
- models_to_remove = [model for model in self.get_model_names() if model not in minimum_model_set]
4162
- elif models_to_delete is not None:
4163
- if not isinstance(models_to_delete, list):
4164
- models_to_delete = [models_to_delete]
4165
- minimum_model_set = set(models_to_delete)
4166
- minimum_model_set_orig = copy.deepcopy(minimum_model_set)
4167
- for model in models_to_delete:
4168
- minimum_model_set.update(nx.algorithms.dag.descendants(self.model_graph, model))
4169
- if not allow_delete_cascade:
4170
- if minimum_model_set != minimum_model_set_orig:
4171
- raise AssertionError(
4172
- "models_to_delete contains models which cause a delete cascade due to other models being dependent on them. Set allow_delete_cascade=True to enable the deletion."
4173
- )
4174
- minimum_model_set = list(minimum_model_set)
4175
- models_to_remove = [model for model in self.get_model_names() if model in minimum_model_set]
4176
- else:
4177
- raise ValueError("Exactly one of [models_to_keep, models_to_delete] must be set.")
4178
-
4179
- if dry_run:
4180
- logger.log(30, f"Dry run enabled, AutoGluon would have deleted the following models: {models_to_remove}")
4181
- if delete_from_disk:
4182
- for model in models_to_remove:
4183
- model = self.load_model(model)
4184
- logger.log(30, f"\tDirectory {model.path} would have been deleted.")
4185
- logger.log(30, "To perform the deletion, set dry_run=False")
4186
- return
4187
-
4188
- if delete_from_disk:
4189
- for model in models_to_remove:
4190
- model = self.load_model(model)
4191
- model.delete_from_disk()
4192
-
4193
- for model in models_to_remove:
4194
- self._delete_model_from_graph(model=model)
4195
-
4196
- models_kept = self.get_model_names()
4197
-
4198
- if self.model_best is not None and self.model_best not in models_kept:
4199
- try:
4200
- self.model_best = self.get_model_best()
4201
- except AssertionError:
4202
- self.model_best = None
4203
-
4204
- # TODO: Delete from all the other model dicts
4205
- self.save()
4206
-
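The cascade check above relies on the model DAG: deleting a model invalidates every model stacked on top of it, i.e. its descendants in the graph. A self-contained illustration:

    import networkx as nx

    g = nx.DiGraph()
    g.add_edge("LightGBM_L1", "WeightedEnsemble_L2")  # ensemble depends on the base model

    to_delete = {"LightGBM_L1"}
    cascade = set(to_delete)
    for m in to_delete:
        cascade.update(nx.algorithms.dag.descendants(g, m))
    assert cascade == {"LightGBM_L1", "WeightedEnsemble_L2"}  # a delete cascade is required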
4207
- def _delete_model_from_graph(self, model: str):
4208
- self.model_graph.remove_node(model)
4209
- if model in self.models:
4210
- self.models.pop(model)
4211
- path_attr_model = Path(self._path_attr_model(model))
4212
- shutil.rmtree(path=path_attr_model, ignore_errors=True)
4213
-
4214
- @staticmethod
4215
- def _process_hyperparameters(hyperparameters: dict) -> dict:
4216
- return process_hyperparameters(hyperparameters=hyperparameters)
4217
-
4218
- def distill(
4219
- self,
4220
- X=None,
4221
- y=None,
4222
- X_val=None,
4223
- y_val=None,
4224
- X_unlabeled=None,
4225
- time_limit=None,
4226
- hyperparameters=None,
4227
- holdout_frac=None,
4228
- verbosity=None,
4229
- models_name_suffix=None,
4230
- teacher=None,
4231
- teacher_preds="soft",
4232
- augmentation_data=None,
4233
- augment_method="spunge",
4234
- augment_args={"size_factor": 5, "max_size": int(1e5)},
4235
- augmented_sample_weight=1.0,
4236
- ):
4237
- """Various distillation algorithms.
4238
- Args:
4239
- X, y: pd.DataFrame and pd.Series of training data.
4240
- If None, original training data used during predictor.fit() will be loaded.
4241
- This data is split into train/validation if X_val, y_val are None.
4242
- X_val, y_val: pd.DataFrame and pd.Series of validation data.
4243
- time_limit, hyperparameters, holdout_frac: defined as in predictor.fit()
4244
- teacher (None or str):
4245
- If None, uses the model with the highest validation score as the teacher model, otherwise use the specified model name as the teacher.
4246
- teacher_preds (None or str): If None, we only train with original labels (no data augmentation, overrides augment_method)
4247
- If 'hard', labels are hard teacher predictions given by: teacher.predict()
4248
- If 'soft', labels are soft teacher predictions given by: teacher.predict_proba()
4249
- Note: 'hard' and 'soft' are equivalent for regression problems.
4250
- If augment_method is specified, teacher predictions are only used to label augmented data (training data keeps original labels).
4251
- To apply label-smoothing: teacher_preds='onehot' will use original training data labels converted to one-hots for multiclass (no data augmentation). # TODO: expose smoothing-hyperparameter.
4252
- models_name_suffix (str): Suffix to append to each student model's name, new names will look like: 'MODELNAME_dstl_SUFFIX'
4253
- augmentation_data: pd.DataFrame of additional data to use as "augmented data" (does not contain labels).
4254
- When specified, augment_method, augment_args are ignored, and this is the only augmented data that is used (teacher_preds cannot be None).
4255
- augment_method (None or str): specifies which augmentation strategy to utilize. Options: [None, 'spunge', 'munge']
4256
- If None, no augmentation gets applied.
4258
- augment_args (dict): args passed into the augmentation function corresponding to augment_method.
4259
- augmented_sample_weight (float): Nonnegative value indicating how much to weight augmented samples. This is only considered if sample_weight was initially specified in Predictor.
4260
- """
4261
- if verbosity is None:
4262
- verbosity = self.verbosity
4263
-
4264
- if teacher is None:
4265
- teacher = self._get_best()
4266
-
4267
- hyperparameter_tune = False # TODO: add as argument with scheduler options.
4268
- if augmentation_data is not None and teacher_preds is None:
4269
- raise ValueError("augmentation_data must be None if teacher_preds is None")
4270
-
4271
- logger.log(20, f"Distilling with teacher='{teacher}', teacher_preds={str(teacher_preds)}, augment_method={str(augment_method)} ...")
4272
- if teacher not in self.get_model_names(can_infer=True):
4273
- raise AssertionError(
4274
- f"Teacher model '{teacher}' is not a valid teacher model! Either it does not exist or it cannot infer on new data.\n"
4275
- f"Valid teacher models: {self.get_model_names(can_infer=True)}"
4276
- )
4277
- if X is None:
4278
- if y is not None:
4279
- raise ValueError("X cannot be None when y specified.")
4280
- X = self.load_X()
4281
- X_val = self.load_X_val()
4282
-
4283
- if y is None:
4284
- y = self.load_y()
4285
- y_val = self.load_y_val()
4286
-
4287
- if X_val is None:
4288
- if y_val is not None:
4289
- raise ValueError("X_val cannot be None when y_val specified.")
4290
- if holdout_frac is None:
4291
- holdout_frac = default_holdout_frac(len(X), hyperparameter_tune)
4292
- X, X_val, y, y_val = generate_train_test_split(X, y, problem_type=self.problem_type, test_size=holdout_frac)
4293
-
4294
- y_val_og = y_val.copy()
4295
- og_bagged_mode = self.bagged_mode
4296
- og_verbosity = self.verbosity
4297
- self.bagged_mode = False # turn off bagging
4298
- self.verbosity = verbosity # change verbosity for distillation
4299
-
4300
- if self.sample_weight is not None:
4301
- X, w = extract_column(X, self.sample_weight)
4302
-
4303
- if teacher_preds is None or teacher_preds == "onehot":
4304
- augment_method = None
4305
- logger.log(
4306
- 20, "Training students without a teacher model. Set teacher_preds = 'soft' or 'hard' to distill using the best AutoGluon predictor as teacher."
4307
- )
4308
-
4309
- if teacher_preds in ["onehot", "soft"]:
4310
- y = format_distillation_labels(y, self.problem_type, self.num_classes)
4311
- y_val = format_distillation_labels(y_val, self.problem_type, self.num_classes)
4312
-
4313
- if augment_method is None and augmentation_data is None:
4314
- if teacher_preds == "hard":
4315
- y_pred = pd.Series(self.predict(X, model=teacher))
4316
- if (self.problem_type != REGRESSION) and (len(y_pred.unique()) < len(y.unique())): # add missing labels
4317
- logger.log(15, "Adding missing labels to distillation dataset by including some real training examples")
4318
- indices_to_add = []
4319
- for clss in y.unique():
4320
- if clss not in y_pred.unique():
4321
- logger.log(15, f"Fetching a row with label={clss} from training data")
4322
- clss_index = y[y == clss].index[0]
4323
- indices_to_add.append(clss_index)
4324
- X_extra = X.loc[indices_to_add].copy()
4325
- y_extra = y.loc[indices_to_add].copy() # these are actually real training examples
4326
- X = pd.concat([X, X_extra])
4327
- y_pred = pd.concat([y_pred, y_extra])
4328
- if self.sample_weight is not None:
4329
- w = pd.concat([w, w[indices_to_add]])
4330
- y = y_pred
4331
- elif teacher_preds == "soft":
4332
- y = self.predict_proba(X, model=teacher)
4333
- if self.problem_type == MULTICLASS:
4334
- y = pd.DataFrame(y)
4335
- else:
4336
- y = pd.Series(y)
4337
- else:
4338
- X_aug = augment_data(
4339
- X=X, feature_metadata=self.feature_metadata, augmentation_data=augmentation_data, augment_method=augment_method, augment_args=augment_args
4340
- )
4341
- if len(X_aug) > 0:
4342
- if teacher_preds == "hard":
4343
- y_aug = pd.Series(self.predict(X_aug, model=teacher))
4344
- elif teacher_preds == "soft":
4345
- y_aug = self.predict_proba(X_aug, model=teacher)
4346
- if self.problem_type == MULTICLASS:
4347
- y_aug = pd.DataFrame(y_aug)
4348
- else:
4349
- y_aug = pd.Series(y_aug)
4350
- else:
4351
- raise ValueError(f"Unknown teacher_preds specified: {teacher_preds}")
4352
-
4353
- X = pd.concat([X, X_aug])
4354
- y = pd.concat([y, y_aug])
4355
- if self.sample_weight is not None:
4356
- w = pd.concat([w, pd.Series([augmented_sample_weight] * len(X_aug))])
4357
-
4358
- X.reset_index(drop=True, inplace=True)
4359
- y.reset_index(drop=True, inplace=True)
4360
- if self.sample_weight is not None:
4361
- w.reset_index(drop=True, inplace=True)
4362
- X[self.sample_weight] = w
4363
-
4364
- name_suffix = "_DSTL" # all student model names contain this substring
4365
- if models_name_suffix is not None:
4366
- name_suffix = name_suffix + "_" + models_name_suffix
4367
-
4368
- if hyperparameters is None:
4369
- hyperparameters = {"GBM": {}, "CAT": {}, "NN_TORCH": {}, "RF": {}}
4370
- hyperparameters = self._process_hyperparameters(
4371
- hyperparameters=hyperparameters
4372
- ) # TODO: consider exposing ag_args_fit, excluded_model_types as distill() arguments.
4373
- if teacher_preds is not None and teacher_preds != "hard" and self.problem_type != REGRESSION:
4374
- self._regress_preds_asprobas = True
4375
-
4376
- core_kwargs = {
4377
- "stack_name": self.distill_stackname,
4378
- "get_models_func": self.construct_model_templates_distillation,
4379
- }
4380
- aux_kwargs = {
4381
- "get_models_func": self.construct_model_templates_distillation,
4382
- "check_if_best": False,
4383
- }
4384
-
4385
- # self.bagged_mode = True # TODO: Add options for bagging
4386
- models = self.train_multi_levels(
4387
- X=X,
4388
- y=y,
4389
- X_val=X_val,
4390
- y_val=y_val,
4391
- hyperparameters=hyperparameters,
4392
- time_limit=time_limit, # FIXME: Also limit augmentation time
4393
- name_suffix=name_suffix,
4394
- core_kwargs=core_kwargs,
4395
- aux_kwargs=aux_kwargs,
4396
- )
4397
-
4398
- distilled_model_names = []
4399
- w_val = None
4400
- if self.weight_evaluation:
4401
- X_val, w_val = extract_column(X_val, self.sample_weight)
4402
- for model_name in models: # finally measure original metric on validation data and overwrite stored val_scores
4403
- model_score = self.score(X_val, y_val_og, model=model_name, weights=w_val)
4404
- model_obj = self.load_model(model_name)
4405
- model_obj.val_score = model_score
4406
- model_obj.save() # TODO: consider omitting for sake of efficiency
4407
- self.model_graph.nodes[model_name]["val_score"] = model_score
4408
- distilled_model_names.append(model_name)
4409
- leaderboard = self.leaderboard()
4410
- logger.log(20, "Distilled model leaderboard:")
4411
- leaderboard_distilled = leaderboard[leaderboard["model"].isin(models)].reset_index(drop=True)
4412
- with pd.option_context("display.max_rows", None, "display.max_columns", None, "display.width", 1000):
4413
- logger.log(20, leaderboard_distilled)
4414
-
4415
- # reset trainer to old state before distill() was called:
4416
- self.bagged_mode = og_bagged_mode # TODO: Confirm if safe to train future models after training models in both bagged and non-bagged modes
4417
- self.verbosity = og_verbosity
4418
- return distilled_model_names
4419
-
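The core of the teacher_preds logic above is simply which targets the students see. A hedged sketch (toy helper, not part of the trainer):

    import pandas as pd

    def student_targets(teacher, X, teacher_preds: str, multiclass: bool):
        if teacher_preds == "hard":
            return pd.Series(teacher.predict(X))  # predicted class labels
        if teacher_preds == "soft":
            proba = teacher.predict_proba(X)  # predicted class probabilities
            return pd.DataFrame(proba) if multiclass else pd.Series(proba)
        raise ValueError(f"Unknown teacher_preds specified: {teacher_preds}")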
4420
- def _get_model_fit_kwargs(
4421
- self, X: pd.DataFrame, X_val: pd.DataFrame, time_limit: float, k_fold: int, fit_kwargs: dict, ens_sample_weight: list | None = None
4422
- ) -> dict:
4423
- # Returns kwargs to be passed to AbstractModel's fit function
4424
- if fit_kwargs is None:
4425
- fit_kwargs = dict()
4426
-
4427
- model_fit_kwargs = dict(time_limit=time_limit, verbosity=self.verbosity, **fit_kwargs)
4428
- if self.sample_weight is not None:
4429
- X, w_train = extract_column(X, self.sample_weight)
4430
- if w_train is not None: # may be None for ensemble
4431
- # TODO: consider moving weight normalization into AbstractModel.fit()
4432
- model_fit_kwargs["sample_weight"] = w_train.values / w_train.mean() # normalization can affect gradient algorithms like boosting
4433
- if X_val is not None:
4434
- X_val, w_val = extract_column(X_val, self.sample_weight)
4435
- if self.weight_evaluation and w_val is not None: # ignore validation sample weights unless weight_evaluation specified
4436
- model_fit_kwargs["sample_weight_val"] = w_val.values / w_val.mean()
4437
- if ens_sample_weight is not None:
4438
- model_fit_kwargs["sample_weight"] = ens_sample_weight # sample weights to use for weighted ensemble only
4439
- if self._groups is not None and "groups" not in model_fit_kwargs:
4440
- if k_fold == self.k_fold: # don't do this on refit full
4441
- model_fit_kwargs["groups"] = self._groups
4442
-
4443
- # FIXME: Sample weight handling via `extract_column` is a hack; feature_metadata must be computed here because the sample weight column could be in X upstream. Extract the sample weight column upstream instead.
4444
- if "feature_metadata" not in model_fit_kwargs:
4445
- raise AssertionError(f"Missing expected parameter 'feature_metadata'.")
4446
- return model_fit_kwargs
4447
-
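The mean-normalization of sample weights above keeps the weight scale neutral: gradient-based learners such as boosting are sensitive to the absolute magnitude of weights, and dividing by the mean preserves relative importance while keeping the average weight at 1. In numpy terms:

    import numpy as np

    w_train = np.array([1.0, 2.0, 5.0, 2.0])
    w_norm = w_train / w_train.mean()
    assert np.isclose(w_norm.mean(), 1.0)  # effective dataset size is unchanged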
4448
- def _get_bagged_model_fit_kwargs(self, k_fold: int, k_fold_start: int, k_fold_end: int, n_repeats: int, n_repeat_start: int) -> dict:
4449
- # Returns additional kwargs (aside from _get_model_fit_kwargs) to be passed to BaggedEnsembleModel's fit function
4450
- if k_fold is None:
4451
- k_fold = self.k_fold
4452
- if n_repeats is None:
4453
- n_repeats = self.n_repeats
4454
- return dict(
4455
- k_fold=k_fold, k_fold_start=k_fold_start, k_fold_end=k_fold_end, n_repeats=n_repeats, n_repeat_start=n_repeat_start, compute_base_preds=False
4456
- )
4457
-
4458
- def _get_feature_prune_proxy_model(self, proxy_model_class: AbstractModel | None, level: int) -> AbstractModel:
4459
- """
4460
- Returns the proxy model to be used for feature pruning - the base learner that has the highest validation score in a particular stack layer.
4461
- Ties are broken by fit time (the fastest-fitting of the top-scoring models is selected). If proxy_model_class is not None, take the best base learner belonging to proxy_model_class.
4462
- proxy_model_class is an AbstractModel class (ex. LGBModel).
4463
- """
4464
- proxy_model = None
4465
- if isinstance(proxy_model_class, str):
4466
- raise AssertionError(f"proxy_model_class must be a subclass of AbstractModel. Was instead a string: {proxy_model_class}")
4467
- banned_models = [GreedyWeightedEnsembleModel, SimpleWeightedEnsembleModel]
4468
- assert proxy_model_class not in banned_models, "WeightedEnsemble models cannot be feature pruning proxy models."
4469
-
4470
- leaderboard = self.leaderboard()
4471
- banned_names = []
4472
- candidate_model_rows = leaderboard[(~leaderboard["score_val"].isna()) & (leaderboard["stack_level"] == level)]
4473
- candidate_models_type_inner = self.get_models_attribute_dict(attribute="type_inner", models=candidate_model_rows["model"])
4474
- for model_name, type_inner in candidate_models_type_inner.copy().items():
4475
- if type_inner in banned_models:
4476
- banned_names.append(model_name)
4477
- candidate_models_type_inner.pop(model_name, None)
4478
- banned_names = set(banned_names)
4479
- candidate_model_rows = candidate_model_rows[~candidate_model_rows["model"].isin(banned_names)]
4480
- if proxy_model_class is not None:
4481
- candidate_model_names = [model_name for model_name, model_class in candidate_models_type_inner.items() if model_class == proxy_model_class]
4482
- candidate_model_rows = candidate_model_rows[candidate_model_rows["model"].isin(candidate_model_names)]
4483
- if len(candidate_model_rows) == 0:
4484
- if proxy_model_class is None:
4485
- logger.warning(f"No models from level {level} have been successfully fit. Skipping feature pruning.")
4486
- else:
4487
- logger.warning(f"No models of type {proxy_model_class} have finished training in level {level}. Skipping feature pruning.")
4488
- return proxy_model
4489
- best_candidate_model_rows = candidate_model_rows.loc[candidate_model_rows["score_val"] == candidate_model_rows["score_val"].max()]
4490
- return self.load_model(best_candidate_model_rows.loc[best_candidate_model_rows["fit_time"].idxmin()]["model"])
4491
-
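The selection at the end reduces to two pandas operations: filter to the rows with the maximum validation score, then take the fastest-fitting of those. With toy values:

    import pandas as pd

    rows = pd.DataFrame({
        "model": ["A", "B", "C"],
        "score_val": [0.9, 0.9, 0.8],
        "fit_time": [30.0, 12.0, 5.0],
    })
    best = rows.loc[rows["score_val"] == rows["score_val"].max()]
    assert best.loc[best["fit_time"].idxmin()]["model"] == "B"  # fastest top scorer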
4492
- def calibrate_model(self, model_name: str | None = None, lr: float = 0.1, max_iter: int = 200, init_val: float = 1.0):
4493
- """
4494
- Applies temperature scaling to a model.
4495
- Applies the inverse softmax to predicted probabilities, then tunes a temperature scalar
4496
- on validation data to minimize the negative log likelihood.
4497
- The inverse-softmax logits are divided by the temperature scalar
4498
- and then softmaxed to return calibrated predicted probabilities.
4499
-
4500
- Parameters
4501
- ----------
4502
- model_name: str, default = None
4503
- Name of the model to tune temperature scaling on.
4504
- If None, will tune the best model only. The best model is chosen by validation score.
4505
- lr: float, default = 0.1
4506
- The learning rate for the temperature scaling algorithm.
4507
- max_iter: int, default = 200
4508
- The number of iterations the optimizer should take to
4509
- tune the temperature scalar.
4510
- init_val: float, default = 1.0
4511
- The initial value for the temperature scalar term.
4512
- """
4513
- # TODO: Note that temperature scaling is known to worsen calibration in the face of shifted test data.
4514
- try:
4515
- # FIXME: Avoid depending on torch for temp scaling
4516
- try_import_torch()
4517
- except ImportError:
4518
- logger.log(30, "Warning: Torch is not installed, skipping calibration step...")
4519
- return
4520
-
4521
- if model_name is None:
4522
- if self.has_val:
4523
- can_infer = True
4524
- else:
4525
- can_infer = None
4526
- if self.model_best is not None:
4527
- models = self.get_model_names(can_infer=can_infer)
4528
- if self.model_best in models:
4529
- model_name = self.model_best
4530
- if model_name is None:
4531
- model_name = self.get_model_best(can_infer=can_infer)
4532
-
4533
- model_refit_map = self.model_refit_map()
4534
- model_name_og = model_name
4535
- for m, m_full in model_refit_map.items():
4536
- if m_full == model_name:
4537
- model_name_og = m
4538
- break
4539
- if self.has_val:
4540
- X_val = self.load_X_val()
4541
- y_val_probs = self.predict_proba(X_val, model_name_og)
4542
- y_val = self.load_y_val().to_numpy()
4543
- else: # bagged mode
4544
- y_val_probs = self.get_model_oof(model_name_og)
4545
- y_val = self.load_y().to_numpy()
4546
-
4547
- y_val_probs_og = y_val_probs
4548
- if self.problem_type == BINARY:
4549
- # Convert one-dimensional array to be in the form of a 2-class multiclass predict_proba output
4550
- y_val_probs = LabelCleanerMulticlassToBinary.convert_binary_proba_to_multiclass_proba(y_val_probs)
4551
-
4552
- model = self.load_model(model_name=model_name)
4553
- if self.problem_type == QUANTILE:
4554
- logger.log(15, f"Conformity scores being computed to calibrate model: {model_name}")
4555
- conformalize = compute_conformity_score(y_val_pred=y_val_probs, y_val=y_val, quantile_levels=self.quantile_levels)
4556
- model.conformalize = conformalize
4557
- model.save()
4558
- else:
4559
- logger.log(15, f"Temperature scaling term being tuned for model: {model_name}")
4560
- temp_scalar = tune_temperature_scaling(y_val_probs=y_val_probs, y_val=y_val, init_val=init_val, max_iter=max_iter, lr=lr)
4561
- if temp_scalar is None:
4562
- logger.log(
4563
- 15,
4564
- f"Warning: Infinity found during calibration, skipping calibration on {model.name}! "
4565
- f"This can occur when the model is absolutely certain of a validation prediction (1.0 pred_proba).",
4566
- )
4567
- elif temp_scalar <= 0:
4568
- logger.log(
4569
- 30,
4570
- f"Warning: Temperature scaling found optimal at a negative value ({temp_scalar}). Disabling temperature scaling to avoid overfitting.",
4571
- )
4572
- else:
4573
- # Check that scaling improves performance for the target metric
4574
- score_without_temp = self.score_with_y_pred_proba(y=y_val, y_pred_proba=y_val_probs_og, weights=None)
4575
- scaled_y_val_probs = apply_temperature_scaling(y_val_probs, temp_scalar, problem_type=self.problem_type, transform_binary_proba=False)
4576
- score_with_temp = self.score_with_y_pred_proba(y=y_val, y_pred_proba=scaled_y_val_probs, weights=None)
4577
-
4578
- if score_with_temp > score_without_temp:
4579
- logger.log(15, f"Temperature term found is: {temp_scalar}")
4580
- model.params_aux["temperature_scalar"] = temp_scalar
4581
- model.save()
4582
- else:
4583
- logger.log(15, "Temperature did not improve performance, skipping calibration.")
4584
-
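For intuition, temperature scaling divides the (inverse-softmax) logits by a scalar T and re-normalizes; T > 1 softens overconfident probabilities. A minimal numpy sketch, not AutoGluon's implementation:

    import numpy as np

    def apply_temperature(proba: np.ndarray, temperature: float) -> np.ndarray:
        logits = np.log(np.clip(proba, 1e-12, 1.0)) / temperature
        exp = np.exp(logits - logits.max(axis=1, keepdims=True))  # stable softmax
        return exp / exp.sum(axis=1, keepdims=True)

    p = np.array([[0.9, 0.1]])
    apply_temperature(p, temperature=2.0)  # -> [[0.75, 0.25]], softer than the input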
4585
- def calibrate_decision_threshold(
4586
- self,
4587
- X: pd.DataFrame | None = None,
4588
- y: np.ndarray | None = None,
4589
- metric: str | Scorer | None = None,
4590
- model: str = "best",
4591
- weights=None,
4592
- decision_thresholds: int | list[float] = 25,
4593
- secondary_decision_thresholds: int | None = 19,
4594
- verbose: bool = True,
4595
- **kwargs,
4596
- ) -> float:
4597
- # TODO: Docstring
4598
- assert self.problem_type == BINARY, f'calibrate_decision_threshold is only available for `problem_type="{BINARY}"`'
4599
-
4600
- if metric is None:
4601
- metric = self.eval_metric
4602
- elif isinstance(metric, str):
4603
- metric = get_metric(metric, self.problem_type, "eval_metric")
4604
-
4605
- if model == "best":
4606
- model = self.get_model_best()
4607
-
4608
- if y is None:
4609
- # If model is refit_full, use its parent to avoid over-fitting
4610
- model_parent = self.get_refit_full_parent(model=model)
4611
- if not self.model_exists(model_parent):
4612
- raise AssertionError(
4613
- f"Unable to calibrate the decision threshold on the internal data because the "
4614
- f'model "{model}" is a refit_full model trained on all of the internal data, '
4615
- f'whose parent model "{model_parent}" does not exist or was deleted.\n'
4616
- f"It may have been deleted due to `predictor.fit(..., keep_only_best=True)`. "
4617
- f"Ensure `keep_only_best=False` to be able to calibrate refit_full models."
4618
- )
4619
- model = model_parent
4620
-
4621
- # TODO: Add helpful logging when data is not available, for example post optimize for deployment
4622
- if self.has_val:
4623
- # Use validation data
4624
- X = self.load_X_val()
4625
- if self.weight_evaluation:
4626
- X, weights = extract_column(X=X, col_name=self.sample_weight)
4627
- y: np.ndarray = self.load_y_val()
4628
- y_pred_proba = self.predict_proba(X=X, model=model)
4629
- else:
4630
- # Use out-of-fold data
4631
- if self.weight_evaluation:
4632
- X = self.load_X()
4633
- X, weights = extract_column(X=X, col_name=self.sample_weight)
4634
- y: np.ndarray = self.load_y()
4635
- y_pred_proba = self.get_model_oof(model=model)
4636
- else:
4637
- y_pred_proba = self.predict_proba(X=X, model=model)
4638
-
4639
- if not metric.needs_pred:
4640
- logger.warning(
4641
- f'WARNING: The provided metric "{metric.name}" does not use class predictions for scoring, '
4642
- f"and thus is invalid for decision threshold calibration. "
4643
- f"Falling back to `decision_threshold=0.5`."
4644
- )
4645
- return 0.5
4646
-
4647
- return calibrate_decision_threshold(
4648
- y=y,
4649
- y_pred_proba=y_pred_proba,
4650
- metric=lambda y, y_pred: self.score_with_y_pred(y=y, y_pred=y_pred, weights=weights, metric=metric),
4651
- decision_thresholds=decision_thresholds,
4652
- secondary_decision_thresholds=secondary_decision_thresholds,
4653
- metric_name=metric.name,
4654
- verbose=verbose,
4655
- **kwargs,
4656
- )
4657
-
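Conceptually, threshold calibration is a one-dimensional sweep: score each candidate threshold against the target metric and keep the argmax. A hedged standalone sketch using F1 as the metric (the method above delegates to calibrate_decision_threshold with the trainer's own scorer):

    import numpy as np
    from sklearn.metrics import f1_score

    def best_threshold(y_true, y_pred_proba, num_thresholds: int = 25) -> float:
        thresholds = np.linspace(0.0, 1.0, num_thresholds)
        scores = [
            f1_score(y_true, (y_pred_proba >= t).astype(int), zero_division=0)
            for t in thresholds
        ]
        return float(thresholds[int(np.argmax(scores))])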
4658
- @staticmethod
4659
- def _validate_num_classes(num_classes: int | None, problem_type: str):
4660
- if problem_type == BINARY:
4661
- assert num_classes is not None and num_classes == 2, f"num_classes must be 2 when problem_type='{problem_type}' (num_classes={num_classes})"
4662
- elif problem_type in [MULTICLASS, SOFTCLASS]:
4663
- assert num_classes is not None and num_classes >= 2, f"num_classes must be >=2 when problem_type='{problem_type}' (num_classes={num_classes})"
4664
- elif problem_type in [REGRESSION, QUANTILE]:
4665
- assert num_classes is None, f"num_classes must be None when problem_type='{problem_type}' (num_classes={num_classes})"
4666
- else:
4667
- raise AssertionError(f"Unknown problem_type: '{problem_type}'. Valid problem types: {[BINARY, MULTICLASS, REGRESSION, SOFTCLASS, QUANTILE]}")
4668
-
4669
- @staticmethod
4670
- def _validate_quantile_levels(quantile_levels: list[float] | np.ndarray | None, problem_type: str):
4671
- if problem_type == QUANTILE:
4672
- assert quantile_levels is not None, f"quantile_levels must not be None when problem_type='{problem_type}' (quantile_levels={quantile_levels})"
4673
- assert isinstance(quantile_levels, (list, np.ndarray)), f"quantile_levels must be a list or np.ndarray (quantile_levels={quantile_levels})"
4674
- assert len(quantile_levels) > 0, f"quantile_levels must not be an empty list (quantile_levels={quantile_levels})"
4675
- else:
4676
- assert quantile_levels is None, f"quantile_levels must be None when problem_type='{problem_type}' (quantile_levels={quantile_levels})"
4677
-
4678
-
4679
- def _detached_train_multi_fold(
4680
- *,
4681
- _self: AbstractTabularTrainer,
4682
- model: str | AbstractModel,
4683
- X: pd.DataFrame,
4684
- y: pd.Series,
4685
- time_split: bool,
4686
- time_start: float,
4687
- time_limit: float | None,
4688
- time_limit_model_split: float | None,
4689
- hyperparameter_tune_kwargs: dict,
4690
- is_ray_worker: bool = False,
4691
- kwargs: dict,
4692
- ) -> list[str]:
4693
- """Dedicated class-detached function to train a single model on multiple folds."""
4694
- if isinstance(model, str):
4695
- model = _self.load_model(model)
4696
- elif _self.low_memory:
4697
- model = copy.deepcopy(model)
4698
- if hyperparameter_tune_kwargs is not None and isinstance(hyperparameter_tune_kwargs, dict):
4699
- hyperparameter_tune_kwargs_model = hyperparameter_tune_kwargs.get(model.name, None)
4700
- else:
4701
- hyperparameter_tune_kwargs_model = None
4702
- # TODO: Only update scores when finished, only update model as part of final models if finished!
4703
- if time_split:
4704
- time_left = time_limit_model_split
4705
- else:
4706
- if time_limit is None:
4707
- time_left = None
4708
- else:
4709
- time_start_model = time.time()
4710
- time_left = time_limit - (time_start_model - time_start)
4711
-
4712
- model_name_trained_lst = _self._train_single_full(
4713
- X,
4714
- y,
4715
- model,
4716
- time_limit=time_left,
4717
- hyperparameter_tune_kwargs=hyperparameter_tune_kwargs_model,
4718
- is_ray_worker=is_ray_worker,
4719
- **kwargs
4720
- )
4721
-
4722
- if _self.low_memory:
4723
- del model
4724
-
4725
- return model_name_trained_lst
4726
-
4727
-
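The time accounting above has two modes: a fixed pre-split slice per model, or whatever remains of the shared budget. Isolated for clarity (hypothetical helper):

    import time

    def remaining_budget(time_split, time_start, time_limit, time_limit_model_split):
        if time_split:
            return time_limit_model_split  # fixed per-model slice
        if time_limit is None:
            return None  # unlimited
        return time_limit - (time.time() - time_start)  # leftover shared budget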
4728
- def _remote_train_multi_fold(
4729
- *,
4730
- _self: AbstractTabularTrainer,
4731
- model: str | AbstractModel,
4732
- X: pd.DataFrame,
4733
- y: pd.Series,
4734
- time_split: bool,
4735
- time_start: float,
4736
- time_limit: float | None,
4737
- time_limit_model_split: float | None,
4738
- hyperparameter_tune_kwargs: dict,
4739
- kwargs: dict,
4740
- errors: Literal["ignore", "raise"] | None = None,
4741
- ) -> tuple[str, str | None, str | None, Exception | None, dict | None]:
4742
- reset_logger_for_remote_call(verbosity=_self.verbosity)
4743
-
4744
- if errors is not None:
4745
- kwargs["errors"] = errors
4746
-
4747
- exception = None
4748
- try:
4749
- model_name_list = _detached_train_multi_fold(
4750
- _self=_self,
4751
- model=model,
4752
- X=X,
4753
- y=y,
4754
- time_start=time_start,
4755
- time_split=time_split,
4756
- time_limit=time_limit,
4757
- time_limit_model_split=time_limit_model_split,
4758
- hyperparameter_tune_kwargs=hyperparameter_tune_kwargs,
4759
- is_ray_worker=True,
4760
- kwargs=kwargs,
4761
- )
4762
- except Exception as exc:
4763
- model_name_list = []
4764
- if errors is not None and errors == "raise":
4765
- # If training fails and exception is returned, collect the exception information and return
4766
- exception = exc # required to use in outer scope
4767
- else:
4768
- raise exc
4769
-
4770
- if not model_name_list:
4771
- model_name = model if isinstance(model, str) else model.name
4772
- # Get model_failure metadata if it exists
4773
- model_failure_info = None
4774
- if model_name in _self._models_failed_to_train_errors:
4775
- model_failure_info = _self._models_failed_to_train_errors[model_name]
4776
- return model_name, None, None, exception, model_failure_info
4777
-
4782
- model_name = model_name_list[0]
4783
- return model_name, _self.get_model_attribute(model=model_name, attribute="path"), _self.get_model_attribute(model=model_name, attribute="type"), None, None
4784
-
4785
-
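Functions like the one above are shaped to run as Ray tasks: everything they need arrives as arguments, and results come back as a plain tuple. A hedged sketch of that dispatch pattern (hypothetical task, not the trainer's actual scheduling code):

    import ray

    @ray.remote
    def fit_one(model_name: str) -> str:
        return model_name  # placeholder for a real remote fit

    ray.init(ignore_reinit_error=True)
    futures = [fit_one.remote(name) for name in ["LightGBM", "CatBoost"]]
    results = ray.get(futures)  # gather results as the tasks complete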
4786
- def _detached_refit_single_full(
4787
- *,
4788
- _self: AbstractTabularTrainer,
4789
- model: str,
4790
- X: pd.DataFrame,
4791
- y: pd.Series,
4792
- X_val: pd.DataFrame,
4793
- y_val: pd.Series,
4794
- X_unlabeled: pd.DataFrame,
4795
- level: int,
4796
- kwargs: dict,
4797
- fit_strategy: Literal["sequential", "parallel"] = "sequential",
4798
- ) -> tuple[str, list[str]]:
4799
- # TODO: loading the model is the reason we must allocate GPU resources for this job in cases where models require a GPU when loaded from disk
4800
- model = _self.load_model(model)
4801
- model_name = model.name
4802
- reuse_first_fold = False
4803
-
4804
- if isinstance(model, BaggedEnsembleModel):
4805
- # Reuse if model is already _FULL and no X_val
4806
- if X_val is None:
4807
- reuse_first_fold = not model._bagged_mode
4808
-
4809
- if not reuse_first_fold:
4810
- if isinstance(model, BaggedEnsembleModel):
4811
- can_refit_full = model._get_tags_child().get("can_refit_full", False)
4812
- else:
4813
- can_refit_full = model._get_tags().get("can_refit_full", False)
4814
- reuse_first_fold = not can_refit_full
4815
-
4816
- if not reuse_first_fold:
4817
- model_full = model.convert_to_refit_full_template()
4818
- # Mitigates the situation where bagged models barely had enough memory and refit requires more. The worst case still results in OOM, but this lowers the chance of failure.
4819
- model_full._user_params_aux["max_memory_usage_ratio"] = model.params_aux["max_memory_usage_ratio"] * 1.15
4820
- # Re-set user specified training resources.
4821
- # FIXME: this is technically also a bug for non-distributed mode, but there it is good to use more/all resources per refit.
4822
- # FIXME: Unsure if it is better to do model.fit_num_cpus or model.fit_num_cpus_child,
4823
- # (Nick): I'm currently leaning towards model.fit_num_cpus, it is also less memory intensive
4824
- # Better to not specify this for sequential fits, since we want the models to use the optimal amount of resources,
4825
- # which could be less than the available resources (ex: LightGBM fits faster using 50% of the cores)
4826
- if fit_strategy == "parallel":
4827
- # FIXME: Why use `model.fit_num_cpus_child` when we can use the same values as was passed to `ray` for the process, just pass those values as kwargs. Eliminates chance of inconsistency.
4828
- if model.fit_num_cpus_child is not None:
4829
- model_full._user_params_aux["num_cpus"] = model.fit_num_cpus_child
4830
- if model.fit_num_gpus_child is not None:
4831
- model_full._user_params_aux["num_gpus"] = model.fit_num_gpus_child
4832
- # TODO: Do it for all models in the level at once to avoid repeated processing of data?
4833
- base_model_names = _self.get_base_model_names(model_name)
4834
- # FIXME: Logs for inference speed (1 row) are incorrect because
4835
- # parents are non-refit models in this sequence and are only corrected after logging.
4836
- # Avoiding fix at present to minimize hacks in the code.
4837
- # Return to this later when Trainer controls all stacking logic to map correct parent.
4838
- models_trained = _self.stack_new_level_core(
4839
- X=X,
4840
- y=y,
4841
- X_val=X_val,
4842
- y_val=y_val,
4843
- X_unlabeled=X_unlabeled,
4844
- models=[model_full],
4845
- base_model_names=base_model_names,
4846
- level=level,
4847
- stack_name=REFIT_FULL_NAME,
4848
- hyperparameter_tune_kwargs=None,
4849
- feature_prune=False,
4850
- k_fold=0,
4851
- n_repeats=1,
4852
- ensemble_type=type(model),
4853
- refit_full=True,
4854
- **kwargs,
4855
- )
4856
- if len(models_trained) == 0:
4857
- reuse_first_fold = True
4858
- logger.log(30, f"WARNING: Refit training failure detected for '{model_name}'... "
4859
- f"Falling back to using the first fold to avoid a downstream exception."
4860
- f"\n\tThis is likely due to an out-of-memory error or another memory-related issue. "
4861
- f"\n\tPlease create a GitHub issue if this was triggered by a non-memory-related problem.")
4862
- if not model.params.get("save_bag_folds", True):
4863
- raise AssertionError(f"Cannot avoid training failure during refit for '{model_name}' by falling back to "
4864
- f"copying the first fold because it does not exist! (save_bag_folds=False)"
4865
- f"\n\tPlease specify `save_bag_folds=True` in the `.fit` call to avoid this exception.")
4866
-
4867
- if reuse_first_fold:
4868
- # Perform fallback black-box refit logic that doesn't retrain.
4869
- model_full = model.convert_to_refit_full_via_copy()
4870
- # FIXME: validation time is not correct for the 1-batch inference time; needed to hack _is_refit=True to fix
4871
- logger.log(20, f"Fitting model: {model_full.name} | Skipping fit via cloning parent ...")
4872
- _self._add_model(model_full, stack_name=REFIT_FULL_NAME, level=level, _is_refit=True)
4873
- _self.save_model(model_full)
4874
- models_trained = [model_full.name]
4875
-
4876
- return model_name, models_trained
4877
-
4878
-
4879
- def _remote_refit_single_full(
4880
- *,
4881
- _self: AbstractTabularTrainer,
4882
- model: str,
4883
- X: pd.DataFrame,
4884
- y: pd.Series,
4885
- X_val: pd.DataFrame,
4886
- y_val: pd.Series,
4887
- X_unlabeled: pd.DataFrame,
4888
- level: int,
4889
- kwargs: dict,
4890
- fit_strategy: Literal["sequential", "parallel"],
4891
- ) -> tuple[str, str, str, str]:
4892
- reset_logger_for_remote_call(verbosity=_self.verbosity)
4893
-
4894
- model_name, models_trained = _detached_refit_single_full(
4895
- _self=_self,
4896
- model=model,
4897
- X=X,
4898
- y=y,
4899
- X_val=X_val,
4900
- y_val=y_val,
4901
- X_unlabeled=X_unlabeled,
4902
- level=level,
4903
- kwargs=kwargs,
4904
- fit_strategy=fit_strategy,
4905
- )
4906
-
4907
- # We always just refit one model per call, so this must be the case.
4908
- assert len(models_trained) == 1
4909
- refitted_model_name = models_trained[0]
4910
- return model_name, refitted_model_name, _self.get_model_attribute(model=refitted_model_name, attribute="path"), _self.get_model_attribute(model=refitted_model_name, attribute="type")