autogluon.tabular 1.5.1b20260105__py3-none-any.whl → 1.5.1b20260116__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of autogluon.tabular has been flagged as potentially problematic.

Files changed (135)
  1. autogluon/tabular/__init__.py +1 -0
  2. autogluon/tabular/configs/config_helper.py +18 -6
  3. autogluon/tabular/configs/feature_generator_presets.py +3 -1
  4. autogluon/tabular/configs/hyperparameter_configs.py +42 -9
  5. autogluon/tabular/configs/presets_configs.py +38 -14
  6. autogluon/tabular/configs/zeroshot/zeroshot_portfolio_2023.py +84 -14
  7. autogluon/tabular/configs/zeroshot/zeroshot_portfolio_2025.py +48 -48
  8. autogluon/tabular/configs/zeroshot/zeroshot_portfolio_cpu_2025_12_18.py +774 -1
  9. autogluon/tabular/configs/zeroshot/zeroshot_portfolio_gpu_2025_12_18.py +421 -1
  10. autogluon/tabular/experimental/_scikit_mixin.py +6 -2
  11. autogluon/tabular/experimental/_tabular_classifier.py +3 -1
  12. autogluon/tabular/experimental/_tabular_regressor.py +3 -1
  13. autogluon/tabular/experimental/plot_leaderboard.py +73 -19
  14. autogluon/tabular/learner/abstract_learner.py +160 -42
  15. autogluon/tabular/learner/default_learner.py +78 -22
  16. autogluon/tabular/models/__init__.py +2 -2
  17. autogluon/tabular/models/_utils/rapids_utils.py +3 -1
  18. autogluon/tabular/models/abstract/abstract_torch_model.py +2 -0
  19. autogluon/tabular/models/automm/automm_model.py +12 -3
  20. autogluon/tabular/models/automm/ft_transformer.py +5 -1
  21. autogluon/tabular/models/catboost/callbacks.py +2 -2
  22. autogluon/tabular/models/catboost/catboost_model.py +93 -29
  23. autogluon/tabular/models/catboost/catboost_softclass_utils.py +4 -1
  24. autogluon/tabular/models/catboost/catboost_utils.py +3 -1
  25. autogluon/tabular/models/ebm/ebm_model.py +8 -13
  26. autogluon/tabular/models/ebm/hyperparameters/parameters.py +1 -0
  27. autogluon/tabular/models/ebm/hyperparameters/searchspaces.py +1 -0
  28. autogluon/tabular/models/fastainn/callbacks.py +20 -3
  29. autogluon/tabular/models/fastainn/hyperparameters/searchspaces.py +11 -1
  30. autogluon/tabular/models/fastainn/quantile_helpers.py +10 -2
  31. autogluon/tabular/models/fastainn/tabular_nn_fastai.py +65 -18
  32. autogluon/tabular/models/fasttext/fasttext_model.py +3 -1
  33. autogluon/tabular/models/image_prediction/image_predictor.py +7 -2
  34. autogluon/tabular/models/knn/knn_model.py +41 -8
  35. autogluon/tabular/models/lgb/callbacks.py +32 -9
  36. autogluon/tabular/models/lgb/hyperparameters/searchspaces.py +3 -1
  37. autogluon/tabular/models/lgb/lgb_model.py +150 -34
  38. autogluon/tabular/models/lgb/lgb_utils.py +12 -4
  39. autogluon/tabular/models/lr/hyperparameters/searchspaces.py +5 -1
  40. autogluon/tabular/models/lr/lr_model.py +40 -10
  41. autogluon/tabular/models/lr/lr_rapids_model.py +22 -13
  42. autogluon/tabular/models/mitra/_internal/__init__.py +1 -1
  43. autogluon/tabular/models/mitra/_internal/config/__init__.py +1 -1
  44. autogluon/tabular/models/mitra/_internal/config/config_pretrain.py +36 -40
  45. autogluon/tabular/models/mitra/_internal/config/config_run.py +2 -14
  46. autogluon/tabular/models/mitra/_internal/config/enums.py +27 -26
  47. autogluon/tabular/models/mitra/_internal/core/__init__.py +1 -1
  48. autogluon/tabular/models/mitra/_internal/core/callbacks.py +14 -21
  49. autogluon/tabular/models/mitra/_internal/core/get_loss.py +10 -12
  50. autogluon/tabular/models/mitra/_internal/core/get_optimizer.py +17 -32
  51. autogluon/tabular/models/mitra/_internal/core/get_scheduler.py +12 -27
  52. autogluon/tabular/models/mitra/_internal/core/prediction_metrics.py +16 -21
  53. autogluon/tabular/models/mitra/_internal/core/trainer_finetune.py +130 -111
  54. autogluon/tabular/models/mitra/_internal/data/__init__.py +1 -1
  55. autogluon/tabular/models/mitra/_internal/data/collator.py +30 -26
  56. autogluon/tabular/models/mitra/_internal/data/dataset_finetune.py +18 -26
  57. autogluon/tabular/models/mitra/_internal/data/dataset_split.py +10 -7
  58. autogluon/tabular/models/mitra/_internal/data/preprocessor.py +70 -100
  59. autogluon/tabular/models/mitra/_internal/models/__init__.py +1 -1
  60. autogluon/tabular/models/mitra/_internal/models/base.py +7 -10
  61. autogluon/tabular/models/mitra/_internal/models/embedding.py +46 -56
  62. autogluon/tabular/models/mitra/_internal/models/tab2d.py +140 -120
  63. autogluon/tabular/models/mitra/_internal/utils/__init__.py +1 -1
  64. autogluon/tabular/models/mitra/_internal/utils/set_seed.py +3 -1
  65. autogluon/tabular/models/mitra/mitra_model.py +16 -11
  66. autogluon/tabular/models/mitra/sklearn_interface.py +178 -162
  67. autogluon/tabular/models/realmlp/realmlp_model.py +28 -15
  68. autogluon/tabular/models/rf/compilers/onnx.py +1 -1
  69. autogluon/tabular/models/rf/rf_model.py +45 -12
  70. autogluon/tabular/models/rf/rf_quantile.py +4 -2
  71. autogluon/tabular/models/tabdpt/tabdpt_model.py +8 -17
  72. autogluon/tabular/models/tabicl/tabicl_model.py +8 -1
  73. autogluon/tabular/models/tabm/_tabm_internal.py +6 -4
  74. autogluon/tabular/models/tabm/rtdl_num_embeddings.py +80 -127
  75. autogluon/tabular/models/tabm/tabm_model.py +8 -4
  76. autogluon/tabular/models/tabm/tabm_reference.py +53 -85
  77. autogluon/tabular/models/tabpfnmix/_internal/core/callbacks.py +7 -16
  78. autogluon/tabular/models/tabpfnmix/_internal/core/collator.py +16 -24
  79. autogluon/tabular/models/tabpfnmix/_internal/core/dataset_split.py +5 -7
  80. autogluon/tabular/models/tabpfnmix/_internal/core/enums.py +0 -2
  81. autogluon/tabular/models/tabpfnmix/_internal/core/get_loss.py +0 -1
  82. autogluon/tabular/models/tabpfnmix/_internal/core/get_optimizer.py +7 -18
  83. autogluon/tabular/models/tabpfnmix/_internal/core/get_scheduler.py +3 -14
  84. autogluon/tabular/models/tabpfnmix/_internal/core/trainer_finetune.py +79 -64
  85. autogluon/tabular/models/tabpfnmix/_internal/core/y_transformer.py +3 -5
  86. autogluon/tabular/models/tabpfnmix/_internal/data/dataset_finetune.py +17 -30
  87. autogluon/tabular/models/tabpfnmix/_internal/data/preprocessor.py +15 -35
  88. autogluon/tabular/models/tabpfnmix/_internal/models/foundation/embedding.py +21 -38
  89. autogluon/tabular/models/tabpfnmix/_internal/models/foundation/foundation_transformer.py +33 -51
  90. autogluon/tabular/models/tabpfnmix/_internal/results/prediction_metrics.py +4 -4
  91. autogluon/tabular/models/tabpfnmix/_internal/tabpfnmix_classifier.py +32 -12
  92. autogluon/tabular/models/tabpfnmix/_internal/tabpfnmix_regressor.py +32 -13
  93. autogluon/tabular/models/tabpfnmix/tabpfnmix_model.py +55 -19
  94. autogluon/tabular/models/tabpfnv2/tabpfnv2_5_model.py +21 -48
  95. autogluon/tabular/models/tabprep/prep_mixin.py +34 -26
  96. autogluon/tabular/models/tabular_nn/compilers/onnx.py +36 -8
  97. autogluon/tabular/models/tabular_nn/torch/tabular_nn_torch.py +130 -36
  98. autogluon/tabular/models/tabular_nn/torch/tabular_torch_dataset.py +8 -4
  99. autogluon/tabular/models/tabular_nn/torch/torch_network_modules.py +26 -5
  100. autogluon/tabular/models/tabular_nn/utils/categorical_encoders.py +41 -24
  101. autogluon/tabular/models/tabular_nn/utils/data_preprocessor.py +33 -8
  102. autogluon/tabular/models/tabular_nn/utils/nn_architecture_utils.py +21 -6
  103. autogluon/tabular/models/xgboost/callbacks.py +9 -3
  104. autogluon/tabular/models/xgboost/xgboost_model.py +59 -11
  105. autogluon/tabular/models/xt/xt_model.py +1 -0
  106. autogluon/tabular/predictor/interpretable_predictor.py +3 -1
  107. autogluon/tabular/predictor/predictor.py +409 -128
  108. autogluon/tabular/registry/__init__.py +1 -1
  109. autogluon/tabular/registry/_ag_model_registry.py +4 -5
  110. autogluon/tabular/registry/_model_registry.py +1 -0
  111. autogluon/tabular/testing/fit_helper.py +55 -15
  112. autogluon/tabular/testing/generate_datasets.py +1 -1
  113. autogluon/tabular/testing/model_fit_helper.py +10 -4
  114. autogluon/tabular/trainer/abstract_trainer.py +644 -230
  115. autogluon/tabular/trainer/auto_trainer.py +19 -8
  116. autogluon/tabular/trainer/model_presets/presets.py +33 -9
  117. autogluon/tabular/trainer/model_presets/presets_distill.py +16 -2
  118. autogluon/tabular/version.py +1 -1
  119. {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260116.dist-info}/METADATA +26 -26
  120. {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260116.dist-info}/RECORD +127 -135
  121. autogluon/tabular/models/tabpfnv2/rfpfn/__init__.py +0 -20
  122. autogluon/tabular/models/tabpfnv2/rfpfn/configs.py +0 -40
  123. autogluon/tabular/models/tabpfnv2/rfpfn/scoring_utils.py +0 -201
  124. autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_based_decision_tree_tabpfn.py +0 -1464
  125. autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_based_random_forest_tabpfn.py +0 -747
  126. autogluon/tabular/models/tabpfnv2/rfpfn/sklearn_compat.py +0 -863
  127. autogluon/tabular/models/tabpfnv2/rfpfn/utils.py +0 -106
  128. autogluon/tabular/models/tabpfnv2/tabpfnv2_model.py +0 -466
  129. /autogluon.tabular-1.5.1b20260105-py3.11-nspkg.pth → /autogluon.tabular-1.5.1b20260116-py3.11-nspkg.pth +0 -0
  130. {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260116.dist-info}/WHEEL +0 -0
  131. {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260116.dist-info}/licenses/LICENSE +0 -0
  132. {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260116.dist-info}/licenses/NOTICE +0 -0
  133. {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260116.dist-info}/namespace_packages.txt +0 -0
  134. {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260116.dist-info}/top_level.txt +0 -0
  135. {autogluon_tabular-1.5.1b20260105.dist-info → autogluon_tabular-1.5.1b20260116.dist-info}/zip-safe +0 -0
@@ -95,14 +95,17 @@ class NNFastAiTabularModel(AbstractModel):
     'early.stopping.min_delta': 0.0001,
     'early.stopping.patience': 10,
     """
+
     ag_key = "FASTAI"
     ag_name = "NeuralNetFastAI"
     ag_priority = 50
     # Increase priority for multiclass since neural networks
     # scale better than trees as a function of n_classes.
-    ag_priority_by_problem_type = MappingProxyType({
-        MULTICLASS: 95,
-    })
+    ag_priority_by_problem_type = MappingProxyType(
+        {
+            MULTICLASS: 95,
+        }
+    )
     seed_name = "random_seed"

     model_internals_file_name = "model-internals.pkl"
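Note: apart from the line wrapping, the substantive piece in this hunk is the read-only priority override keyed by problem type. A minimal standalone sketch of that pattern, using illustrative constants rather than AutoGluon's internals:

    from types import MappingProxyType

    MULTICLASS = "multiclass"  # assumed problem-type constant for this sketch

    ag_priority = 50
    ag_priority_by_problem_type = MappingProxyType({MULTICLASS: 95})

    def effective_priority(problem_type):
        # Fall back to the default priority when no override is registered.
        return ag_priority_by_problem_type.get(problem_type, ag_priority)

    print(effective_priority("multiclass"))  # 95
    print(effective_priority("regression"))  # 50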
@@ -136,8 +139,15 @@ class NNFastAiTabularModel(AbstractModel):

        if self.problem_type in [REGRESSION, QUANTILE] and self.y_scaler is not None:
            y_norm = pd.Series(self.y_scaler.fit_transform(y.values.reshape(-1, 1)).reshape(-1))
-            y_val_norm = pd.Series(self.y_scaler.transform(y_val.values.reshape(-1, 1)).reshape(-1)) if y_val is not None else None
-            logger.log(0, f"Training with scaled targets: {self.y_scaler} - !!! NN training metric will be different from the final results !!!")
+            y_val_norm = (
+                pd.Series(self.y_scaler.transform(y_val.values.reshape(-1, 1)).reshape(-1))
+                if y_val is not None
+                else None
+            )
+            logger.log(
+                0,
+                f"Training with scaled targets: {self.y_scaler} - !!! NN training metric will be different from the final results !!!",
+            )
        else:
            y_norm = y
            y_val_norm = y_val
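Note: the hunk above only rewraps the target-scaling branch. A minimal sketch of the underlying idea (scale regression/quantile targets for training, invert for reporting), assuming a scikit-learn StandardScaler as the y_scaler:

    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    y = pd.Series([10.0, 20.0, 30.0, 40.0])
    y_scaler = StandardScaler()

    # Fit on training targets; reuse the same transform for validation targets.
    y_norm = pd.Series(y_scaler.fit_transform(y.values.reshape(-1, 1)).reshape(-1))

    # Predictions made in the scaled space are inverted before computing final metrics,
    # which is why the NN training metric differs from the reported results.
    pred_scaled = y_norm.values  # stand-in for model predictions
    pred = y_scaler.inverse_transform(pred_scaled.reshape(-1, 1)).reshape(-1)
    print(np.allclose(pred, y.values))  # True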
@@ -170,14 +180,20 @@ class NNFastAiTabularModel(AbstractModel):
            unique_vals = X[self.cont_columns].nunique()
            self.cont_columns = [c for c in self.cont_columns if unique_vals[c] > 1]
            if self.cont_columns:
-                self._cont_normalization = (np.array(X[self.cont_columns].mean()), np.array(X[self.cont_columns].std()))
+                self._cont_normalization = (
+                    np.array(X[self.cont_columns].mean()),
+                    np.array(X[self.cont_columns].std()),
+                )

        num_cat_cols_og = len(self.cat_columns)
        if self.cat_columns:
            try:
                X_stats = X[self.cat_columns].describe(include="all").T.reset_index()
                cat_cols_to_drop = list(
-                    X_stats[(X_stats["unique"] > self.params.get("max_unique_categorical_values", 10000)) | (X_stats["unique"].isna())]["index"].values
+                    X_stats[
+                        (X_stats["unique"] > self.params.get("max_unique_categorical_values", 10000))
+                        | (X_stats["unique"].isna())
+                    ]["index"].values
                )
            except:
                cat_cols_to_drop = []
@@ -187,7 +203,9 @@ class NNFastAiTabularModel(AbstractModel):
            num_cat_cols_use = len(self.cat_columns)
            logger.log(15, f"Using {num_cat_cols_use}/{num_cat_cols_og} categorical features")

-        nullable_numeric_features = self._feature_metadata.get_features(valid_raw_types=[R_FLOAT, R_DATETIME], invalid_special_types=[S_TEXT_SPECIAL])
+        nullable_numeric_features = self._feature_metadata.get_features(
+            valid_raw_types=[R_FLOAT, R_DATETIME], invalid_special_types=[S_TEXT_SPECIAL]
+        )
        self.columns_fills = dict()
        self._columns_fills_names = nullable_numeric_features
        for c in self._columns_fills_names:  # No need to do this for int features, int can't have null
@@ -227,7 +245,9 @@ class NNFastAiTabularModel(AbstractModel):
            df = df.copy()
        return df

-    def _fit(self, X, y, X_val=None, y_val=None, time_limit=None, num_cpus=None, num_gpus=0, sample_weight=None, **kwargs):
+    def _fit(
+        self, X, y, X_val=None, y_val=None, time_limit=None, num_cpus=None, num_gpus=0, sample_weight=None, **kwargs
+    ):
        try_import_fastai()
        import torch
        from fastai import torch_core
@@ -240,7 +260,10 @@ class NNFastAiTabularModel(AbstractModel):
            torch.set_num_threads(num_cpus)
        start_time = time.time()
        if sample_weight is not None:  # TODO: support
-            logger.log(15, "sample_weight not yet supported for NNFastAiTabularModel, this model will ignore them in training.")
+            logger.log(
+                15,
+                "sample_weight not yet supported for NNFastAiTabularModel, this model will ignore them in training.",
+            )

        params = self._get_model_params()
        self._num_cpus_infer = params.pop("_num_cpus_infer", 1)
@@ -341,13 +364,19 @@ class NNFastAiTabularModel(AbstractModel):

        fname = "model"
        save_callback = AgSaveModelCallback(
-            monitor=objective_func_name_to_monitor, comp=objective_optim_mode, fname=fname, best_epoch_stop=best_epoch_stop, with_opt=True
+            monitor=objective_func_name_to_monitor,
+            comp=objective_optim_mode,
+            fname=fname,
+            best_epoch_stop=best_epoch_stop,
+            with_opt=True,
        )

        if time_limit is not None:
            time_elapsed = time.time() - start_time
            time_left = time_limit - time_elapsed
-            if time_left <= time_limit * 0.7:  # if 30% of time was spent preprocessing, likely not enough time to train model
+            if (
+                time_left <= time_limit * 0.7
+            ):  # if 30% of time was spent preprocessing, likely not enough time to train model
                raise TimeLimitExceeded
        else:
            time_left = None
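Note: the guard reformatted above aborts training when preprocessing has already consumed 30% or more of the model's time budget. A standalone sketch of the same check, with a local TimeLimitExceeded stand-in for AutoGluon's exception:

    import time

    class TimeLimitExceeded(Exception):
        pass

    def remaining_budget(start_time, time_limit):
        if time_limit is None:
            return None
        time_left = time_limit - (time.time() - start_time)
        if time_left <= time_limit * 0.7:  # 30% or more already spent before training
            raise TimeLimitExceeded
        return time_left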
@@ -371,7 +400,12 @@ class NNFastAiTabularModel(AbstractModel):
        self.model.path = Path(temp_dir)

        len_val = len(X_val) if X_val is not None else 0
-        epochs = self._get_epochs_number(samples_num=len(X) + len_val, epochs=params["epochs"], batch_size=batch_size, time_left=time_left)
+        epochs = self._get_epochs_number(
+            samples_num=len(X) + len_val,
+            epochs=params["epochs"],
+            batch_size=batch_size,
+            time_left=time_left,
+        )
        if epochs == 0:
            # Stop early if there is not enough time to train a full epoch
            raise TimeLimitExceeded
@@ -474,7 +508,9 @@ class NNFastAiTabularModel(AbstractModel):
            objective_func_name = "pinball_loss"
        else:
            objective_func_name = "log_loss"
-            logger.warning(f"Metric {stopping_metric.name} is not supported by this model - using {objective_func_name} instead")
+            logger.warning(
+                f"Metric {stopping_metric.name} is not supported by this model - using {objective_func_name} instead"
+            )

        nn_metric = metrics_map.get(objective_func_name, None)

@@ -482,7 +518,11 @@ class NNFastAiTabularModel(AbstractModel):

    def __get_objective_func_to_monitor(self, objective_func_name):
        monitor_obj_func = {
-            **{k: m.name if hasattr(m, "name") else m.__name__ for k, m in self.__get_metrics_map().items() if m is not None},
+            **{
+                k: m.name if hasattr(m, "name") else m.__name__
+                for k, m in self.__get_metrics_map().items()
+                if m is not None
+            },
            "log_loss": "valid_loss",
        }
        objective_func_name_to_monitor = objective_func_name
@@ -534,13 +574,14 @@ class NNFastAiTabularModel(AbstractModel):
        self.model = __model
        # Export model
        if self._load_model:
-            save_pkl.save_with_fn(self._model_internals_path, self.model, pickle_fn=lambda m, buffer: export(m, buffer), verbose=verbose)
+            save_pkl.save_with_fn(
+                self._model_internals_path, self.model, pickle_fn=lambda m, buffer: export(m, buffer), verbose=verbose
+            )
        self._load_model = None
        return path

    @classmethod
    def load(cls, path: str, reset_paths=True, verbose=True):
-
        from fastai.learner import load_learner

        model = super().load(path, reset_paths=reset_paths, verbose=verbose)
@@ -627,7 +668,13 @@ class NNFastAiTabularModel(AbstractModel):

    def _estimate_memory_usage(self, X: pd.DataFrame, **kwargs) -> int:
        hyperparameters = self._get_model_params()
-        return self.estimate_memory_usage_static(X=X, problem_type=self.problem_type, num_classes=self.num_classes, hyperparameters=hyperparameters, **kwargs)
+        return self.estimate_memory_usage_static(
+            X=X,
+            problem_type=self.problem_type,
+            num_classes=self.num_classes,
+            hyperparameters=hyperparameters,
+            **kwargs,
+        )

    @classmethod
    def _estimate_memory_usage_static(
@@ -79,7 +79,9 @@ class FastTextModel(AbstractModel):
        params["verbose"] = 2

        if sample_weight is not None:
-            logger.log(15, "sample_weight not yet supported for FastTextModel, this model will ignore them in training.")
+            logger.log(
+                15, "sample_weight not yet supported for FastTextModel, this model will ignore them in training."
+            )
        X = self.preprocess(X)


@@ -24,6 +24,7 @@ class ImagePredictorModel(MultiModalPredictorModel):
    Additionally has special null image handling to improve performance in the presence of null images (aka image path of '')
    Note: null handling has not been compared to the built-in null handling of MultimodalPredictor yet.
    """
+
    ag_key = "AG_IMAGE_NN"
    ag_name = "ImagePredictor"

@@ -61,14 +62,18 @@ class ImagePredictorModel(MultiModalPredictorModel):
        X, y, X_val, y_val = super().preprocess_fit(X=X, y=y, X_val=X_val, y_val=y_val, **kwargs)
        X_features = list(X.columns)
        if len(X_features) != 1:
-            raise AssertionError(f"ImagePredictorModel only supports one image feature, but {len(X_features)} were given: {X_features}")
+            raise AssertionError(
+                f"ImagePredictorModel only supports one image feature, but {len(X_features)} were given: {X_features}"
+            )
        self._image_col_name = X_features[0]
        null_indices = X[self._image_col_name] == ""

        # TODO: Consider some kind of weighting of the two options so there isn't a harsh cutoff at 50
        # FIXME: What if all rows in a class are null? Will probably crash.
        if null_indices.sum() > 50:
-            self._dummy_pred_proba = self._compute_dummy_pred_proba(y[null_indices])  # FIXME: Do this one for better results
+            self._dummy_pred_proba = self._compute_dummy_pred_proba(
+                y[null_indices]
+            )  # FIXME: Do this one for better results
        else:
            # Not enough null to get a confident estimate of null label average, instead use all data average
            self._dummy_pred_proba = self._compute_dummy_pred_proba(y)
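Note: the logic here falls back to a "dummy" class-frequency prediction for rows whose image path is empty, estimated from the null rows themselves when there are more than 50 of them, otherwise from all rows. A minimal sketch of that frequency computation (illustrative helper, not AutoGluon's API):

    import pandas as pd

    def compute_dummy_pred_proba(y, num_classes):
        # Normalized class frequencies aligned to class indices 0..num_classes-1.
        counts = y.value_counts().reindex(range(num_classes), fill_value=0)
        return (counts / counts.sum()).to_numpy()

    y = pd.Series([0, 1, 1, 2, 1])
    print(compute_dummy_pred_proba(y, num_classes=3))  # [0.2 0.6 0.2]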
@@ -24,6 +24,7 @@ class KNNModel(AbstractModel):
    """
    KNearestNeighbors model (scikit-learn): https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
    """
+
    ag_key = "KNN"
    ag_name = "KNeighbors"
    ag_priority = 100
@@ -106,11 +107,19 @@ class KNNModel(AbstractModel):
        if time_limit is None or num_rows_max <= 10000:
            self.model = self._get_model_type()(**params).fit(X, y)
        else:
-            self.model = self._fit_with_samples(X=X, y=y, model_params=params, time_limit=time_limit - (time.time() - time_start))
+            self.model = self._fit_with_samples(
+                X=X, y=y, model_params=params, time_limit=time_limit - (time.time() - time_start)
+            )

    def _estimate_memory_usage(self, X: pd.DataFrame, **kwargs) -> int:
        hyperparameters = self._get_model_params()
-        return self.estimate_memory_usage_static(X=X, problem_type=self.problem_type, num_classes=self.num_classes, hyperparameters=hyperparameters, **kwargs)
+        return self.estimate_memory_usage_static(
+            X=X,
+            problem_type=self.problem_type,
+            num_classes=self.num_classes,
+            hyperparameters=hyperparameters,
+            **kwargs,
+        )

    @classmethod
    def _estimate_memory_usage_static(
@@ -120,12 +129,23 @@ class KNNModel(AbstractModel):
        **kwargs,
    ) -> int:
        model_size_bytes = 4 * X.shape[0] * X.shape[1]  # Assuming float32 types
-        expected_final_model_size_bytes = int(model_size_bytes * 3.6)  # Roughly what can be expected of the final KNN model in memory size
+        expected_final_model_size_bytes = int(
+            model_size_bytes * 3.6
+        )  # Roughly what can be expected of the final KNN model in memory size
        return expected_final_model_size_bytes

-    def _validate_fit_memory_usage(self, mem_error_threshold: float = 0.2, mem_warning_threshold: float = 0.15, mem_size_threshold: int = 1e7, **kwargs):
+    def _validate_fit_memory_usage(
+        self,
+        mem_error_threshold: float = 0.2,
+        mem_warning_threshold: float = 0.15,
+        mem_size_threshold: int = 1e7,
+        **kwargs,
+    ):
        return super()._validate_fit_memory_usage(
-            mem_error_threshold=mem_error_threshold, mem_warning_threshold=mem_warning_threshold, mem_size_threshold=mem_size_threshold, **kwargs
+            mem_error_threshold=mem_error_threshold,
+            mem_warning_threshold=mem_warning_threshold,
+            mem_size_threshold=mem_size_threshold,
+            **kwargs,
        )

    # TODO: Won't work for RAPIDS without modification
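Note: the memory estimate in this hunk is plain arithmetic: float32 storage of the training matrix (4 bytes per value) scaled by an empirical ~3.6x factor for the fitted KNN model. As a standalone sketch:

    def estimate_knn_memory_bytes(n_rows, n_cols, overhead=3.6):
        data_bytes = 4 * n_rows * n_cols  # assuming float32 values
        return int(data_bytes * overhead)

    print(estimate_knn_memory_bytes(1_000_000, 50))  # 720000000 bytes, ~0.72 GB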
@@ -167,7 +187,17 @@ class KNNModel(AbstractModel):
        return y_oof_pred_proba

    # TODO: Consider making this fully generic and available to all models
-    def _fit_with_samples(self, X, y, model_params, time_limit, start_samples=10000, max_samples=None, sample_growth_factor=2, sample_time_growth_factor=8):
+    def _fit_with_samples(
+        self,
+        X,
+        y,
+        model_params,
+        time_limit,
+        start_samples=10000,
+        max_samples=None,
+        sample_growth_factor=2,
+        sample_time_growth_factor=8,
+    ):
        """
        Fit model with samples of the data repeatedly, gradually increasing the amount of data until time_limit is reached or all data is used.

@@ -243,11 +273,14 @@ class KNNModel(AbstractModel):
            time_limit_left = time_limit - (time_fit_end_sample - time_start)
            time_fit_sample = time_limit_left_prior - time_limit_left
            time_required_for_next = time_fit_sample * sample_time_growth_factor
-            logger.log(15, f"\t{round(time_fit_sample, 2)}s \t= Train Time (Using {samples}/{num_rows_max} rows) ({round(time_limit_left, 2)}s remaining time)")
+            logger.log(
+                15,
+                f"\t{round(time_fit_sample, 2)}s \t= Train Time (Using {samples}/{num_rows_max} rows) ({round(time_limit_left, 2)}s remaining time)",
+            )
            if time_required_for_next > time_limit_left and i != len(num_rows_samples) - 1:
                logger.log(
                    20,
-                    f"\tNot enough time to train KNN model on all training rows. Fit {samples}/{num_rows_max} rows. (Training KNN model on {num_rows_samples[i+1]} rows is expected to take {round(time_required_for_next, 2)}s)",
+                    f"\tNot enough time to train KNN model on all training rows. Fit {samples}/{num_rows_max} rows. (Training KNN model on {num_rows_samples[i + 1]} rows is expected to take {round(time_required_for_next, 2)}s)",
                )
                break
            if idx is not None:
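Note: _fit_with_samples (reformatted above) fits on progressively larger subsamples and stops once the projected cost of the next, larger fit would exceed the remaining time. A simplified sketch of that loop, with fit_fn standing in for the actual KNN fit:

    import time

    def fit_with_samples(X, y, fit_fn, time_limit, start_samples=10000,
                         sample_growth_factor=2, sample_time_growth_factor=8):
        num_rows_max = len(X)
        # Sample schedule: start_samples, 2x, 4x, ... capped at the full dataset.
        schedule, n = [], start_samples
        while n < num_rows_max:
            schedule.append(n)
            n *= sample_growth_factor
        schedule.append(num_rows_max)

        model, time_start = None, time.time()
        for i, samples in enumerate(schedule):
            t0 = time.time()
            model = fit_fn(X[:samples], y[:samples])
            time_fit_sample = time.time() - t0
            time_left = time_limit - (time.time() - time_start)
            # Assume the next fit costs ~sample_time_growth_factor times the last one.
            if i != len(schedule) - 1 and time_fit_sample * sample_time_growth_factor > time_left:
                break
        return model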
@@ -74,12 +74,15 @@ def early_stopping_custom(

    def _init(env):
        if not ignore_dart_warning:
-            enabled[0] = not any((boost_alias in env.params and env.params[boost_alias] == "dart") for boost_alias in ("boosting", "boosting_type", "boost"))
+            enabled[0] = not any(
+                (boost_alias in env.params and env.params[boost_alias] == "dart")
+                for boost_alias in ("boosting", "boosting_type", "boost")
+            )
            if not enabled[0]:
                warnings.warn("Early stopping is not available in dart mode")
                return
        if not env.evaluation_result_list:
-            raise ValueError("For early stopping, " "at least one dataset and eval metric is required for evaluation")
+            raise ValueError("For early stopping, at least one dataset and eval metric is required for evaluation")

        if verbose:
            msg = "Training until validation scores don't improve for {} rounds."
@@ -179,7 +182,9 @@ def early_stopping_custom(
        if not enabled[0]:
            return
        if train_loss_name is not None:
-            train_loss_evals = [eval for eval in env.evaluation_result_list if eval[0] == "train_set" and eval[1] == train_loss_name]
+            train_loss_evals = [
+                eval for eval in env.evaluation_result_list if eval[0] == "train_set" and eval[1] == train_loss_name
+            ]
            train_loss_val = train_loss_evals[0][2]
        else:
            train_loss_val = 0.0
@@ -194,7 +199,9 @@
                best_score_list[i] = env.evaluation_result_list
                best_trainloss[i] = train_loss_val
            if reporter is not None:  # Report current best scores for iteration, used in HPO
-                if i == indices_to_check[0]:  # TODO: documentation needs to note that we assume 0th index is the 'official' validation performance metric.
+                if (
+                    i == indices_to_check[0]
+                ):  # TODO: documentation needs to note that we assume 0th index is the 'official' validation performance metric.
                    if cmp_op[i] == gt:
                        validation_perf = score
                    else:
@@ -214,7 +221,10 @@
                logger.log(
                    15,
                    "Early stopping, best iteration is:\n[%d]\t%s"
-                    % (best_iter[i] + 1, "\t".join([_format_eval_result(x, show_stdv=False) for x in best_score_list[i]])),
+                    % (
+                        best_iter[i] + 1,
+                        "\t".join([_format_eval_result(x, show_stdv=False) for x in best_score_list[i]]),
+                    ),
                )
                raise EarlyStopException(best_iter[i], best_score_list[i])
            elif (max_diff is not None) and (abs(score - best_score[i]) > max_diff):
@@ -224,7 +234,10 @@
                logger.log(
                    15,
                    "Early stopping, best iteration is:\n[%d]\t%s"
-                    % (best_iter[i] + 1, "\t".join([_format_eval_result(x, show_stdv=False) for x in best_score_list[i]])),
+                    % (
+                        best_iter[i] + 1,
+                        "\t".join([_format_eval_result(x, show_stdv=False) for x in best_score_list[i]]),
+                    ),
                )
                raise EarlyStopException(best_iter[i], best_score_list[i])
            if env.iteration == env.end_iteration - 1:
@@ -232,7 +245,10 @@
                    logger.log(
                        15,
                        "Did not meet early stopping criterion. Best iteration is:\n[%d]\t%s"
-                        % (best_iter[i] + 1, "\t".join([_format_eval_result(x, show_stdv=False) for x in best_score_list[i]])),
+                        % (
+                            best_iter[i] + 1,
+                            "\t".join([_format_eval_result(x, show_stdv=False) for x in best_score_list[i]]),
+                        ),
                    )
                    raise EarlyStopException(best_iter[i], best_score_list[i])
            if verbose:
@@ -243,7 +259,10 @@
                logger.log(
                    20,
                    "Found manual stop file, early stopping. Best iteration is:\n[%d]\t%s"
-                    % (best_iter[i] + 1, "\t".join([_format_eval_result(x, show_stdv=False) for x in best_score_list[i]])),
+                    % (
+                        best_iter[i] + 1,
+                        "\t".join([_format_eval_result(x, show_stdv=False) for x in best_score_list[i]]),
+                    ),
                )
                raise EarlyStopException(best_iter[i], best_score_list[i])
        if time_limit:
@@ -255,7 +274,11 @@
                    20,
                    "\tRan out of time, early stopping on iteration "
                    + str(env.iteration + 1)
-                    + ". Best iteration is:\n\t[%d]\t%s" % (best_iter[i] + 1, "\t".join([_format_eval_result(x, show_stdv=False) for x in best_score_list[i]])),
+                    + ". Best iteration is:\n\t[%d]\t%s"
+                    % (
+                        best_iter[i] + 1,
+                        "\t".join([_format_eval_result(x, show_stdv=False) for x in best_score_list[i]]),
+                    ),
                )
                raise EarlyStopException(best_iter[i], best_score_list[i])

@@ -19,7 +19,9 @@ def get_searchspace_multiclass_baseline():
    params = {
        "learning_rate": space.Real(lower=5e-3, upper=0.2, default=0.05, log=True),
        "feature_fraction": space.Real(lower=0.75, upper=1.0, default=1.0),
-        "min_data_in_leaf": space.Int(lower=2, upper=60, default=20),  # TODO: Use size of dataset to set upper, if row count is small upper should be small
+        "min_data_in_leaf": space.Int(
+            lower=2, upper=60, default=20
+        ),  # TODO: Use size of dataset to set upper, if row count is small upper should be small
        "num_leaves": space.Int(
            lower=16, upper=96, default=31
        ),  # TODO: Use row count and feature count to set this, the higher feature count the higher num_leaves upper
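Note: search spaces like these are consumed by passing the Space objects as hyperparameter values and enabling tuning at fit time. A hedged usage sketch (the synthetic data and the "GBM" entry are illustrative; exact tuning behavior depends on the installed AutoGluon version):

    import numpy as np
    import pandas as pd
    from autogluon.common import space
    from autogluon.tabular import TabularPredictor

    # Tiny synthetic dataset purely for illustration.
    rng = np.random.default_rng(0)
    train_data = pd.DataFrame({"x1": rng.normal(size=200), "x2": rng.normal(size=200)})
    train_data["target"] = (train_data["x1"] + train_data["x2"] > 0).astype(int)

    hyperparameters = {
        "GBM": {
            "learning_rate": space.Real(5e-3, 0.2, default=0.05, log=True),
            "min_data_in_leaf": space.Int(2, 60, default=20),
        }
    }

    predictor = TabularPredictor(label="target").fit(
        train_data,
        hyperparameters=hyperparameters,
        hyperparameter_tune_kwargs="auto",  # run HPO over the declared search spaces
    )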