upgini 1.2.79a1__py3-none-any.whl → 1.2.81__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of upgini might be problematic.

upgini/metrics.py CHANGED
@@ -6,20 +6,21 @@ import re
 from collections import defaultdict
 from copy import deepcopy
 from dataclasses import dataclass
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union
 
 import lightgbm as lgb
 import numpy as np
 import pandas as pd
+from catboost import CatBoostClassifier, CatBoostRegressor
+from category_encoders.cat_boost import CatBoostEncoder
 from lightgbm import LGBMClassifier, LGBMRegressor
 from numpy import log1p
-from pandas.api.types import is_numeric_dtype
+from pandas.api.types import is_numeric_dtype, is_integer_dtype, is_float_dtype
 from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
-from sklearn.preprocessing import OrdinalEncoder
 
+from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
 from upgini.utils.features_validator import FeaturesValidator
 from upgini.utils.sklearn_ext import cross_validate
-from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
 
 try:
     from sklearn.metrics import get_scorer_names
@@ -31,12 +32,15 @@ except ImportError:
     available_scorers = SCORERS
 from sklearn.metrics import mean_squared_error
 from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
-from sklearn.model_selection import BaseCrossValidator, TimeSeriesSplit
+from sklearn.model_selection import (  # , TimeSeriesSplit
+    BaseCrossValidator,
+    TimeSeriesSplit,
+)
 
 from upgini.errors import ValidationError
 from upgini.metadata import ModelTaskType
 from upgini.resource_bundle import bundle
-from upgini.utils.target_utils import correct_string_target
+from upgini.utils.target_utils import prepare_target
 
 DEFAULT_RANDOM_STATE = 42
 
@@ -87,20 +91,9 @@ CATBOOST_MULTICLASS_PARAMS = {
 
 LIGHTGBM_REGRESSION_PARAMS = {
     "random_state": DEFAULT_RANDOM_STATE,
-    "min_gain_to_split": 0.001,
     "n_estimators": 275,
-    "max_depth": 5,
-    "max_cat_threshold": 80,
-    "min_data_per_group": 25,
-    "cat_l2": 10,
-    "cat_smooth": 12,
-    "learning_rate": 0.05,
     "feature_fraction": 1.0,
-    "min_sum_hessian_in_leaf": 0.01,
-    "objective": "huber",
     "deterministic": "true",
-    "force_col_wise": "true",
-    "force_row_wise": "true",
     "verbosity": -1,
 }
 
@@ -115,13 +108,10 @@ LIGHTGBM_MULTICLASS_PARAMS = {
     "cat_smooth": 18,
     "cat_l2": 8,
     "objective": "multiclass",
-    # "class_weight": "balanced",
     "use_quantized_grad": "true",
     "num_grad_quant_bins": "8",
     "stochastic_rounding": "true",
     "deterministic": "true",
-    "force_col_wise": "true",
-    "force_row_wise": "true",
     "verbosity": -1,
 }
 
@@ -132,14 +122,11 @@ LIGHTGBM_BINARY_PARAMS = {
     "max_depth": 5,
     "learning_rate": 0.05,
     "objective": "binary",
-    # "class_weight": "balanced",
     "max_cat_threshold": 80,
     "min_data_per_group": 20,
     "cat_smooth": 18,
     "cat_l2": 8,
     "deterministic": "true",
-    "force_col_wise": "true",
-    "force_row_wise": "true",
    "verbosity": -1,
 }
 
@@ -148,34 +135,6 @@ LIGHTGBM_EARLY_STOPPING_ROUNDS = 20
 N_FOLDS = 5
 BLOCKED_TS_TEST_SIZE = 0.2
 
-NA_VALUES = [
-    "",
-    " ",
-    "  ",
-    "#n/a",
-    "#n/a n/a",
-    "#na",
-    "-1.#ind",
-    "-1.#qnan",
-    "-nan",
-    "1.#ind",
-    "1.#qnan",
-    "n/a",
-    "na",
-    "null",
-    "nan",
-    "n/a",
-    "nan",
-    "none",
-    "-",
-    "undefined",
-    "[[unknown]]",
-    "[not provided]",
-    "[unknown]",
-]
-
-NA_REPLACEMENT = "NA"
-
 SUPPORTED_CATBOOST_METRICS = {
     s.upper(): s
     for s in (
@@ -285,11 +244,55 @@ class _CrossValResults:
         return f"{self.metric:.3f} ± {self.metric_std:.3f}"
 
 
+def is_numeric_object(x: pd.Series) -> bool:
+    try:
+        pd.to_numeric(x, errors="raise")
+        return True
+    except (ValueError, TypeError):
+        return False
+
+
+def is_valid_numeric_array_data(data: pd.Series) -> bool:
+    data_without_na = data.dropna()
+    if data_without_na.empty:
+        return False
+
+    first_element = data_without_na.iloc[0]
+
+    # numpy.ndarray with numeric types
+    if isinstance(first_element, np.ndarray):
+        return np.issubdtype(first_element.dtype, np.number)
+
+    # DataFrame with all numeric columns
+    elif isinstance(first_element, pd.DataFrame):
+        return all(np.issubdtype(dtype, np.number) for dtype in first_element.dtypes)
+
+    # list or list of lists with numeric types
+    elif isinstance(first_element, list):
+        try:
+            # flat list
+            if all(isinstance(x, (int, float, np.number)) or pd.isna(x) for x in first_element):
+                return True
+            # list of lists
+            elif all(
+                isinstance(x, list) and all(isinstance(y, (int, float, np.number)) or pd.isna(y) for y in x)
+                for x in first_element
+            ):
+                return True
+        except Exception:
+            return False
+
+    return False
+
+
 class EstimatorWrapper:
+    default_estimator: Literal["catboost", "lightgbm"] = "catboost"
+
     def __init__(
         self,
         estimator,
         scorer: Callable,
+        cat_features: Optional[List[str]],
         metric_name: str,
         multiplier: int,
         cv: BaseCrossValidator,
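Illustration (not part of the diff): a minimal sketch of what the two new module-level helpers accept, assuming the definitions added above.

import numpy as np
import pandas as pd

s_numeric_strings = pd.Series(["1", "2.5", None])
s_embeddings = pd.Series([np.array([0.1, 0.2]), np.array([0.3, 0.4])])
s_plain_strings = pd.Series(["a", "1"])

print(is_numeric_object(s_numeric_strings))        # True  -> pd.to_numeric succeeds on every value
print(is_valid_numeric_array_data(s_embeddings))   # True  -> each row is a numeric ndarray
print(is_valid_numeric_array_data(s_plain_strings))  # False -> scalar strings, not array-like data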
@@ -301,9 +304,8 @@ class EstimatorWrapper:
     ):
         self.estimator = estimator
         self.scorer = scorer
-        self.metric_name = (
-            "GINI" if metric_name.upper() == "ROC_AUC" and target_type == ModelTaskType.BINARY else metric_name
-        )
+        self.cat_features = cat_features
+        self.metric_name = metric_name
         self.multiplier = multiplier
         self.cv = cv
         self.target_type = target_type
@@ -312,6 +314,10 @@ class EstimatorWrapper:
         self.groups = groups
         self.text_features = text_features
         self.logger = logger or logging.getLogger()
+        self.droped_features = []
+        self.converted_to_int = []
+        self.converted_to_str = []
+        self.converted_to_numeric = []
 
     def fit(self, x: pd.DataFrame, y: np.ndarray, **kwargs):
         x, y, _, fit_params = self._prepare_to_fit(x, y)
@@ -319,22 +325,13 @@ class EstimatorWrapper:
         self.estimator.fit(x, y, **kwargs)
         return self
 
-    def predict(self, **kwargs):
-        return self.estimator.predict(**kwargs)
-
-    def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
-        x, y, groups = self._prepare_data(x, y, groups=self.groups)
-        return x, y, groups, {}
+    def predict(self, x: pd.DataFrame, **kwargs):
+        x, _, _ = self._prepare_to_calculate(x, None)
+        return self.estimator.predict(x, **kwargs)
 
     def _prepare_data(
         self, x: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
     ) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
-        self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
-        for c in x.columns:
-            if is_numeric_dtype(x[c]):
-                x[c] = x[c].astype(float)
-            elif not x[c].dtype == "category":
-                x[c] = x[c].astype(str)
 
         if not isinstance(y, pd.Series):
             raise Exception(bundle.get("metrics_unsupported_target_type").format(type(y)))
@@ -348,6 +345,8 @@ class EstimatorWrapper:
         else:
             x, y = self._remove_empty_target_rows(x, y)
 
+        y = prepare_target(y, self.target_type)
+
         self.logger.info(f"After preparing data columns: {x.columns.to_list()}")
         return x, y, groups
 
@@ -360,8 +359,84 @@ class EstimatorWrapper:
 
         return x, y
 
+    def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
+        x, y, groups = self._prepare_data(x, y, groups=self.groups)
+
+        self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
+        self.droped_features = []
+        self.converted_to_int = []
+        self.converted_to_str = []
+        self.converted_to_numeric = []
+        for c in x.columns:
+
+            if _get_unique_count(x[c]) < 2:
+                self.logger.warning(f"Remove feature {c} because it has less than 2 unique values")
+                if c in self.cat_features:
+                    self.cat_features.remove(c)
+                x.drop(columns=[c], inplace=True)
+                self.droped_features.append(c)
+            elif self.text_features is not None and c in self.text_features:
+                x[c] = x[c].astype(str)
+                self.converted_to_str.append(c)
+            elif c in self.cat_features:
+                if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
+                    x[c] = x[c].astype(np.int64)
+                    self.converted_to_int.append(c)
+                elif x[c].dtype == "category" and is_integer_dtype(x[c].cat.categories):
+                    self.logger.info(
+                        f"Convert categorical feature {c} with integer categories"
+                        " to int64 and remove from cat_features"
+                    )
+                    x[c] = x[c].astype(np.int64)
+                    self.converted_to_int.append(c)
+                    self.cat_features.remove(c)
+                elif is_float_dtype(x[c]) or (x[c].dtype == "category" and is_float_dtype(x[c].cat.categories)):
+                    self.logger.info(
+                        f"Convert float cat feature {c} to string"
+                    )
+                    x[c] = x[c].astype(str)
+                    self.converted_to_str.append(c)
+                elif x[c].dtype not in ["category", "int64"]:
+                    x[c] = x[c].astype(str)
+                    self.converted_to_str.append(c)
+            else:
+                if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
+                    self.logger.info(f"Convert bool feature {c} to int64")
+                    x[c] = x[c].astype(np.int64)
+                    self.converted_to_int.append(c)
+                elif not is_valid_numeric_array_data(x[c]) and not is_numeric_dtype(x[c]):
+                    try:
+                        x[c] = pd.to_numeric(x[c], errors="raise")
+                        self.converted_to_numeric.append(c)
+                    except (ValueError, TypeError):
+                        self.logger.warning(f"Remove feature {c} because it is not numeric and not in cat_features")
+                        x.drop(columns=[c], inplace=True)
+                        self.droped_features.append(c)
+
+        return x, y, groups, {}
+
     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
         x, y, _ = self._prepare_data(x, y)
+
+        if self.droped_features:
+            self.logger.info(f"Drop features on calculate metrics: {self.droped_features}")
+            x = x.drop(columns=self.droped_features)
+
+        if self.converted_to_int:
+            self.logger.info(f"Convert to int features on calculate metrics: {self.converted_to_int}")
+            for c in self.converted_to_int:
+                x[c] = x[c].astype(np.int64)
+
+        if self.converted_to_str:
+            self.logger.info(f"Convert to str features on calculate metrics: {self.converted_to_str}")
+            for c in self.converted_to_str:
+                x[c] = x[c].astype(str)
+
+        if self.converted_to_numeric:
+            self.logger.info(f"Convert to numeric features on calculate metrics: {self.converted_to_numeric}")
+            for c in self.converted_to_numeric:
+                x[c] = pd.to_numeric(x[c], errors="coerce")
+
         return x, y, {}
 
     def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
@@ -381,7 +456,10 @@ class EstimatorWrapper:
         if baseline_score_column is not None and self.metric_name == "GINI":
             self.logger.info("Calculate baseline GINI on passed baseline_score_column and target")
             metric = roc_auc_score(y, x[baseline_score_column])
+            metric_std = None
+            average_shap_values = None
         else:
+            self.logger.info(f"Cross validate with estimeator: {self.estimator}")
             cv_results = cross_validate(
                 estimator=self.estimator,
                 x=x,
@@ -412,7 +490,6 @@ class EstimatorWrapper:
             shaps = self.calculate_shap(cv_x, cv_y, estimator)
             if shaps is not None:
                 for feature, shap_value in shaps.items():
-                    # shap_values_all_folds[feature] = shap_values_all_folds.get(feature, []) + shap_value.tolist()
                     shap_values_all_folds[feature].append(shap_value)
 
         if shap_values_all_folds:
@@ -468,7 +545,7 @@ class EstimatorWrapper:
         logger: logging.Logger,
         target_type: ModelTaskType,
         cv: BaseCrossValidator,
-        x: pd.DataFrame,
+        *,
         scoring: Union[Callable, str, None] = None,
         cat_features: Optional[List[str]] = None,
         text_features: Optional[List[str]] = None,
@@ -476,9 +553,10 @@ class EstimatorWrapper:
         groups: Optional[List[str]] = None,
         has_date: Optional[bool] = None,
     ) -> EstimatorWrapper:
-        scorer, metric_name, multiplier = _get_scorer(target_type, scoring)
+        scorer, metric_name, multiplier = define_scorer(target_type, scoring)
        kwargs = {
             "scorer": scorer,
+            "cat_features": cat_features,
             "metric_name": metric_name,
             "multiplier": multiplier,
             "cv": cv,
@@ -488,22 +566,43 @@ class EstimatorWrapper:
             "logger": logger,
         }
         if estimator is None:
-            params = {"random_state": DEFAULT_RANDOM_STATE, "verbose": -1}
-            if target_type == ModelTaskType.MULTICLASS:
-                params = _get_add_params(params, LIGHTGBM_MULTICLASS_PARAMS)
-                params = _get_add_params(params, add_params)
-                estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
-            elif target_type == ModelTaskType.BINARY:
-                params = _get_add_params(params, LIGHTGBM_BINARY_PARAMS)
-                params = _get_add_params(params, add_params)
-                estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
-            elif target_type == ModelTaskType.REGRESSION:
-                if not isinstance(cv, TimeSeriesSplit) and not isinstance(cv, BlockedTimeSeriesSplit):
-                    params = _get_add_params(params, LIGHTGBM_REGRESSION_PARAMS)
-                params = _get_add_params(params, add_params)
-                estimator = LightGBMWrapper(LGBMRegressor(**params), **kwargs)
+            if EstimatorWrapper.default_estimator == "catboost":
+                logger.info("Using CatBoost as default estimator")
+                params = {"has_time": has_date}
+                if target_type == ModelTaskType.MULTICLASS:
+                    params = _get_add_params(params, CATBOOST_MULTICLASS_PARAMS)
+                    params = _get_add_params(params, add_params)
+                    estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
+                elif target_type == ModelTaskType.BINARY:
+                    params = _get_add_params(params, CATBOOST_BINARY_PARAMS)
+                    params = _get_add_params(params, add_params)
+                    estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
+                elif target_type == ModelTaskType.REGRESSION:
+                    params = _get_add_params(params, CATBOOST_REGRESSION_PARAMS)
+                    params = _get_add_params(params, add_params)
+                    estimator = CatBoostWrapper(CatBoostRegressor(**params), **kwargs)
+                else:
+                    raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
+            elif EstimatorWrapper.default_estimator == "lightgbm":
+                logger.info("Using LightGBM as default estimator")
+                params = {"random_state": DEFAULT_RANDOM_STATE, "verbose": -1}
+                if target_type == ModelTaskType.MULTICLASS:
+                    params = _get_add_params(params, LIGHTGBM_MULTICLASS_PARAMS)
+                    params = _get_add_params(params, add_params)
+                    estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
+                elif target_type == ModelTaskType.BINARY:
+                    params = _get_add_params(params, LIGHTGBM_BINARY_PARAMS)
+                    params = _get_add_params(params, add_params)
+                    estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
+                elif target_type == ModelTaskType.REGRESSION:
+                    if not isinstance(cv, TimeSeriesSplit) and not isinstance(cv, BlockedTimeSeriesSplit):
+                        params = _get_add_params(params, LIGHTGBM_REGRESSION_PARAMS)
+                    params = _get_add_params(params, add_params)
+                    estimator = LightGBMWrapper(LGBMRegressor(**params), **kwargs)
+                else:
+                    raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
             else:
-                raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
+                raise Exception("Unsupported default_estimator. Available: catboost, lightgbm")
         else:
             if hasattr(estimator, "copy"):
                 estimator_copy = estimator.copy()
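Illustration (not part of the diff): metrics now default to CatBoost via the new class-level switch on EstimatorWrapper; a minimal sketch of opting back into LightGBM globally, assuming the public module path upgini.metrics.

from upgini.metrics import EstimatorWrapper

# "catboost" is the new default; switching the default estimator used for metric calculation is one line
EstimatorWrapper.default_estimator = "lightgbm"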
@@ -511,19 +610,12 @@ class EstimatorWrapper:
                 estimator_copy = deepcopy(estimator)
             kwargs["estimator"] = estimator_copy
             if is_catboost_estimator(estimator):
-                if cat_features is not None:
-                    for cat_feature in cat_features:
-                        if cat_feature not in x.columns:
-                            logger.error(
-                                f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
-                            )
-                    estimator_copy.set_params(cat_features=cat_features, has_time=has_date)
+                if has_date is not None:
+                    estimator_copy.set_params(has_time=has_date)
                 estimator = CatBoostWrapper(**kwargs)
             else:
                 if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
                     estimator = LightGBMWrapper(**kwargs)
-                elif is_catboost_estimator(estimator):
-                    estimator = CatBoostWrapper(**kwargs)
                 else:
                     logger.warning(
                         f"Unexpected estimator is used for metrics: {estimator}. "
@@ -539,6 +631,7 @@ class CatBoostWrapper(EstimatorWrapper):
         self,
         estimator,
         scorer: Callable,
+        cat_features: Optional[List[str]],
         metric_name: str,
         multiplier: int,
         cv: BaseCrossValidator,
@@ -550,6 +643,7 @@ class CatBoostWrapper(EstimatorWrapper):
         super(CatBoostWrapper, self).__init__(
             estimator,
             scorer,
+            cat_features,
             metric_name,
             multiplier,
             cv,
@@ -558,10 +652,8 @@ class CatBoostWrapper(EstimatorWrapper):
             text_features=text_features,
             logger=logger,
         )
-        self.cat_features = None
         self.emb_features = None
         self.grouped_embedding_features = None
-        self.exclude_features = []
 
     def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
         x, y, groups, params = super()._prepare_to_fit(x, y)
@@ -570,76 +662,60 @@ class CatBoostWrapper(EstimatorWrapper):
         import catboost
         from catboost import CatBoostClassifier
 
-        if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
+        if not hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
+            self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")
+        else:
             emb_pattern = r"(.+)_emb\d+"
             self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
-            if len(self.emb_features) > 3:  # There is no reason to reduce embeddings dimension with less than 4
-                self.logger.info(
-                    "Embedding features count more than 3, so group them into one vector for CatBoost: "
-                    f"{self.emb_features}"
-                )
-                x, self.grouped_embedding_features = self.group_embeddings(x)
+            x, self.grouped_embedding_features = self.group_embeddings(x)
+            if len(self.grouped_embedding_features) > 0:
                 params["embedding_features"] = self.grouped_embedding_features
-            else:
-                self.logger.info(f"Embedding features count less than 3, so use them separately: {self.emb_features}")
-                self.grouped_embedding_features = None
-        else:
-            self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")
 
         # Find text features from passed in generate_features
-        if hasattr(CatBoostClassifier, "get_text_feature_indices"):
+        if not hasattr(CatBoostClassifier, "get_text_feature_indices"):
+            self.text_features = None
+            self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
+        else:
             if self.text_features is not None:
                 self.logger.info(f"Passed text features for CatBoost: {self.text_features}")
                 self.text_features = [f for f in self.text_features if f in x.columns and not is_numeric_dtype(x[f])]
                 self.logger.info(f"Rest text features after checks: {self.text_features}")
                 params["text_features"] = self.text_features
-        else:
-            self.text_features = None
-            self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
 
         # Find rest categorical features
-        self.cat_features = _get_cat_features(x, self.text_features, self.grouped_embedding_features)
-        # x = fill_na_cat_features(x, self.cat_features)
-        unique_cat_features = []
-        for name in self.cat_features:
-            # Remove constant categorical features
-            if x[name].nunique() > 1:
-                unique_cat_features.append(name)
-            else:
-                self.logger.info(f"Drop column {name} on preparing data for fit")
-                x = x.drop(columns=name)
-                self.exclude_features.append(name)
-        self.cat_features = unique_cat_features
-        if (
-            hasattr(self.estimator, "get_param")
-            and hasattr(self.estimator, "_init_params")
-            and self.estimator.get_param("cat_features") is not None
-        ):
-            estimator_cat_features = self.estimator.get_param("cat_features")
-            if all([isinstance(c, int) for c in estimator_cat_features]):
-                cat_features_idx = {x.columns.get_loc(c) for c in self.cat_features}
-                cat_features_idx.update(estimator_cat_features)
-                self.cat_features = [x.columns[idx] for idx in cat_features_idx]
-            elif all([isinstance(c, str) for c in estimator_cat_features]):
-                self.cat_features = list(set(self.cat_features + estimator_cat_features))
-            else:
-                print(f"WARNING: Unsupported type of cat_features in CatBoost estimator: {estimator_cat_features}")
-
-            del self.estimator._init_params["cat_features"]
-
-        self.logger.info(f"Selected categorical features: {self.cat_features}")
-        params["cat_features"] = self.cat_features
+        self.cat_features = [
+            f
+            for f in self.cat_features
+            if f not in (self.text_features or []) and f not in (self.grouped_embedding_features or [])
+        ]
+        if self.cat_features:
+            for c in self.cat_features:
+                if is_numeric_dtype(x[c]):
+                    x[c] = x[c].fillna(np.nan)
+                elif x[c].dtype != "category":
+                    x[c] = x[c].fillna("NA")
+            params["cat_features"] = self.cat_features
 
         return x, y, groups, params
 
     def group_embeddings(self, df: pd.DataFrame):
-        emb_name = "__grouped_embeddings"
-        df = df.copy()
-        df[self.emb_features] = df[self.emb_features].fillna(0.0)
-        df[emb_name] = pd.Series(df[self.emb_features].values.tolist())
-        df = df.drop(columns=self.emb_features)
-
-        return df, [emb_name]
+        embeddings_columns = []
+        if len(self.emb_features) > 3:
+            self.logger.info(
+                "Embedding features count more than 3, so group them into one vector for CatBoost: "
+                f"{self.emb_features}"
+            )
+            emb_name = "__grouped_embeddings"
+            df = df.copy()
+            df[self.emb_features] = df[self.emb_features].fillna(0.0)
+            embeddings_series = pd.Series(df[self.emb_features].values.tolist(), index=df.index)
+            df = pd.concat([df.drop(columns=self.emb_features), pd.DataFrame({emb_name: embeddings_series})], axis=1)
+            embeddings_columns.append(emb_name)
+        for c in df.columns:
+            if is_valid_numeric_array_data(df[c]):
+                embeddings_columns.append(c)
+
        return df, embeddings_columns
 
     def process_shap_values(self, shap_values: Dict[str, float]) -> Dict[str, float]:
         if "__grouped_embeddings" in shap_values:
@@ -649,16 +725,19 @@ class CatBoostWrapper(EstimatorWrapper):
         return shap_values
 
     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
-        if self.exclude_features:
-            x = x.drop(columns=self.exclude_features)
         x, y, params = super()._prepare_to_calculate(x, y)
         if self.text_features:
             params["text_features"] = self.text_features
         if self.grouped_embedding_features:
             x, emb_columns = self.group_embeddings(x)
             params["embedding_features"] = emb_columns
+
         if self.cat_features:
-            # x = fill_na_cat_features(x, self.cat_features)
+            for c in self.cat_features:
+                if is_numeric_dtype(x[c]):
+                    x[c] = x[c].fillna(np.nan)
+                elif x[c].dtype != "category":
+                    x[c] = x[c].fillna("NA")
             params["cat_features"] = self.cat_features
 
         return x, y, params
@@ -684,7 +763,7 @@ class CatBoostWrapper(EstimatorWrapper):
                 )
             for f in high_cardinality_features:
                 self.text_features.remove(f)
-                self.exclude_features.append(f)
+                self.droped_features.append(f)
                 x = x.drop(columns=f, errors="ignore")
             return super().cross_val_predict(x, y, baseline_score_column)
         else:
@@ -703,23 +782,29 @@ class CatBoostWrapper(EstimatorWrapper):
                 embedding_features=self.grouped_embedding_features,
             )
 
-            # Get SHAP values of current estimator
-            shap_values_fold = estimator.get_feature_importance(data=fold_pool, type="ShapValues")
+            shap_values = estimator.get_feature_importance(data=fold_pool, type="ShapValues")
 
-            # Remove last columns (base value) and flatten
             if self.target_type == ModelTaskType.MULTICLASS:
-                all_shaps = shap_values_fold[:, :, :-1]
-                all_shaps = [all_shaps[:, :, k].flatten() for k in range(all_shaps.shape[2])]
+                # For multiclass, shap_values has shape (n_samples, n_classes, n_features + 1)
+                # Last column is bias term
+                shap_values = shap_values[:, :, :-1]  # Remove bias term
+                # Average SHAP values across classes
+                shap_values = np.mean(np.abs(shap_values), axis=1)
             else:
-                all_shaps = shap_values_fold[:, :-1]
-                all_shaps = [all_shaps[:, k].flatten() for k in range(all_shaps.shape[1])]
+                # For binary/regression, shap_values has shape (n_samples, n_features + 1)
+                # Last column is bias term
+                shap_values = shap_values[:, :-1]  # Remove bias term
+                # Take absolute values
+                shap_values = np.abs(shap_values)
 
-            all_shaps = np.abs(all_shaps)
+            feature_importance = {}
+            for i, col in enumerate(x.columns):
+                feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
 
-            return dict(zip(estimator.feature_names_, all_shaps))
+            return feature_importance
 
-        except Exception:
-            self.logger.exception("Failed to recalculate new SHAP values")
+        except Exception as e:
+            self.logger.exception(f"Failed to recalculate new SHAP values: {str(e)}")
             return None
 
 
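Illustration (not part of the diff): the reworked SHAP aggregation reduces CatBoost's per-sample ShapValues matrix to one mean |SHAP| per feature. The same reduction on a made-up array:

import numpy as np

# pretend output of get_feature_importance(type="ShapValues") for 3 samples, 2 features (+ bias column)
shap_values = np.array([
    [0.5, -0.1, 1.2],
    [0.3, 0.4, 1.2],
    [-0.2, 0.0, 1.2],
])
shap_values = shap_values[:, :-1]                   # drop the bias term
importance = np.mean(np.abs(shap_values), axis=0)   # mean |SHAP| per feature
print(dict(zip(["feature_a", "feature_b"], importance)))  # {'feature_a': 0.333..., 'feature_b': 0.166...}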
@@ -728,6 +813,7 @@ class LightGBMWrapper(EstimatorWrapper):
         self,
         estimator,
         scorer: Callable,
+        cat_features: Optional[List[str]],
         metric_name: str,
         multiplier: int,
         cv: BaseCrossValidator,
@@ -739,6 +825,7 @@ class LightGBMWrapper(EstimatorWrapper):
         super(LightGBMWrapper, self).__init__(
             estimator,
             scorer,
+            cat_features,
             metric_name,
             multiplier,
             cv,
@@ -747,7 +834,6 @@ class LightGBMWrapper(EstimatorWrapper):
             text_features=text_features,
             logger=logger,
         )
-        self.cat_features = None
         self.cat_encoder = None
         self.n_classes = None
 
@@ -759,30 +845,23 @@ class LightGBMWrapper(EstimatorWrapper):
         if self.target_type == ModelTaskType.BINARY:
             params["eval_metric"] = "auc"
         params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
-        self.cat_features = _get_cat_features(x)
         if self.cat_features:
-            x = fill_na_cat_features(x, self.cat_features)
-            encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
-            encoded = pd.DataFrame(
-                encoder.fit_transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
-            )
+            encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, cols=self.cat_features, return_df=True)
+            encoded = encoder.fit_transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
             x[self.cat_features] = encoded
             self.cat_encoder = encoder
-        if not is_numeric_dtype(y_numpy):
-            y_numpy = correct_string_target(y_numpy)
-
+        for c in x.columns:
+            if x[c].dtype not in ["category", "int64", "float64", "bool"]:
+                self.logger.warning(f"Feature {c} is not numeric and will be dropped")
+                self.droped_features.append(c)
+                x = x.drop(columns=c, errors="ignore")
         return x, y_numpy, groups, params
 
     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
         x, y_numpy, params = super()._prepare_to_calculate(x, y)
-        if self.cat_features is not None:
-            x = fill_na_cat_features(x, self.cat_features)
-            if self.cat_encoder is not None:
-                x[self.cat_features] = pd.DataFrame(
-                    self.cat_encoder.transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
-                )
-        if not is_numeric_dtype(y):
-            y_numpy = correct_string_target(y_numpy)
+        if self.cat_features is not None and self.cat_encoder is not None:
+            encoded = self.cat_encoder.transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
+            x[self.cat_features] = encoded
         return x, y_numpy, params
 
     def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
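Illustration (not part of the diff): categorical columns for LightGBM are now target-encoded with category_encoders' CatBoostEncoder instead of OrdinalEncoder. A minimal standalone sketch of the fit/transform pattern used above, with made-up data:

import pandas as pd
from category_encoders.cat_boost import CatBoostEncoder

x = pd.DataFrame({"city": ["a", "b", "a", "c"]})
y = [1, 0, 1, 0]

encoder = CatBoostEncoder(random_state=42, cols=["city"], return_df=True)
encoded = encoder.fit_transform(x[["city"]].astype("object"), y).astype("category")
x[["city"]] = encoded  # target statistics instead of arbitrary ordinal codes, stored as "category" dtype

# at metric-calculation time the same fitted encoder is reused on new data
x_val = pd.DataFrame({"city": ["a", "d"]})
x_val[["city"]] = encoder.transform(x_val[["city"]].astype("object")).astype("category")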
@@ -808,20 +887,6 @@ class LightGBMWrapper(EstimatorWrapper):
             for i, col in enumerate(x.columns):
                 feature_importance[col] = np.mean(np.abs(shap_matrix[:, i]))
 
-            # # exclude last column (base value)
-            # shap_values_only = shap_values[:, :-1]
-            # mean_abs_shap = np.mean(np.abs(shap_values_only), axis=0)
-
-            # # For classification, shap_values is returned as a list for each class
-            # # Take values for the positive class
-            # if isinstance(shap_values, list):
-            #     shap_values = shap_values[1]
-
-            # # Calculate mean absolute SHAP value for each feature
-            # feature_importance = {}
-            # for i, col in enumerate(x.columns):
-            #     feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
-
             return feature_importance
 
         except Exception as e:
@@ -834,6 +899,7 @@ class OtherEstimatorWrapper(EstimatorWrapper):
         self,
         estimator,
         scorer: Callable,
+        cat_features: Optional[List[str]],
         metric_name: str,
         multiplier: int,
         cv: BaseCrossValidator,
@@ -845,6 +911,7 @@ class OtherEstimatorWrapper(EstimatorWrapper):
         super(OtherEstimatorWrapper, self).__init__(
             estimator,
             scorer,
+            cat_features,
             metric_name,
             multiplier,
             cv,
@@ -853,33 +920,33 @@ class OtherEstimatorWrapper(EstimatorWrapper):
             text_features=text_features,
             logger=logger,
         )
-        self.cat_features = None
 
     def _prepare_to_fit(self, x: pd.DataFrame, y: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
-        x, y, groups, params = super()._prepare_to_fit(x, y)
-        self.cat_features = _get_cat_features(x)
+        x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
         num_features = [col for col in x.columns if col not in self.cat_features]
         x[num_features] = x[num_features].fillna(-999)
-        x = fill_na_cat_features(x, self.cat_features)
-        # TODO use one-hot encoding if cardinality is less 50
-        for feature in self.cat_features:
-            x[feature] = x[feature].astype("category").cat.codes
-        if not is_numeric_dtype(y):
-            y = correct_string_target(y)
-        return x, y, groups, params
+        if self.cat_features:
+            encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, return_df=True)
+            encoded = encoder.fit_transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
+            x[self.cat_features] = encoded
+            self.cat_encoder = encoder
+        for c in x.columns:
+            if x[c].dtype not in ["category", "int64", "float64", "bool"]:
+                self.logger.warning(f"Feature {c} is not numeric and will be dropped")
+                self.droped_features.append(c)
+                x = x.drop(columns=c, errors="ignore")
+        return x, y_numpy, groups, params
 
     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
-        x, y, params = super()._prepare_to_calculate(x, y)
+        x, y_numpy, params = super()._prepare_to_calculate(x, y)
         if self.cat_features is not None:
             num_features = [col for col in x.columns if col not in self.cat_features]
             x[num_features] = x[num_features].fillna(-999)
-            x = fill_na_cat_features(x, self.cat_features)
-            # TODO use one-hot encoding if cardinality is less 50
-            for feature in self.cat_features:
-                x[feature] = x[feature].astype("category").cat.codes
-        if not is_numeric_dtype(y):
-            y = correct_string_target(y)
-        return x, y, params
+            if self.cat_features and self.cat_encoder is not None:
+                x[self.cat_features] = self.cat_encoder.transform(
+                    x[self.cat_features].astype("object"), y_numpy
+                ).astype("category")
+        return x, y_numpy, params
 
 
 def validate_scoring_argument(scoring: Union[Callable, str, None]):
@@ -941,7 +1008,7 @@ def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
     return scoring, metric_name, multiplier
 
 
-def _get_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None]) -> Tuple[Callable, str, int]:
+def define_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None]) -> Tuple[Callable, str, int]:
     if scoring is None:
         if target_type == ModelTaskType.BINARY:
             scoring = "roc_auc"
@@ -960,16 +1027,9 @@ def _get_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None])
     else:
         metric_name = str(scoring)
 
-    return scoring, metric_name, multiplier
-
+    metric_name = "GINI" if metric_name.upper() == "ROC_AUC" and target_type == ModelTaskType.BINARY else metric_name
 
-def _get_cat_features(
-    x: pd.DataFrame, text_features: Optional[List[str]] = None, emb_features: Optional[List[str]] = None
-) -> List[str]:
-    text_features = text_features or []
-    emb_features = emb_features or []
-    exclude_features = text_features + emb_features
-    return [c for c in x.columns if c not in exclude_features and not is_numeric_dtype(x[c])]
+    return scoring, metric_name, multiplier
 
 
 def _get_add_params(input_params, add_params):
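Illustration (not part of the diff): the renamed define_scorer now applies the binary roc_auc-to-GINI naming that was previously handled inside EstimatorWrapper.__init__. A minimal sketch, assuming upgini's ModelTaskType enum and that string scorers resolve through _get_scorer_by_name as before:

from upgini.metadata import ModelTaskType
from upgini.metrics import define_scorer

scorer, metric_name, multiplier = define_scorer(ModelTaskType.BINARY, "roc_auc")
print(metric_name)  # "GINI" — only the display name changes; the underlying scorer is still roc_auc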
@@ -1059,10 +1119,8 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutp
     return mse if squared else np.sqrt(mse)
 
 
-def fill_na_cat_features(df: pd.DataFrame, cat_features: List[str]) -> pd.DataFrame:
-    for c in cat_features:
-        if c in df.columns:
-            df[c] = df[c].astype("string").fillna(NA_REPLACEMENT).astype(str)
-            na_filter = df[c].str.lower().isin(NA_VALUES)
-            df.loc[na_filter, c] = NA_REPLACEMENT
-    return df
+def _get_unique_count(series: pd.Series) -> int:
+    try:
+        return series.nunique(dropna=False)
+    except TypeError:
+        return series.astype(str).nunique(dropna=False)
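Illustration (not part of the diff): _get_unique_count falls back to a string representation when a column holds unhashable values (e.g. lists), for which Series.nunique raises TypeError. A minimal sketch:

import pandas as pd

plain = pd.Series(["a", "a", None])
nested = pd.Series([[1, 2], [1, 2], [3]])  # lists are unhashable, so nunique() raises TypeError

print(_get_unique_count(plain))   # 2 (None is counted because dropna=False)
print(_get_unique_count(nested))  # 2, computed on the string representation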