upgini 1.2.80__py3-none-any.whl → 1.2.81__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/metrics.py CHANGED
@@ -6,20 +6,21 @@ import re
6
6
  from collections import defaultdict
7
7
  from copy import deepcopy
8
8
  from dataclasses import dataclass
9
- from typing import Any, Callable, Dict, List, Optional, Tuple, Union
9
+ from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union
10
10
 
11
11
  import lightgbm as lgb
12
12
  import numpy as np
13
13
  import pandas as pd
14
+ from catboost import CatBoostClassifier, CatBoostRegressor
15
+ from category_encoders.cat_boost import CatBoostEncoder
14
16
  from lightgbm import LGBMClassifier, LGBMRegressor
15
17
  from numpy import log1p
16
- from pandas.api.types import is_numeric_dtype
18
+ from pandas.api.types import is_numeric_dtype, is_integer_dtype, is_float_dtype
17
19
  from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
18
- from sklearn.preprocessing import OrdinalEncoder
19
20
 
21
+ from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
20
22
  from upgini.utils.features_validator import FeaturesValidator
21
23
  from upgini.utils.sklearn_ext import cross_validate
22
- from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
23
24
 
24
25
  try:
25
26
  from sklearn.metrics import get_scorer_names
@@ -31,12 +32,15 @@ except ImportError:
31
32
  available_scorers = SCORERS
32
33
  from sklearn.metrics import mean_squared_error
33
34
  from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
34
- from sklearn.model_selection import BaseCrossValidator, TimeSeriesSplit
35
+ from sklearn.model_selection import ( # , TimeSeriesSplit
36
+ BaseCrossValidator,
37
+ TimeSeriesSplit,
38
+ )
35
39
 
36
40
  from upgini.errors import ValidationError
37
41
  from upgini.metadata import ModelTaskType
38
42
  from upgini.resource_bundle import bundle
39
- from upgini.utils.target_utils import correct_string_target
43
+ from upgini.utils.target_utils import prepare_target
40
44
 
41
45
  DEFAULT_RANDOM_STATE = 42
42
46
 
@@ -87,19 +91,9 @@ CATBOOST_MULTICLASS_PARAMS = {
87
91
 
88
92
  LIGHTGBM_REGRESSION_PARAMS = {
89
93
  "random_state": DEFAULT_RANDOM_STATE,
90
- "min_gain_to_split": 0.001,
91
94
  "n_estimators": 275,
92
- "max_depth": 5,
93
- "max_cat_threshold": 80,
94
- "min_data_per_group": 25,
95
- "cat_l2": 10,
96
- "cat_smooth": 12,
97
- "learning_rate": 0.05,
98
95
  "feature_fraction": 1.0,
99
- "min_sum_hessian_in_leaf": 0.01,
100
- "objective": "huber",
101
96
  "deterministic": "true",
102
- # "force_col_wise": "true",
103
97
  "verbosity": -1,
104
98
  }
105
99
 
@@ -114,12 +108,10 @@ LIGHTGBM_MULTICLASS_PARAMS = {
114
108
  "cat_smooth": 18,
115
109
  "cat_l2": 8,
116
110
  "objective": "multiclass",
117
- # "class_weight": "balanced",
118
111
  "use_quantized_grad": "true",
119
112
  "num_grad_quant_bins": "8",
120
113
  "stochastic_rounding": "true",
121
114
  "deterministic": "true",
122
- # "force_col_wise": "true",
123
115
  "verbosity": -1,
124
116
  }
125
117
 
@@ -130,13 +122,11 @@ LIGHTGBM_BINARY_PARAMS = {
130
122
  "max_depth": 5,
131
123
  "learning_rate": 0.05,
132
124
  "objective": "binary",
133
- # "class_weight": "balanced",
134
125
  "max_cat_threshold": 80,
135
126
  "min_data_per_group": 20,
136
127
  "cat_smooth": 18,
137
128
  "cat_l2": 8,
138
129
  "deterministic": "true",
139
- # "force_col_wise": "true",
140
130
  "verbosity": -1,
141
131
  }
142
132
 
@@ -145,34 +135,6 @@ LIGHTGBM_EARLY_STOPPING_ROUNDS = 20
145
135
  N_FOLDS = 5
146
136
  BLOCKED_TS_TEST_SIZE = 0.2
147
137
 
148
- # NA_VALUES = [
149
- # "",
150
- # " ",
151
- # " ",
152
- # "#n/a",
153
- # "#n/a n/a",
154
- # "#na",
155
- # "-1.#ind",
156
- # "-1.#qnan",
157
- # "-nan",
158
- # "1.#ind",
159
- # "1.#qnan",
160
- # "n/a",
161
- # "na",
162
- # "null",
163
- # "nan",
164
- # "n/a",
165
- # "nan",
166
- # "none",
167
- # "-",
168
- # "undefined",
169
- # "[[unknown]]",
170
- # "[not provided]",
171
- # "[unknown]",
172
- # ]
173
-
174
- # NA_REPLACEMENT = "NA"
175
-
176
138
  SUPPORTED_CATBOOST_METRICS = {
177
139
  s.upper(): s
178
140
  for s in (
@@ -282,11 +244,55 @@ class _CrossValResults:
282
244
  return f"{self.metric:.3f} ± {self.metric_std:.3f}"
283
245
 
284
246
 
247
+ def is_numeric_object(x: pd.Series) -> bool:
248
+ try:
249
+ pd.to_numeric(x, errors="raise")
250
+ return True
251
+ except (ValueError, TypeError):
252
+ return False
253
+
254
+
255
+ def is_valid_numeric_array_data(data: pd.Series) -> bool:
256
+ data_without_na = data.dropna()
257
+ if data_without_na.empty:
258
+ return False
259
+
260
+ first_element = data_without_na.iloc[0]
261
+
262
+ # numpy.ndarray with numeric types
263
+ if isinstance(first_element, np.ndarray):
264
+ return np.issubdtype(first_element.dtype, np.number)
265
+
266
+ # DataFrame with all numeric columns
267
+ elif isinstance(first_element, pd.DataFrame):
268
+ return all(np.issubdtype(dtype, np.number) for dtype in first_element.dtypes)
269
+
270
+ # list or list of lists with numeric types
271
+ elif isinstance(first_element, list):
272
+ try:
273
+ # flat list
274
+ if all(isinstance(x, (int, float, np.number)) or pd.isna(x) for x in first_element):
275
+ return True
276
+ # list of lists
277
+ elif all(
278
+ isinstance(x, list) and all(isinstance(y, (int, float, np.number)) or pd.isna(y) for y in x)
279
+ for x in first_element
280
+ ):
281
+ return True
282
+ except Exception:
283
+ return False
284
+
285
+ return False
286
+
287
+
285
288
  class EstimatorWrapper:
289
+ default_estimator: Literal["catboost", "lightgbm"] = "catboost"
290
+
286
291
  def __init__(
287
292
  self,
288
293
  estimator,
289
294
  scorer: Callable,
295
+ cat_features: Optional[List[str]],
290
296
  metric_name: str,
291
297
  multiplier: int,
292
298
  cv: BaseCrossValidator,
@@ -298,9 +304,8 @@ class EstimatorWrapper:
298
304
  ):
299
305
  self.estimator = estimator
300
306
  self.scorer = scorer
301
- self.metric_name = (
302
- "GINI" if metric_name.upper() == "ROC_AUC" and target_type == ModelTaskType.BINARY else metric_name
303
- )
307
+ self.cat_features = cat_features
308
+ self.metric_name = metric_name
304
309
  self.multiplier = multiplier
305
310
  self.cv = cv
306
311
  self.target_type = target_type
@@ -309,6 +314,10 @@ class EstimatorWrapper:
309
314
  self.groups = groups
310
315
  self.text_features = text_features
311
316
  self.logger = logger or logging.getLogger()
317
+ self.droped_features = []
318
+ self.converted_to_int = []
319
+ self.converted_to_str = []
320
+ self.converted_to_numeric = []
312
321
 
313
322
  def fit(self, x: pd.DataFrame, y: np.ndarray, **kwargs):
314
323
  x, y, _, fit_params = self._prepare_to_fit(x, y)
@@ -316,22 +325,13 @@ class EstimatorWrapper:
316
325
  self.estimator.fit(x, y, **kwargs)
317
326
  return self
318
327
 
319
- def predict(self, **kwargs):
320
- return self.estimator.predict(**kwargs)
321
-
322
- def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
323
- x, y, groups = self._prepare_data(x, y, groups=self.groups)
324
- return x, y, groups, {}
328
+ def predict(self, x: pd.DataFrame, **kwargs):
329
+ x, _, _ = self._prepare_to_calculate(x, None)
330
+ return self.estimator.predict(x, **kwargs)
325
331
 
326
332
  def _prepare_data(
327
333
  self, x: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
328
334
  ) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
329
- self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
330
- for c in x.columns:
331
- if is_numeric_dtype(x[c]):
332
- x[c] = x[c].astype(float)
333
- elif not x[c].dtype == "category":
334
- x[c] = x[c].astype(str)
335
335
 
336
336
  if not isinstance(y, pd.Series):
337
337
  raise Exception(bundle.get("metrics_unsupported_target_type").format(type(y)))
@@ -345,6 +345,8 @@ class EstimatorWrapper:
345
345
  else:
346
346
  x, y = self._remove_empty_target_rows(x, y)
347
347
 
348
+ y = prepare_target(y, self.target_type)
349
+
348
350
  self.logger.info(f"After preparing data columns: {x.columns.to_list()}")
349
351
  return x, y, groups
350
352
 
@@ -357,8 +359,84 @@ class EstimatorWrapper:
357
359
 
358
360
  return x, y
359
361
 
362
+ def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
363
+ x, y, groups = self._prepare_data(x, y, groups=self.groups)
364
+
365
+ self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
366
+ self.droped_features = []
367
+ self.converted_to_int = []
368
+ self.converted_to_str = []
369
+ self.converted_to_numeric = []
370
+ for c in x.columns:
371
+
372
+ if _get_unique_count(x[c]) < 2:
373
+ self.logger.warning(f"Remove feature {c} because it has less than 2 unique values")
374
+ if c in self.cat_features:
375
+ self.cat_features.remove(c)
376
+ x.drop(columns=[c], inplace=True)
377
+ self.droped_features.append(c)
378
+ elif self.text_features is not None and c in self.text_features:
379
+ x[c] = x[c].astype(str)
380
+ self.converted_to_str.append(c)
381
+ elif c in self.cat_features:
382
+ if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
383
+ x[c] = x[c].astype(np.int64)
384
+ self.converted_to_int.append(c)
385
+ elif x[c].dtype == "category" and is_integer_dtype(x[c].cat.categories):
386
+ self.logger.info(
387
+ f"Convert categorical feature {c} with integer categories"
388
+ " to int64 and remove from cat_features"
389
+ )
390
+ x[c] = x[c].astype(np.int64)
391
+ self.converted_to_int.append(c)
392
+ self.cat_features.remove(c)
393
+ elif is_float_dtype(x[c]) or (x[c].dtype == "category" and is_float_dtype(x[c].cat.categories)):
394
+ self.logger.info(
395
+ f"Convert float cat feature {c} to string"
396
+ )
397
+ x[c] = x[c].astype(str)
398
+ self.converted_to_str.append(c)
399
+ elif x[c].dtype not in ["category", "int64"]:
400
+ x[c] = x[c].astype(str)
401
+ self.converted_to_str.append(c)
402
+ else:
403
+ if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
404
+ self.logger.info(f"Convert bool feature {c} to int64")
405
+ x[c] = x[c].astype(np.int64)
406
+ self.converted_to_int.append(c)
407
+ elif not is_valid_numeric_array_data(x[c]) and not is_numeric_dtype(x[c]):
408
+ try:
409
+ x[c] = pd.to_numeric(x[c], errors="raise")
410
+ self.converted_to_numeric.append(c)
411
+ except (ValueError, TypeError):
412
+ self.logger.warning(f"Remove feature {c} because it is not numeric and not in cat_features")
413
+ x.drop(columns=[c], inplace=True)
414
+ self.droped_features.append(c)
415
+
416
+ return x, y, groups, {}
417
+
360
418
  def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
361
419
  x, y, _ = self._prepare_data(x, y)
420
+
421
+ if self.droped_features:
422
+ self.logger.info(f"Drop features on calculate metrics: {self.droped_features}")
423
+ x = x.drop(columns=self.droped_features)
424
+
425
+ if self.converted_to_int:
426
+ self.logger.info(f"Convert to int features on calculate metrics: {self.converted_to_int}")
427
+ for c in self.converted_to_int:
428
+ x[c] = x[c].astype(np.int64)
429
+
430
+ if self.converted_to_str:
431
+ self.logger.info(f"Convert to str features on calculate metrics: {self.converted_to_str}")
432
+ for c in self.converted_to_str:
433
+ x[c] = x[c].astype(str)
434
+
435
+ if self.converted_to_numeric:
436
+ self.logger.info(f"Convert to numeric features on calculate metrics: {self.converted_to_numeric}")
437
+ for c in self.converted_to_numeric:
438
+ x[c] = pd.to_numeric(x[c], errors="coerce")
439
+
362
440
  return x, y, {}
363
441
 
364
442
  def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
@@ -378,7 +456,10 @@ class EstimatorWrapper:
378
456
  if baseline_score_column is not None and self.metric_name == "GINI":
379
457
  self.logger.info("Calculate baseline GINI on passed baseline_score_column and target")
380
458
  metric = roc_auc_score(y, x[baseline_score_column])
459
+ metric_std = None
460
+ average_shap_values = None
381
461
  else:
462
+ self.logger.info(f"Cross validate with estimeator: {self.estimator}")
382
463
  cv_results = cross_validate(
383
464
  estimator=self.estimator,
384
465
  x=x,
@@ -409,7 +490,6 @@ class EstimatorWrapper:
409
490
  shaps = self.calculate_shap(cv_x, cv_y, estimator)
410
491
  if shaps is not None:
411
492
  for feature, shap_value in shaps.items():
412
- # shap_values_all_folds[feature] = shap_values_all_folds.get(feature, []) + shap_value.tolist()
413
493
  shap_values_all_folds[feature].append(shap_value)
414
494
 
415
495
  if shap_values_all_folds:
@@ -465,7 +545,7 @@ class EstimatorWrapper:
465
545
  logger: logging.Logger,
466
546
  target_type: ModelTaskType,
467
547
  cv: BaseCrossValidator,
468
- x: pd.DataFrame,
548
+ *,
469
549
  scoring: Union[Callable, str, None] = None,
470
550
  cat_features: Optional[List[str]] = None,
471
551
  text_features: Optional[List[str]] = None,
@@ -473,9 +553,10 @@ class EstimatorWrapper:
473
553
  groups: Optional[List[str]] = None,
474
554
  has_date: Optional[bool] = None,
475
555
  ) -> EstimatorWrapper:
476
- scorer, metric_name, multiplier = _get_scorer(target_type, scoring)
556
+ scorer, metric_name, multiplier = define_scorer(target_type, scoring)
477
557
  kwargs = {
478
558
  "scorer": scorer,
559
+ "cat_features": cat_features,
479
560
  "metric_name": metric_name,
480
561
  "multiplier": multiplier,
481
562
  "cv": cv,
@@ -485,22 +566,43 @@ class EstimatorWrapper:
485
566
  "logger": logger,
486
567
  }
487
568
  if estimator is None:
488
- params = {"random_state": DEFAULT_RANDOM_STATE, "verbose": -1}
489
- if target_type == ModelTaskType.MULTICLASS:
490
- params = _get_add_params(params, LIGHTGBM_MULTICLASS_PARAMS)
491
- params = _get_add_params(params, add_params)
492
- estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
493
- elif target_type == ModelTaskType.BINARY:
494
- params = _get_add_params(params, LIGHTGBM_BINARY_PARAMS)
495
- params = _get_add_params(params, add_params)
496
- estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
497
- elif target_type == ModelTaskType.REGRESSION:
498
- if not isinstance(cv, TimeSeriesSplit) and not isinstance(cv, BlockedTimeSeriesSplit):
499
- params = _get_add_params(params, LIGHTGBM_REGRESSION_PARAMS)
500
- params = _get_add_params(params, add_params)
501
- estimator = LightGBMWrapper(LGBMRegressor(**params), **kwargs)
569
+ if EstimatorWrapper.default_estimator == "catboost":
570
+ logger.info("Using CatBoost as default estimator")
571
+ params = {"has_time": has_date}
572
+ if target_type == ModelTaskType.MULTICLASS:
573
+ params = _get_add_params(params, CATBOOST_MULTICLASS_PARAMS)
574
+ params = _get_add_params(params, add_params)
575
+ estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
576
+ elif target_type == ModelTaskType.BINARY:
577
+ params = _get_add_params(params, CATBOOST_BINARY_PARAMS)
578
+ params = _get_add_params(params, add_params)
579
+ estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
580
+ elif target_type == ModelTaskType.REGRESSION:
581
+ params = _get_add_params(params, CATBOOST_REGRESSION_PARAMS)
582
+ params = _get_add_params(params, add_params)
583
+ estimator = CatBoostWrapper(CatBoostRegressor(**params), **kwargs)
584
+ else:
585
+ raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
586
+ elif EstimatorWrapper.default_estimator == "lightgbm":
587
+ logger.info("Using LightGBM as default estimator")
588
+ params = {"random_state": DEFAULT_RANDOM_STATE, "verbose": -1}
589
+ if target_type == ModelTaskType.MULTICLASS:
590
+ params = _get_add_params(params, LIGHTGBM_MULTICLASS_PARAMS)
591
+ params = _get_add_params(params, add_params)
592
+ estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
593
+ elif target_type == ModelTaskType.BINARY:
594
+ params = _get_add_params(params, LIGHTGBM_BINARY_PARAMS)
595
+ params = _get_add_params(params, add_params)
596
+ estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
597
+ elif target_type == ModelTaskType.REGRESSION:
598
+ if not isinstance(cv, TimeSeriesSplit) and not isinstance(cv, BlockedTimeSeriesSplit):
599
+ params = _get_add_params(params, LIGHTGBM_REGRESSION_PARAMS)
600
+ params = _get_add_params(params, add_params)
601
+ estimator = LightGBMWrapper(LGBMRegressor(**params), **kwargs)
602
+ else:
603
+ raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
502
604
  else:
503
- raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
605
+ raise Exception("Unsupported default_estimator. Available: catboost, lightgbm")
504
606
  else:
505
607
  if hasattr(estimator, "copy"):
506
608
  estimator_copy = estimator.copy()
@@ -508,19 +610,12 @@ class EstimatorWrapper:
508
610
  estimator_copy = deepcopy(estimator)
509
611
  kwargs["estimator"] = estimator_copy
510
612
  if is_catboost_estimator(estimator):
511
- if cat_features is not None:
512
- for cat_feature in cat_features:
513
- if cat_feature not in x.columns:
514
- logger.error(
515
- f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
516
- )
517
- estimator_copy.set_params(cat_features=cat_features, has_time=has_date)
613
+ if has_date is not None:
614
+ estimator_copy.set_params(has_time=has_date)
518
615
  estimator = CatBoostWrapper(**kwargs)
519
616
  else:
520
617
  if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
521
618
  estimator = LightGBMWrapper(**kwargs)
522
- elif is_catboost_estimator(estimator):
523
- estimator = CatBoostWrapper(**kwargs)
524
619
  else:
525
620
  logger.warning(
526
621
  f"Unexpected estimator is used for metrics: {estimator}. "
@@ -536,6 +631,7 @@ class CatBoostWrapper(EstimatorWrapper):
536
631
  self,
537
632
  estimator,
538
633
  scorer: Callable,
634
+ cat_features: Optional[List[str]],
539
635
  metric_name: str,
540
636
  multiplier: int,
541
637
  cv: BaseCrossValidator,
@@ -547,6 +643,7 @@ class CatBoostWrapper(EstimatorWrapper):
547
643
  super(CatBoostWrapper, self).__init__(
548
644
  estimator,
549
645
  scorer,
646
+ cat_features,
550
647
  metric_name,
551
648
  multiplier,
552
649
  cv,
@@ -555,10 +652,8 @@ class CatBoostWrapper(EstimatorWrapper):
555
652
  text_features=text_features,
556
653
  logger=logger,
557
654
  )
558
- self.cat_features = None
559
655
  self.emb_features = None
560
656
  self.grouped_embedding_features = None
561
- self.exclude_features = []
562
657
 
563
658
  def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
564
659
  x, y, groups, params = super()._prepare_to_fit(x, y)
@@ -567,76 +662,60 @@ class CatBoostWrapper(EstimatorWrapper):
567
662
  import catboost
568
663
  from catboost import CatBoostClassifier
569
664
 
570
- if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
665
+ if not hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
666
+ self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")
667
+ else:
571
668
  emb_pattern = r"(.+)_emb\d+"
572
669
  self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
573
- if len(self.emb_features) > 3: # There is no reason to reduce embeddings dimension with less than 4
574
- self.logger.info(
575
- "Embedding features count more than 3, so group them into one vector for CatBoost: "
576
- f"{self.emb_features}"
577
- )
578
- x, self.grouped_embedding_features = self.group_embeddings(x)
670
+ x, self.grouped_embedding_features = self.group_embeddings(x)
671
+ if len(self.grouped_embedding_features) > 0:
579
672
  params["embedding_features"] = self.grouped_embedding_features
580
- else:
581
- self.logger.info(f"Embedding features count less than 3, so use them separately: {self.emb_features}")
582
- self.grouped_embedding_features = None
583
- else:
584
- self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")
585
673
 
586
674
  # Find text features from passed in generate_features
587
- if hasattr(CatBoostClassifier, "get_text_feature_indices"):
675
+ if not hasattr(CatBoostClassifier, "get_text_feature_indices"):
676
+ self.text_features = None
677
+ self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
678
+ else:
588
679
  if self.text_features is not None:
589
680
  self.logger.info(f"Passed text features for CatBoost: {self.text_features}")
590
681
  self.text_features = [f for f in self.text_features if f in x.columns and not is_numeric_dtype(x[f])]
591
682
  self.logger.info(f"Rest text features after checks: {self.text_features}")
592
683
  params["text_features"] = self.text_features
593
- else:
594
- self.text_features = None
595
- self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
596
684
 
597
685
  # Find rest categorical features
598
- self.cat_features = _get_cat_features(x, self.text_features, self.grouped_embedding_features)
599
- # x = fill_na_cat_features(x, self.cat_features)
600
- unique_cat_features = []
601
- for name in self.cat_features:
602
- # Remove constant categorical features
603
- if x[name].nunique() > 1:
604
- unique_cat_features.append(name)
605
- else:
606
- self.logger.info(f"Drop column {name} on preparing data for fit")
607
- x = x.drop(columns=name)
608
- self.exclude_features.append(name)
609
- self.cat_features = unique_cat_features
610
- if (
611
- hasattr(self.estimator, "get_param")
612
- and hasattr(self.estimator, "_init_params")
613
- and self.estimator.get_param("cat_features") is not None
614
- ):
615
- estimator_cat_features = self.estimator.get_param("cat_features")
616
- if all([isinstance(c, int) for c in estimator_cat_features]):
617
- cat_features_idx = {x.columns.get_loc(c) for c in self.cat_features}
618
- cat_features_idx.update(estimator_cat_features)
619
- self.cat_features = [x.columns[idx] for idx in cat_features_idx]
620
- elif all([isinstance(c, str) for c in estimator_cat_features]):
621
- self.cat_features = list(set(self.cat_features + estimator_cat_features))
622
- else:
623
- print(f"WARNING: Unsupported type of cat_features in CatBoost estimator: {estimator_cat_features}")
624
-
625
- del self.estimator._init_params["cat_features"]
626
-
627
- self.logger.info(f"Selected categorical features: {self.cat_features}")
628
- params["cat_features"] = self.cat_features
686
+ self.cat_features = [
687
+ f
688
+ for f in self.cat_features
689
+ if f not in (self.text_features or []) and f not in (self.grouped_embedding_features or [])
690
+ ]
691
+ if self.cat_features:
692
+ for c in self.cat_features:
693
+ if is_numeric_dtype(x[c]):
694
+ x[c] = x[c].fillna(np.nan)
695
+ elif x[c].dtype != "category":
696
+ x[c] = x[c].fillna("NA")
697
+ params["cat_features"] = self.cat_features
629
698
 
630
699
  return x, y, groups, params
631
700
 
632
701
  def group_embeddings(self, df: pd.DataFrame):
633
- emb_name = "__grouped_embeddings"
634
- df = df.copy()
635
- df[self.emb_features] = df[self.emb_features].fillna(0.0)
636
- df[emb_name] = pd.Series(df[self.emb_features].values.tolist())
637
- df = df.drop(columns=self.emb_features)
638
-
639
- return df, [emb_name]
702
+ embeddings_columns = []
703
+ if len(self.emb_features) > 3:
704
+ self.logger.info(
705
+ "Embedding features count more than 3, so group them into one vector for CatBoost: "
706
+ f"{self.emb_features}"
707
+ )
708
+ emb_name = "__grouped_embeddings"
709
+ df = df.copy()
710
+ df[self.emb_features] = df[self.emb_features].fillna(0.0)
711
+ embeddings_series = pd.Series(df[self.emb_features].values.tolist(), index=df.index)
712
+ df = pd.concat([df.drop(columns=self.emb_features), pd.DataFrame({emb_name: embeddings_series})], axis=1)
713
+ embeddings_columns.append(emb_name)
714
+ for c in df.columns:
715
+ if is_valid_numeric_array_data(df[c]):
716
+ embeddings_columns.append(c)
717
+
718
+ return df, embeddings_columns
640
719
 
641
720
  def process_shap_values(self, shap_values: Dict[str, float]) -> Dict[str, float]:
642
721
  if "__grouped_embeddings" in shap_values:
@@ -646,16 +725,19 @@ class CatBoostWrapper(EstimatorWrapper):
646
725
  return shap_values
647
726
 
648
727
  def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
649
- if self.exclude_features:
650
- x = x.drop(columns=self.exclude_features)
651
728
  x, y, params = super()._prepare_to_calculate(x, y)
652
729
  if self.text_features:
653
730
  params["text_features"] = self.text_features
654
731
  if self.grouped_embedding_features:
655
732
  x, emb_columns = self.group_embeddings(x)
656
733
  params["embedding_features"] = emb_columns
734
+
657
735
  if self.cat_features:
658
- # x = fill_na_cat_features(x, self.cat_features)
736
+ for c in self.cat_features:
737
+ if is_numeric_dtype(x[c]):
738
+ x[c] = x[c].fillna(np.nan)
739
+ elif x[c].dtype != "category":
740
+ x[c] = x[c].fillna("NA")
659
741
  params["cat_features"] = self.cat_features
660
742
 
661
743
  return x, y, params
@@ -681,7 +763,7 @@ class CatBoostWrapper(EstimatorWrapper):
681
763
  )
682
764
  for f in high_cardinality_features:
683
765
  self.text_features.remove(f)
684
- self.exclude_features.append(f)
766
+ self.droped_features.append(f)
685
767
  x = x.drop(columns=f, errors="ignore")
686
768
  return super().cross_val_predict(x, y, baseline_score_column)
687
769
  else:
@@ -700,23 +782,29 @@ class CatBoostWrapper(EstimatorWrapper):
700
782
  embedding_features=self.grouped_embedding_features,
701
783
  )
702
784
 
703
- # Get SHAP values of current estimator
704
- shap_values_fold = estimator.get_feature_importance(data=fold_pool, type="ShapValues")
785
+ shap_values = estimator.get_feature_importance(data=fold_pool, type="ShapValues")
705
786
 
706
- # Remove last columns (base value) and flatten
707
787
  if self.target_type == ModelTaskType.MULTICLASS:
708
- all_shaps = shap_values_fold[:, :, :-1]
709
- all_shaps = [all_shaps[:, :, k].flatten() for k in range(all_shaps.shape[2])]
788
+ # For multiclass, shap_values has shape (n_samples, n_classes, n_features + 1)
789
+ # Last column is bias term
790
+ shap_values = shap_values[:, :, :-1] # Remove bias term
791
+ # Average SHAP values across classes
792
+ shap_values = np.mean(np.abs(shap_values), axis=1)
710
793
  else:
711
- all_shaps = shap_values_fold[:, :-1]
712
- all_shaps = [all_shaps[:, k].flatten() for k in range(all_shaps.shape[1])]
794
+ # For binary/regression, shap_values has shape (n_samples, n_features + 1)
795
+ # Last column is bias term
796
+ shap_values = shap_values[:, :-1] # Remove bias term
797
+ # Take absolute values
798
+ shap_values = np.abs(shap_values)
713
799
 
714
- all_shaps = np.abs(all_shaps)
800
+ feature_importance = {}
801
+ for i, col in enumerate(x.columns):
802
+ feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
715
803
 
716
- return dict(zip(estimator.feature_names_, all_shaps))
804
+ return feature_importance
717
805
 
718
- except Exception:
719
- self.logger.exception("Failed to recalculate new SHAP values")
806
+ except Exception as e:
807
+ self.logger.exception(f"Failed to recalculate new SHAP values: {str(e)}")
720
808
  return None
721
809
 
722
810
 
@@ -725,6 +813,7 @@ class LightGBMWrapper(EstimatorWrapper):
725
813
  self,
726
814
  estimator,
727
815
  scorer: Callable,
816
+ cat_features: Optional[List[str]],
728
817
  metric_name: str,
729
818
  multiplier: int,
730
819
  cv: BaseCrossValidator,
@@ -736,6 +825,7 @@ class LightGBMWrapper(EstimatorWrapper):
736
825
  super(LightGBMWrapper, self).__init__(
737
826
  estimator,
738
827
  scorer,
828
+ cat_features,
739
829
  metric_name,
740
830
  multiplier,
741
831
  cv,
@@ -744,7 +834,6 @@ class LightGBMWrapper(EstimatorWrapper):
744
834
  text_features=text_features,
745
835
  logger=logger,
746
836
  )
747
- self.cat_features = None
748
837
  self.cat_encoder = None
749
838
  self.n_classes = None
750
839
 
@@ -756,30 +845,23 @@ class LightGBMWrapper(EstimatorWrapper):
756
845
  if self.target_type == ModelTaskType.BINARY:
757
846
  params["eval_metric"] = "auc"
758
847
  params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
759
- self.cat_features = _get_cat_features(x)
760
848
  if self.cat_features:
761
- # x = fill_na_cat_features(x, self.cat_features)
762
- encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)
763
- encoded = pd.DataFrame(
764
- encoder.fit_transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
765
- )
849
+ encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, cols=self.cat_features, return_df=True)
850
+ encoded = encoder.fit_transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
766
851
  x[self.cat_features] = encoded
767
852
  self.cat_encoder = encoder
768
- if not is_numeric_dtype(y_numpy):
769
- y_numpy = correct_string_target(y_numpy)
770
-
853
+ for c in x.columns:
854
+ if x[c].dtype not in ["category", "int64", "float64", "bool"]:
855
+ self.logger.warning(f"Feature {c} is not numeric and will be dropped")
856
+ self.droped_features.append(c)
857
+ x = x.drop(columns=c, errors="ignore")
771
858
  return x, y_numpy, groups, params
772
859
 
773
860
  def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
774
861
  x, y_numpy, params = super()._prepare_to_calculate(x, y)
775
- if self.cat_features is not None:
776
- # x = fill_na_cat_features(x, self.cat_features)
777
- if self.cat_encoder is not None:
778
- x[self.cat_features] = pd.DataFrame(
779
- self.cat_encoder.transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
780
- )
781
- if not is_numeric_dtype(y):
782
- y_numpy = correct_string_target(y_numpy)
862
+ if self.cat_features is not None and self.cat_encoder is not None:
863
+ encoded = self.cat_encoder.transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
864
+ x[self.cat_features] = encoded
783
865
  return x, y_numpy, params
784
866
 
785
867
  def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
@@ -805,20 +887,6 @@ class LightGBMWrapper(EstimatorWrapper):
805
887
  for i, col in enumerate(x.columns):
806
888
  feature_importance[col] = np.mean(np.abs(shap_matrix[:, i]))
807
889
 
808
- # # exclude last column (base value)
809
- # shap_values_only = shap_values[:, :-1]
810
- # mean_abs_shap = np.mean(np.abs(shap_values_only), axis=0)
811
-
812
- # # For classification, shap_values is returned as a list for each class
813
- # # Take values for the positive class
814
- # if isinstance(shap_values, list):
815
- # shap_values = shap_values[1]
816
-
817
- # # Calculate mean absolute SHAP value for each feature
818
- # feature_importance = {}
819
- # for i, col in enumerate(x.columns):
820
- # feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
821
-
822
890
  return feature_importance
823
891
 
824
892
  except Exception as e:
@@ -831,6 +899,7 @@ class OtherEstimatorWrapper(EstimatorWrapper):
831
899
  self,
832
900
  estimator,
833
901
  scorer: Callable,
902
+ cat_features: Optional[List[str]],
834
903
  metric_name: str,
835
904
  multiplier: int,
836
905
  cv: BaseCrossValidator,
@@ -842,6 +911,7 @@ class OtherEstimatorWrapper(EstimatorWrapper):
842
911
  super(OtherEstimatorWrapper, self).__init__(
843
912
  estimator,
844
913
  scorer,
914
+ cat_features,
845
915
  metric_name,
846
916
  multiplier,
847
917
  cv,
@@ -850,33 +920,33 @@ class OtherEstimatorWrapper(EstimatorWrapper):
850
920
  text_features=text_features,
851
921
  logger=logger,
852
922
  )
853
- self.cat_features = None
854
923
 
855
924
  def _prepare_to_fit(self, x: pd.DataFrame, y: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
856
- x, y, groups, params = super()._prepare_to_fit(x, y)
857
- self.cat_features = _get_cat_features(x)
925
+ x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
858
926
  num_features = [col for col in x.columns if col not in self.cat_features]
859
927
  x[num_features] = x[num_features].fillna(-999)
860
- # x = fill_na_cat_features(x, self.cat_features)
861
- # TODO use one-hot encoding if cardinality is less 50
862
- for feature in self.cat_features:
863
- x[feature] = x[feature].astype("category").cat.codes
864
- if not is_numeric_dtype(y):
865
- y = correct_string_target(y)
866
- return x, y, groups, params
928
+ if self.cat_features:
929
+ encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, return_df=True)
930
+ encoded = encoder.fit_transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
931
+ x[self.cat_features] = encoded
932
+ self.cat_encoder = encoder
933
+ for c in x.columns:
934
+ if x[c].dtype not in ["category", "int64", "float64", "bool"]:
935
+ self.logger.warning(f"Feature {c} is not numeric and will be dropped")
936
+ self.droped_features.append(c)
937
+ x = x.drop(columns=c, errors="ignore")
938
+ return x, y_numpy, groups, params
867
939
 
868
940
  def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
869
- x, y, params = super()._prepare_to_calculate(x, y)
941
+ x, y_numpy, params = super()._prepare_to_calculate(x, y)
870
942
  if self.cat_features is not None:
871
943
  num_features = [col for col in x.columns if col not in self.cat_features]
872
944
  x[num_features] = x[num_features].fillna(-999)
873
- # x = fill_na_cat_features(x, self.cat_features)
874
- # TODO use one-hot encoding if cardinality is less 50
875
- for feature in self.cat_features:
876
- x[feature] = x[feature].astype("category").cat.codes
877
- if not is_numeric_dtype(y):
878
- y = correct_string_target(y)
879
- return x, y, params
945
+ if self.cat_features and self.cat_encoder is not None:
946
+ x[self.cat_features] = self.cat_encoder.transform(
947
+ x[self.cat_features].astype("object"), y_numpy
948
+ ).astype("category")
949
+ return x, y_numpy, params
880
950
 
881
951
 
882
952
  def validate_scoring_argument(scoring: Union[Callable, str, None]):
@@ -938,7 +1008,7 @@ def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
938
1008
  return scoring, metric_name, multiplier
939
1009
 
940
1010
 
941
- def _get_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None]) -> Tuple[Callable, str, int]:
1011
+ def define_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None]) -> Tuple[Callable, str, int]:
942
1012
  if scoring is None:
943
1013
  if target_type == ModelTaskType.BINARY:
944
1014
  scoring = "roc_auc"
@@ -957,16 +1027,9 @@ def _get_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None])
957
1027
  else:
958
1028
  metric_name = str(scoring)
959
1029
 
960
- return scoring, metric_name, multiplier
961
-
1030
+ metric_name = "GINI" if metric_name.upper() == "ROC_AUC" and target_type == ModelTaskType.BINARY else metric_name
962
1031
 
963
- def _get_cat_features(
964
- x: pd.DataFrame, text_features: Optional[List[str]] = None, emb_features: Optional[List[str]] = None
965
- ) -> List[str]:
966
- text_features = text_features or []
967
- emb_features = emb_features or []
968
- exclude_features = text_features + emb_features
969
- return [c for c in x.columns if c not in exclude_features and not is_numeric_dtype(x[c])]
1032
+ return scoring, metric_name, multiplier
970
1033
 
971
1034
 
972
1035
  def _get_add_params(input_params, add_params):
@@ -1056,10 +1119,8 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutp
1056
1119
  return mse if squared else np.sqrt(mse)
1057
1120
 
1058
1121
 
1059
- # def fill_na_cat_features(df: pd.DataFrame, cat_features: List[str]) -> pd.DataFrame:
1060
- # for c in cat_features:
1061
- # if c in df.columns:
1062
- # df[c] = df[c].astype("string").fillna(NA_REPLACEMENT).astype(str)
1063
- # na_filter = df[c].str.lower().isin(NA_VALUES)
1064
- # df.loc[na_filter, c] = NA_REPLACEMENT
1065
- # return df
1122
+ def _get_unique_count(series: pd.Series) -> int:
1123
+ try:
1124
+ return series.nunique(dropna=False)
1125
+ except TypeError:
1126
+ return series.astype(str).nunique(dropna=False)