upgini 1.2.80__py3-none-any.whl → 1.2.81__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/autofe/binary.py +2 -2
- upgini/autofe/timeseries/volatility.py +6 -4
- upgini/features_enricher.py +155 -91
- upgini/http.py +21 -21
- upgini/mdc/__init__.py +1 -1
- upgini/metadata.py +1 -1
- upgini/metrics.py +289 -228
- upgini/resource_bundle/strings.properties +1 -1
- upgini/search_task.py +1 -0
- upgini/utils/display_utils.py +12 -7
- upgini/utils/target_utils.py +9 -6
- {upgini-1.2.80.dist-info → upgini-1.2.81.dist-info}/METADATA +3 -1
- {upgini-1.2.80.dist-info → upgini-1.2.81.dist-info}/RECORD +16 -16
- {upgini-1.2.80.dist-info → upgini-1.2.81.dist-info}/WHEEL +0 -0
- {upgini-1.2.80.dist-info → upgini-1.2.81.dist-info}/licenses/LICENSE +0 -0
upgini/metrics.py
CHANGED
```diff
@@ -6,20 +6,21 @@ import re
 from collections import defaultdict
 from copy import deepcopy
 from dataclasses import dataclass
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union
 
 import lightgbm as lgb
 import numpy as np
 import pandas as pd
+from catboost import CatBoostClassifier, CatBoostRegressor
+from category_encoders.cat_boost import CatBoostEncoder
 from lightgbm import LGBMClassifier, LGBMRegressor
 from numpy import log1p
-from pandas.api.types import is_numeric_dtype
+from pandas.api.types import is_numeric_dtype, is_integer_dtype, is_float_dtype
 from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
-from sklearn.preprocessing import OrdinalEncoder
 
+from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
 from upgini.utils.features_validator import FeaturesValidator
 from upgini.utils.sklearn_ext import cross_validate
-from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
 
 try:
     from sklearn.metrics import get_scorer_names
@@ -31,12 +32,15 @@ except ImportError:
     available_scorers = SCORERS
     from sklearn.metrics import mean_squared_error
     from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
-from sklearn.model_selection import BaseCrossValidator  # , TimeSeriesSplit
+from sklearn.model_selection import (  # , TimeSeriesSplit
+    BaseCrossValidator,
+    TimeSeriesSplit,
+)
 
 from upgini.errors import ValidationError
 from upgini.metadata import ModelTaskType
 from upgini.resource_bundle import bundle
-from upgini.utils.target_utils import correct_string_target
+from upgini.utils.target_utils import prepare_target
 
 DEFAULT_RANDOM_STATE = 42
 
@@ -87,19 +91,9 @@ CATBOOST_MULTICLASS_PARAMS = {
 
 LIGHTGBM_REGRESSION_PARAMS = {
     "random_state": DEFAULT_RANDOM_STATE,
-    "min_gain_to_split": 0.001,
     "n_estimators": 275,
-    "max_depth": 5,
-    "max_cat_threshold": 80,
-    "min_data_per_group": 25,
-    "cat_l2": 10,
-    "cat_smooth": 12,
-    "learning_rate": 0.05,
     "feature_fraction": 1.0,
-    "min_sum_hessian_in_leaf": 0.01,
-    "objective": "huber",
     "deterministic": "true",
-    # "force_col_wise": "true",
     "verbosity": -1,
 }
 
@@ -114,12 +108,10 @@ LIGHTGBM_MULTICLASS_PARAMS = {
     "cat_smooth": 18,
     "cat_l2": 8,
    "objective": "multiclass",
-    # "class_weight": "balanced",
     "use_quantized_grad": "true",
     "num_grad_quant_bins": "8",
     "stochastic_rounding": "true",
     "deterministic": "true",
-    # "force_col_wise": "true",
     "verbosity": -1,
 }
 
@@ -130,13 +122,11 @@ LIGHTGBM_BINARY_PARAMS = {
     "max_depth": 5,
     "learning_rate": 0.05,
     "objective": "binary",
-    # "class_weight": "balanced",
     "max_cat_threshold": 80,
     "min_data_per_group": 20,
     "cat_smooth": 18,
     "cat_l2": 8,
     "deterministic": "true",
-    # "force_col_wise": "true",
     "verbosity": -1,
 }
 
@@ -145,34 +135,6 @@ LIGHTGBM_EARLY_STOPPING_ROUNDS = 20
 N_FOLDS = 5
 BLOCKED_TS_TEST_SIZE = 0.2
 
-# NA_VALUES = [
-#     "",
-#     " ",
-#     "  ",
-#     "#n/a",
-#     "#n/a n/a",
-#     "#na",
-#     "-1.#ind",
-#     "-1.#qnan",
-#     "-nan",
-#     "1.#ind",
-#     "1.#qnan",
-#     "n/a",
-#     "na",
-#     "null",
-#     "nan",
-#     "n/a",
-#     "nan",
-#     "none",
-#     "-",
-#     "undefined",
-#     "[[unknown]]",
-#     "[not provided]",
-#     "[unknown]",
-# ]
-
-# NA_REPLACEMENT = "NA"
-
 SUPPORTED_CATBOOST_METRICS = {
     s.upper(): s
     for s in (
@@ -282,11 +244,55 @@ class _CrossValResults:
         return f"{self.metric:.3f} ± {self.metric_std:.3f}"
 
 
+def is_numeric_object(x: pd.Series) -> bool:
+    try:
+        pd.to_numeric(x, errors="raise")
+        return True
+    except (ValueError, TypeError):
+        return False
+
+
+def is_valid_numeric_array_data(data: pd.Series) -> bool:
+    data_without_na = data.dropna()
+    if data_without_na.empty:
+        return False
+
+    first_element = data_without_na.iloc[0]
+
+    # numpy.ndarray with numeric types
+    if isinstance(first_element, np.ndarray):
+        return np.issubdtype(first_element.dtype, np.number)
+
+    # DataFrame with all numeric columns
+    elif isinstance(first_element, pd.DataFrame):
+        return all(np.issubdtype(dtype, np.number) for dtype in first_element.dtypes)
+
+    # list or list of lists with numeric types
+    elif isinstance(first_element, list):
+        try:
+            # flat list
+            if all(isinstance(x, (int, float, np.number)) or pd.isna(x) for x in first_element):
+                return True
+            # list of lists
+            elif all(
+                isinstance(x, list) and all(isinstance(y, (int, float, np.number)) or pd.isna(y) for y in x)
+                for x in first_element
+            ):
+                return True
+        except Exception:
+            return False
+
+    return False
+
+
 class EstimatorWrapper:
+    default_estimator: Literal["catboost", "lightgbm"] = "catboost"
+
     def __init__(
         self,
         estimator,
         scorer: Callable,
+        cat_features: Optional[List[str]],
         metric_name: str,
         multiplier: int,
         cv: BaseCrossValidator,
```
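The two helpers added in this hunk drive the new dtype handling: `is_numeric_object` probes whether an object column parses as numbers, and `is_valid_numeric_array_data` recognizes columns whose cells are numeric vectors (the shape grouped embeddings take). A minimal sketch of the expected behavior, assuming upgini 1.2.81 is installed:

```python
import numpy as np
import pandas as pd

from upgini.metrics import is_numeric_object, is_valid_numeric_array_data

# Object columns of numeric strings still count as numeric:
assert is_numeric_object(pd.Series(["1", "2.5", None]))
assert not is_numeric_object(pd.Series(["a", "b"]))

# Columns of equal-length numeric vectors (e.g. grouped embeddings) qualify:
assert is_valid_numeric_array_data(pd.Series([np.array([0.1, 0.2]), np.array([0.3, 0.4])]))
# Scalar object columns and empty columns do not:
assert not is_valid_numeric_array_data(pd.Series(["a", "b"]))
assert not is_valid_numeric_array_data(pd.Series([], dtype="object"))
```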
```diff
@@ -298,9 +304,8 @@ class EstimatorWrapper:
     ):
         self.estimator = estimator
         self.scorer = scorer
-        self.metric_name = (
-            "GINI" if metric_name.upper() == "ROC_AUC" and target_type == ModelTaskType.BINARY else metric_name
-        )
+        self.cat_features = cat_features
+        self.metric_name = metric_name
         self.multiplier = multiplier
         self.cv = cv
         self.target_type = target_type
@@ -309,6 +314,10 @@ class EstimatorWrapper:
         self.groups = groups
         self.text_features = text_features
         self.logger = logger or logging.getLogger()
+        self.droped_features = []
+        self.converted_to_int = []
+        self.converted_to_str = []
+        self.converted_to_numeric = []
 
     def fit(self, x: pd.DataFrame, y: np.ndarray, **kwargs):
         x, y, _, fit_params = self._prepare_to_fit(x, y)
@@ -316,22 +325,13 @@
         self.estimator.fit(x, y, **kwargs)
         return self
 
-    def predict(self, **kwargs):
-        return self.estimator.predict(**kwargs)
-
-    def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
-        x, y, groups = self._prepare_data(x, y, groups=self.groups)
-        return x, y, groups, {}
+    def predict(self, x: pd.DataFrame, **kwargs):
+        x, _, _ = self._prepare_to_calculate(x, None)
+        return self.estimator.predict(x, **kwargs)
 
     def _prepare_data(
         self, x: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
     ) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
-        self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
-        for c in x.columns:
-            if is_numeric_dtype(x[c]):
-                x[c] = x[c].astype(float)
-            elif not x[c].dtype == "category":
-                x[c] = x[c].astype(str)
 
         if not isinstance(y, pd.Series):
             raise Exception(bundle.get("metrics_unsupported_target_type").format(type(y)))
@@ -345,6 +345,8 @@ class EstimatorWrapper:
         else:
             x, y = self._remove_empty_target_rows(x, y)
 
+        y = prepare_target(y, self.target_type)
+
         self.logger.info(f"After preparing data columns: {x.columns.to_list()}")
         return x, y, groups
 
@@ -357,8 +359,84 @@ class EstimatorWrapper:
 
         return x, y
 
+    def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
+        x, y, groups = self._prepare_data(x, y, groups=self.groups)
+
+        self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
+        self.droped_features = []
+        self.converted_to_int = []
+        self.converted_to_str = []
+        self.converted_to_numeric = []
+        for c in x.columns:
+
+            if _get_unique_count(x[c]) < 2:
+                self.logger.warning(f"Remove feature {c} because it has less than 2 unique values")
+                if c in self.cat_features:
+                    self.cat_features.remove(c)
+                x.drop(columns=[c], inplace=True)
+                self.droped_features.append(c)
+            elif self.text_features is not None and c in self.text_features:
+                x[c] = x[c].astype(str)
+                self.converted_to_str.append(c)
+            elif c in self.cat_features:
+                if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
+                    x[c] = x[c].astype(np.int64)
+                    self.converted_to_int.append(c)
+                elif x[c].dtype == "category" and is_integer_dtype(x[c].cat.categories):
+                    self.logger.info(
+                        f"Convert categorical feature {c} with integer categories"
+                        " to int64 and remove from cat_features"
+                    )
+                    x[c] = x[c].astype(np.int64)
+                    self.converted_to_int.append(c)
+                    self.cat_features.remove(c)
+                elif is_float_dtype(x[c]) or (x[c].dtype == "category" and is_float_dtype(x[c].cat.categories)):
+                    self.logger.info(
+                        f"Convert float cat feature {c} to string"
+                    )
+                    x[c] = x[c].astype(str)
+                    self.converted_to_str.append(c)
+                elif x[c].dtype not in ["category", "int64"]:
+                    x[c] = x[c].astype(str)
+                    self.converted_to_str.append(c)
+            else:
+                if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
+                    self.logger.info(f"Convert bool feature {c} to int64")
+                    x[c] = x[c].astype(np.int64)
+                    self.converted_to_int.append(c)
+                elif not is_valid_numeric_array_data(x[c]) and not is_numeric_dtype(x[c]):
+                    try:
+                        x[c] = pd.to_numeric(x[c], errors="raise")
+                        self.converted_to_numeric.append(c)
+                    except (ValueError, TypeError):
+                        self.logger.warning(f"Remove feature {c} because it is not numeric and not in cat_features")
+                        x.drop(columns=[c], inplace=True)
+                        self.droped_features.append(c)
+
+        return x, y, groups, {}
+
     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
         x, y, _ = self._prepare_data(x, y)
+
+        if self.droped_features:
+            self.logger.info(f"Drop features on calculate metrics: {self.droped_features}")
+            x = x.drop(columns=self.droped_features)
+
+        if self.converted_to_int:
+            self.logger.info(f"Convert to int features on calculate metrics: {self.converted_to_int}")
+            for c in self.converted_to_int:
+                x[c] = x[c].astype(np.int64)
+
+        if self.converted_to_str:
+            self.logger.info(f"Convert to str features on calculate metrics: {self.converted_to_str}")
+            for c in self.converted_to_str:
+                x[c] = x[c].astype(str)
+
+        if self.converted_to_numeric:
+            self.logger.info(f"Convert to numeric features on calculate metrics: {self.converted_to_numeric}")
+            for c in self.converted_to_numeric:
+                x[c] = pd.to_numeric(x[c], errors="coerce")
+
         return x, y, {}
 
     def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
```
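The rewritten `_prepare_to_fit` records every dtype decision (`droped_features`, `converted_to_int`, `converted_to_str`, `converted_to_numeric`) so that `_prepare_to_calculate`, and through it the new `predict`, can replay exactly the same transformations on evaluation data. A condensed, self-contained sketch of that record-and-replay pattern (hypothetical column names, plain pandas rather than upgini API):

```python
import numpy as np
import pandas as pd

train = pd.DataFrame({"flag": [True, False, True], "code": ["1", "2", "3"]})
evalx = pd.DataFrame({"flag": [False, True], "code": ["4", "oops"]})

converted_to_int, converted_to_numeric = [], []
for c in train.columns:
    if train[c].dtype == "bool":
        train[c] = train[c].astype(np.int64)   # fit-time decision...
        converted_to_int.append(c)             # ...gets recorded
    else:
        train[c] = pd.to_numeric(train[c], errors="raise")
        converted_to_numeric.append(c)

# Replay on evaluation data; errors="coerce" tolerates unparseable values:
for c in converted_to_int:
    evalx[c] = evalx[c].astype(np.int64)
for c in converted_to_numeric:
    evalx[c] = pd.to_numeric(evalx[c], errors="coerce")

print(evalx.dtypes.to_dict())  # {'flag': int64, 'code': float64}
```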
```diff
@@ -378,7 +456,10 @@
         if baseline_score_column is not None and self.metric_name == "GINI":
             self.logger.info("Calculate baseline GINI on passed baseline_score_column and target")
             metric = roc_auc_score(y, x[baseline_score_column])
+            metric_std = None
+            average_shap_values = None
         else:
+            self.logger.info(f"Cross validate with estimeator: {self.estimator}")
             cv_results = cross_validate(
                 estimator=self.estimator,
                 x=x,
@@ -409,7 +490,6 @@
             shaps = self.calculate_shap(cv_x, cv_y, estimator)
             if shaps is not None:
                 for feature, shap_value in shaps.items():
-                    # shap_values_all_folds[feature] = shap_values_all_folds.get(feature, []) + shap_value.tolist()
                     shap_values_all_folds[feature].append(shap_value)
 
         if shap_values_all_folds:
@@ -465,7 +545,7 @@
         logger: logging.Logger,
         target_type: ModelTaskType,
         cv: BaseCrossValidator,
-        x: pd.DataFrame,
+        *,
         scoring: Union[Callable, str, None] = None,
         cat_features: Optional[List[str]] = None,
         text_features: Optional[List[str]] = None,
@@ -473,9 +553,10 @@
         groups: Optional[List[str]] = None,
         has_date: Optional[bool] = None,
     ) -> EstimatorWrapper:
-        scorer, metric_name, multiplier = _get_scorer(target_type, scoring)
+        scorer, metric_name, multiplier = define_scorer(target_type, scoring)
         kwargs = {
             "scorer": scorer,
+            "cat_features": cat_features,
             "metric_name": metric_name,
             "multiplier": multiplier,
             "cv": cv,
@@ -485,22 +566,43 @@
             "logger": logger,
         }
         if estimator is None:
-            from catboost import CatBoostClassifier, CatBoostRegressor
-
-            params = {"has_time": has_date}
-            if target_type == ModelTaskType.MULTICLASS:
-                params = _get_add_params(params, CATBOOST_MULTICLASS_PARAMS)
-                params = _get_add_params(params, add_params)
-                estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
-            elif target_type == ModelTaskType.BINARY:
-                params = _get_add_params(params, CATBOOST_BINARY_PARAMS)
-                params = _get_add_params(params, add_params)
-                estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
-            elif target_type == ModelTaskType.REGRESSION:
-                params = _get_add_params(params, CATBOOST_REGRESSION_PARAMS)
-                params = _get_add_params(params, add_params)
-                estimator = CatBoostWrapper(CatBoostRegressor(**params), **kwargs)
+            if EstimatorWrapper.default_estimator == "catboost":
+                logger.info("Using CatBoost as default estimator")
+                params = {"has_time": has_date}
+                if target_type == ModelTaskType.MULTICLASS:
+                    params = _get_add_params(params, CATBOOST_MULTICLASS_PARAMS)
+                    params = _get_add_params(params, add_params)
+                    estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
+                elif target_type == ModelTaskType.BINARY:
+                    params = _get_add_params(params, CATBOOST_BINARY_PARAMS)
+                    params = _get_add_params(params, add_params)
+                    estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
+                elif target_type == ModelTaskType.REGRESSION:
+                    params = _get_add_params(params, CATBOOST_REGRESSION_PARAMS)
+                    params = _get_add_params(params, add_params)
+                    estimator = CatBoostWrapper(CatBoostRegressor(**params), **kwargs)
+                else:
+                    raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
+            elif EstimatorWrapper.default_estimator == "lightgbm":
+                logger.info("Using LightGBM as default estimator")
+                params = {"random_state": DEFAULT_RANDOM_STATE, "verbose": -1}
+                if target_type == ModelTaskType.MULTICLASS:
+                    params = _get_add_params(params, LIGHTGBM_MULTICLASS_PARAMS)
+                    params = _get_add_params(params, add_params)
+                    estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
+                elif target_type == ModelTaskType.BINARY:
+                    params = _get_add_params(params, LIGHTGBM_BINARY_PARAMS)
+                    params = _get_add_params(params, add_params)
+                    estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
+                elif target_type == ModelTaskType.REGRESSION:
+                    if not isinstance(cv, TimeSeriesSplit) and not isinstance(cv, BlockedTimeSeriesSplit):
+                        params = _get_add_params(params, LIGHTGBM_REGRESSION_PARAMS)
+                    params = _get_add_params(params, add_params)
+                    estimator = LightGBMWrapper(LGBMRegressor(**params), **kwargs)
+                else:
+                    raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
             else:
-                raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
+                raise Exception("Unsupported default_estimator. Available: catboost, lightgbm")
         else:
             if hasattr(estimator, "copy"):
                 estimator_copy = estimator.copy()
```
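`EstimatorWrapper.create` now dispatches on the class-level `default_estimator` switch instead of hard-coding CatBoost. A sketch of the toggle, assuming upgini 1.2.81:

```python
from upgini.metrics import EstimatorWrapper

# Class attribute, so it affects every later create() call that does not
# pass an explicit estimator:
EstimatorWrapper.default_estimator = "lightgbm"
```

Keeping the switch on the class rather than adding a `create()` parameter leaves existing call sites untouched.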
```diff
@@ -508,19 +610,12 @@
             estimator_copy = deepcopy(estimator)
         kwargs["estimator"] = estimator_copy
         if is_catboost_estimator(estimator):
-            if cat_features is not None:
-                for cat_feature in cat_features:
-                    if cat_feature not in x.columns:
-                        logger.error(
-                            f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
-                        )
-            estimator_copy.set_params(cat_features=cat_features, has_time=has_date)
+            if has_date is not None:
+                estimator_copy.set_params(has_time=has_date)
             estimator = CatBoostWrapper(**kwargs)
         else:
             if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
                 estimator = LightGBMWrapper(**kwargs)
-            elif is_catboost_estimator(estimator):
-                estimator = CatBoostWrapper(**kwargs)
             else:
                 logger.warning(
                     f"Unexpected estimator is used for metrics: {estimator}. "
@@ -536,6 +631,7 @@ class CatBoostWrapper(EstimatorWrapper):
         self,
         estimator,
         scorer: Callable,
+        cat_features: Optional[List[str]],
         metric_name: str,
         multiplier: int,
         cv: BaseCrossValidator,
@@ -547,6 +643,7 @@ class CatBoostWrapper(EstimatorWrapper):
         super(CatBoostWrapper, self).__init__(
             estimator,
             scorer,
+            cat_features,
             metric_name,
             multiplier,
             cv,
@@ -555,10 +652,8 @@ class CatBoostWrapper(EstimatorWrapper):
             text_features=text_features,
             logger=logger,
         )
-        self.cat_features = None
         self.emb_features = None
         self.grouped_embedding_features = None
-        self.exclude_features = []
 
     def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
         x, y, groups, params = super()._prepare_to_fit(x, y)
@@ -567,76 +662,60 @@ class CatBoostWrapper(EstimatorWrapper):
         import catboost
         from catboost import CatBoostClassifier
 
-        if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
+        if not hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
+            self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")
+        else:
             emb_pattern = r"(.+)_emb\d+"
             self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
-            if len(self.emb_features) > 3:
-                self.logger.info(
-                    "Embedding features count more than 3, so group them into one vector for CatBoost: "
-                    f"{self.emb_features}"
-                )
-                x, self.grouped_embedding_features = self.group_embeddings(x)
+            x, self.grouped_embedding_features = self.group_embeddings(x)
+            if len(self.grouped_embedding_features) > 0:
                 params["embedding_features"] = self.grouped_embedding_features
-            else:
-                self.logger.info(f"Embedding features count less than 3, so use them separately: {self.emb_features}")
-                self.grouped_embedding_features = None
-        else:
-            self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")
 
         # Find text features from passed in generate_features
-        if hasattr(CatBoostClassifier, "get_text_feature_indices"):
+        if not hasattr(CatBoostClassifier, "get_text_feature_indices"):
+            self.text_features = None
+            self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
+        else:
             if self.text_features is not None:
                 self.logger.info(f"Passed text features for CatBoost: {self.text_features}")
                 self.text_features = [f for f in self.text_features if f in x.columns and not is_numeric_dtype(x[f])]
                 self.logger.info(f"Rest text features after checks: {self.text_features}")
                 params["text_features"] = self.text_features
-        else:
-            self.text_features = None
-            self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
 
         # Find rest categorical features
-        self.cat_features = _get_cat_features(x, self.text_features, self.grouped_embedding_features)
-        if self.cat_features:
-            unique_cat_features = []
-            for name in self.cat_features:
-                if x[name].nunique() > 1:
-                    unique_cat_features.append(name)
-                else:
-                    self.logger.info(f"Drop column {name} on preparing data for fit")
-                    x = x.drop(columns=name)
-                    self.exclude_features.append(name)
-            self.cat_features = unique_cat_features
-
-        if (
-            hasattr(self.estimator, "get_param")
-            and hasattr(self.estimator, "_init_params")
-            and self.estimator.get_param("cat_features") is not None
-        ):
-            estimator_cat_features = self.estimator.get_param("cat_features")
-            if all([isinstance(c, int) for c in estimator_cat_features]):
-                cat_features_idx = {x.columns.get_loc(c) for c in self.cat_features}
-                cat_features_idx.update(estimator_cat_features)
-                self.cat_features = [x.columns[idx] for idx in cat_features_idx]
-            elif all([isinstance(c, str) for c in estimator_cat_features]):
-                self.cat_features = list(set(self.cat_features + estimator_cat_features))
-            else:
-                print(f"WARNING: Unsupported type of cat_features in CatBoost estimator: {estimator_cat_features}")
-
-            del self.estimator._init_params["cat_features"]
-
-        self.logger.info(f"Selected categorical features: {self.cat_features}")
-        params["cat_features"] = self.cat_features
+        self.cat_features = [
+            f
+            for f in self.cat_features
+            if f not in (self.text_features or []) and f not in (self.grouped_embedding_features or [])
+        ]
+        if self.cat_features:
+            for c in self.cat_features:
+                if is_numeric_dtype(x[c]):
+                    x[c] = x[c].fillna(np.nan)
+                elif x[c].dtype != "category":
+                    x[c] = x[c].fillna("NA")
+            params["cat_features"] = self.cat_features
 
         return x, y, groups, params
 
     def group_embeddings(self, df: pd.DataFrame):
-        emb_name = "__grouped_embeddings"
-        df = df.copy()
-        df[self.emb_features] = df[self.emb_features].fillna(0.0)
-        embeddings_series = pd.Series(df[self.emb_features].values.tolist(), index=df.index)
-        df = pd.concat([df.drop(columns=self.emb_features), pd.DataFrame({emb_name: embeddings_series})], axis=1)
-
-        return df, [emb_name]
+        embeddings_columns = []
+        if len(self.emb_features) > 3:
+            self.logger.info(
+                "Embedding features count more than 3, so group them into one vector for CatBoost: "
+                f"{self.emb_features}"
+            )
+            emb_name = "__grouped_embeddings"
+            df = df.copy()
+            df[self.emb_features] = df[self.emb_features].fillna(0.0)
+            embeddings_series = pd.Series(df[self.emb_features].values.tolist(), index=df.index)
+            df = pd.concat([df.drop(columns=self.emb_features), pd.DataFrame({emb_name: embeddings_series})], axis=1)
+            embeddings_columns.append(emb_name)
+        for c in df.columns:
+            if is_valid_numeric_array_data(df[c]):
+                embeddings_columns.append(c)
 
+        return df, embeddings_columns
 
     def process_shap_values(self, shap_values: Dict[str, float]) -> Dict[str, float]:
         if "__grouped_embeddings" in shap_values:
```
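`group_embeddings` now always runs and returns every array-valued column CatBoost should treat as an embedding: more than three `*_emb*` columns are packed into a single list-valued column, and any column that already passes `is_valid_numeric_array_data` is picked up as-is. A self-contained sketch of the packing step (hypothetical column names):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({f"txt_emb{i}": np.random.rand(4) for i in range(5)})
emb_features = list(df.columns)

# Pack the five embedding columns into one list-valued column:
emb_name = "__grouped_embeddings"
grouped = pd.Series(df[emb_features].fillna(0.0).values.tolist(), index=df.index)
df = pd.concat([df.drop(columns=emb_features), pd.DataFrame({emb_name: grouped})], axis=1)

print(len(df[emb_name].iloc[0]))  # 5 floats per row
```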
```diff
@@ -646,16 +725,19 @@ class CatBoostWrapper(EstimatorWrapper):
         return shap_values
 
     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
-        if self.exclude_features:
-            x = x.drop(columns=self.exclude_features)
         x, y, params = super()._prepare_to_calculate(x, y)
         if self.text_features:
             params["text_features"] = self.text_features
         if self.grouped_embedding_features:
             x, emb_columns = self.group_embeddings(x)
             params["embedding_features"] = emb_columns
+
         if self.cat_features:
-            x[self.cat_features] = x[self.cat_features].astype(str)
+            for c in self.cat_features:
+                if is_numeric_dtype(x[c]):
+                    x[c] = x[c].fillna(np.nan)
+                elif x[c].dtype != "category":
+                    x[c] = x[c].fillna("NA")
             params["cat_features"] = self.cat_features
 
         return x, y, params
```
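Categorical NA handling is now dtype-aware on both the fit and the calculate paths: numeric categorical columns keep `NaN` (which CatBoost accepts natively), while object columns get an explicit `"NA"` token. A small standalone sketch of the policy:

```python
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype

x = pd.DataFrame({"num_cat": [1.0, None], "str_cat": ["a", None]})
for c in x.columns:
    if is_numeric_dtype(x[c]):
        x[c] = x[c].fillna(np.nan)   # effectively a no-op: NaN stays NaN
    elif x[c].dtype != "category":
        x[c] = x[c].fillna("NA")     # explicit token for object categoricals

print(x.to_dict("list"))  # {'num_cat': [1.0, nan], 'str_cat': ['a', 'NA']}
```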
```diff
@@ -681,7 +763,7 @@ class CatBoostWrapper(EstimatorWrapper):
             )
             for f in high_cardinality_features:
                 self.text_features.remove(f)
-                self.exclude_features.append(f)
+                self.droped_features.append(f)
                 x = x.drop(columns=f, errors="ignore")
             return super().cross_val_predict(x, y, baseline_score_column)
         else:
@@ -700,23 +782,29 @@ class CatBoostWrapper(EstimatorWrapper):
                 embedding_features=self.grouped_embedding_features,
             )
 
-            # Get SHAP values of current estimator
-            shap_values_fold = estimator.get_feature_importance(data=fold_pool, type="ShapValues")
+            shap_values = estimator.get_feature_importance(data=fold_pool, type="ShapValues")
 
-            # Remove last columns (base value) and flatten
             if self.target_type == ModelTaskType.MULTICLASS:
-                all_shaps = shap_values_fold[:, :, :-1]
-                all_shaps = all_shaps.reshape(-1, all_shaps.shape[-1])
+                # For multiclass, shap_values has shape (n_samples, n_classes, n_features + 1)
+                # Last column is bias term
+                shap_values = shap_values[:, :, :-1]  # Remove bias term
+                # Average SHAP values across classes
+                shap_values = np.mean(np.abs(shap_values), axis=1)
             else:
-                all_shaps = shap_values_fold[:, :-1]
-                all_shaps = np.abs(all_shaps)
+                # For binary/regression, shap_values has shape (n_samples, n_features + 1)
+                # Last column is bias term
+                shap_values = shap_values[:, :-1]  # Remove bias term
+                # Take absolute values
+                shap_values = np.abs(shap_values)
 
-            mean_abs_shap = np.mean(np.abs(all_shaps), axis=0)
+            feature_importance = {}
+            for i, col in enumerate(x.columns):
+                feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
 
-            return dict(zip(x.columns, mean_abs_shap))
+            return feature_importance
 
-        except Exception:
-            self.logger.exception("Failed to recalculate new SHAP values")
+        except Exception as e:
+            self.logger.exception(f"Failed to recalculate new SHAP values: {str(e)}")
             return None
```
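The rewritten SHAP post-processing spells out the CatBoost layout: the bias term rides along as a trailing column, and multiclass output carries an extra class axis that is averaged away. The shape bookkeeping as a standalone check:

```python
import numpy as np

n_samples, n_classes, n_features = 8, 3, 4

multiclass = np.random.rand(n_samples, n_classes, n_features + 1)
per_feature = np.mean(np.abs(multiclass[:, :, :-1]), axis=1)  # drop bias, average classes

binary = np.random.rand(n_samples, n_features + 1)
per_feature_bin = np.abs(binary[:, :-1])                      # drop bias only

assert per_feature.shape == per_feature_bin.shape == (n_samples, n_features)
```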
```diff
@@ -725,6 +813,7 @@ class LightGBMWrapper(EstimatorWrapper):
         self,
         estimator,
         scorer: Callable,
+        cat_features: Optional[List[str]],
         metric_name: str,
         multiplier: int,
         cv: BaseCrossValidator,
@@ -736,6 +825,7 @@ class LightGBMWrapper(EstimatorWrapper):
         super(LightGBMWrapper, self).__init__(
             estimator,
             scorer,
+            cat_features,
             metric_name,
             multiplier,
             cv,
@@ -744,7 +834,6 @@ class LightGBMWrapper(EstimatorWrapper):
             text_features=text_features,
             logger=logger,
         )
-        self.cat_features = None
         self.cat_encoder = None
         self.n_classes = None
 
@@ -756,30 +845,23 @@ class LightGBMWrapper(EstimatorWrapper):
         if self.target_type == ModelTaskType.BINARY:
             params["eval_metric"] = "auc"
             params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
-        self.cat_features = _get_cat_features(x)
         if self.cat_features:
-            x[self.cat_features] = x[self.cat_features].astype(str)
-            encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
-            encoded = pd.DataFrame(
-                encoder.fit_transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
-            )
+            encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, cols=self.cat_features, return_df=True)
+            encoded = encoder.fit_transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
             x[self.cat_features] = encoded
             self.cat_encoder = encoder
-        if not is_numeric_dtype(y):
-            y_numpy = correct_string_target(y_numpy)
-
+        for c in x.columns:
+            if x[c].dtype not in ["category", "int64", "float64", "bool"]:
+                self.logger.warning(f"Feature {c} is not numeric and will be dropped")
+                self.droped_features.append(c)
+                x = x.drop(columns=c, errors="ignore")
         return x, y_numpy, groups, params
 
     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
         x, y_numpy, params = super()._prepare_to_calculate(x, y)
-        if self.cat_features is not None:
-            x[self.cat_features] = x[self.cat_features].astype(str)
-            x[self.cat_features] = pd.DataFrame(
-                self.cat_encoder.transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
-            )
-        if not is_numeric_dtype(y):
-            y_numpy = correct_string_target(y_numpy)
+        if self.cat_features is not None and self.cat_encoder is not None:
+            encoded = self.cat_encoder.transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
+            x[self.cat_features] = encoded
         return x, y_numpy, params
 
     def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
```
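Both LightGBM and the generic wrapper replace sklearn's `OrdinalEncoder` with the target-aware `CatBoostEncoder` from `category_encoders`: it is fit together with `y` at train time and reapplied, without refitting, when metrics are calculated. A minimal sketch of that flow (hypothetical column and data):

```python
import pandas as pd
from category_encoders.cat_boost import CatBoostEncoder

X = pd.DataFrame({"city": ["a", "b", "a", "c"]})
y = [1, 0, 1, 0]

encoder = CatBoostEncoder(random_state=42, cols=["city"], return_df=True)
X_fit = encoder.fit_transform(X.astype("object"), y).astype("category")

# At metric time the fitted mapping is reused on unseen rows:
X_new = pd.DataFrame({"city": ["b", "c", "d"]})
X_eval = encoder.transform(X_new.astype("object")).astype("category")
print(X_eval["city"].tolist())
```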
```diff
@@ -805,20 +887,6 @@ class LightGBMWrapper(EstimatorWrapper):
             for i, col in enumerate(x.columns):
                 feature_importance[col] = np.mean(np.abs(shap_matrix[:, i]))
 
-            # # exclude last column (base value)
-            # shap_values_only = shap_values[:, :-1]
-            # mean_abs_shap = np.mean(np.abs(shap_values_only), axis=0)
-
-            # # For classification, shap_values is returned as a list for each class
-            # # Take values for the positive class
-            # if isinstance(shap_values, list):
-            #     shap_values = shap_values[1]
-
-            # # Calculate mean absolute SHAP value for each feature
-            # feature_importance = {}
-            # for i, col in enumerate(x.columns):
-            #     feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
-
             return feature_importance
 
         except Exception as e:
@@ -831,6 +899,7 @@ class OtherEstimatorWrapper(EstimatorWrapper):
         self,
         estimator,
         scorer: Callable,
+        cat_features: Optional[List[str]],
         metric_name: str,
         multiplier: int,
         cv: BaseCrossValidator,
@@ -842,6 +911,7 @@ class OtherEstimatorWrapper(EstimatorWrapper):
         super(OtherEstimatorWrapper, self).__init__(
             estimator,
             scorer,
+            cat_features,
             metric_name,
             multiplier,
             cv,
@@ -850,33 +920,33 @@ class OtherEstimatorWrapper(EstimatorWrapper):
             text_features=text_features,
             logger=logger,
         )
-        self.cat_features = None
 
     def _prepare_to_fit(self, x: pd.DataFrame, y: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
-        x, y, groups, params = super()._prepare_to_fit(x, y)
-        self.cat_features = _get_cat_features(x)
+        x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
         num_features = [col for col in x.columns if col not in self.cat_features]
         x[num_features] = x[num_features].fillna(-999)
-        # TODO use one-hot encoding if cardinality is less 50
-        for feature in self.cat_features:
-            x[feature] = x[feature].astype("category").cat.codes
-        if not is_numeric_dtype(y):
-            y = correct_string_target(y)
-        return x, y, groups, params
+        if self.cat_features:
+            encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, return_df=True)
+            encoded = encoder.fit_transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
+            x[self.cat_features] = encoded
+            self.cat_encoder = encoder
+        for c in x.columns:
+            if x[c].dtype not in ["category", "int64", "float64", "bool"]:
+                self.logger.warning(f"Feature {c} is not numeric and will be dropped")
+                self.droped_features.append(c)
+                x = x.drop(columns=c, errors="ignore")
+        return x, y_numpy, groups, params
 
     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
-        x, y, params = super()._prepare_to_calculate(x, y)
+        x, y_numpy, params = super()._prepare_to_calculate(x, y)
         if self.cat_features is not None:
             num_features = [col for col in x.columns if col not in self.cat_features]
             x[num_features] = x[num_features].fillna(-999)
-            # TODO use one-hot encoding if cardinality is less 50
-            for feature in self.cat_features:
-                x[feature] = x[feature].astype("category").cat.codes
-        if not is_numeric_dtype(y):
-            y = correct_string_target(y)
-        return x, y, params
+            if self.cat_features and self.cat_encoder is not None:
+                x[self.cat_features] = self.cat_encoder.transform(
+                    x[self.cat_features].astype("object"), y_numpy
+                ).astype("category")
+        return x, y_numpy, params
 
 
 def validate_scoring_argument(scoring: Union[Callable, str, None]):
@@ -938,7 +1008,7 @@ def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
     return scoring, metric_name, multiplier
 
 
-def _get_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None]) -> Tuple[Callable, str, int]:
+def define_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None]) -> Tuple[Callable, str, int]:
    if scoring is None:
         if target_type == ModelTaskType.BINARY:
             scoring = "roc_auc"
@@ -957,16 +1027,9 @@ def _get_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None])
     else:
         metric_name = str(scoring)
 
-    return scoring, metric_name, multiplier
-
+    metric_name = "GINI" if metric_name.upper() == "ROC_AUC" and target_type == ModelTaskType.BINARY else metric_name
 
-def _get_cat_features(
-    x: pd.DataFrame, text_features: Optional[List[str]] = None, emb_features: Optional[List[str]] = None
-) -> List[str]:
-    text_features = text_features or []
-    emb_features = emb_features or []
-    exclude_features = text_features + emb_features
-    return [c for c in x.columns if c not in exclude_features and not is_numeric_dtype(x[c])]
+    return scoring, metric_name, multiplier
 
 
 def _get_add_params(input_params, add_params):
```
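`_get_scorer` becomes `define_scorer`, and the ROC-AUC-to-GINI display rename moves here, now gated on the task type. A quick check of the resolution, assuming upgini 1.2.81:

```python
from upgini.metadata import ModelTaskType
from upgini.metrics import define_scorer

scorer, metric_name, multiplier = define_scorer(ModelTaskType.BINARY, None)
print(metric_name)  # "GINI": the default binary metric roc_auc is renamed

_, name, _ = define_scorer(ModelTaskType.MULTICLASS, "roc_auc")
print(name)  # not renamed to GINI: the rule now applies to BINARY only
```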
```diff
@@ -1056,10 +1119,8 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutp
     return mse if squared else np.sqrt(mse)
 
 
-# def fill_na_cat_features(df: pd.DataFrame, cat_features: List[str]) -> pd.DataFrame:
-#     for c in cat_features:
-#         if c in df.columns:
-#             df[c] = df[c].astype("string").fillna(NA_REPLACEMENT).astype(str)
-#             na_filter = df[c].str.lower().isin(NA_VALUES)
-#             df.loc[na_filter, c] = NA_REPLACEMENT
-#     return df
+def _get_unique_count(series: pd.Series) -> int:
+    try:
+        return series.nunique(dropna=False)
+    except TypeError:
+        return series.astype(str).nunique(dropna=False)
```