upgini 1.2.79a1__py3-none-any.whl → 1.2.81__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of upgini might be problematic.
- upgini/__about__.py +1 -1
- upgini/autofe/binary.py +2 -2
- upgini/autofe/timeseries/volatility.py +6 -4
- upgini/features_enricher.py +161 -96
- upgini/http.py +21 -21
- upgini/mdc/__init__.py +1 -1
- upgini/metadata.py +1 -1
- upgini/metrics.py +289 -231
- upgini/resource_bundle/strings.properties +1 -1
- upgini/search_task.py +1 -0
- upgini/utils/display_utils.py +12 -7
- upgini/utils/target_utils.py +9 -6
- {upgini-1.2.79a1.dist-info → upgini-1.2.81.dist-info}/METADATA +3 -1
- {upgini-1.2.79a1.dist-info → upgini-1.2.81.dist-info}/RECORD +16 -16
- {upgini-1.2.79a1.dist-info → upgini-1.2.81.dist-info}/WHEEL +0 -0
- {upgini-1.2.79a1.dist-info → upgini-1.2.81.dist-info}/licenses/LICENSE +0 -0
upgini/metrics.py
CHANGED
@@ -6,20 +6,21 @@ import re
 from collections import defaultdict
 from copy import deepcopy
 from dataclasses import dataclass
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union

 import lightgbm as lgb
 import numpy as np
 import pandas as pd
+from catboost import CatBoostClassifier, CatBoostRegressor
+from category_encoders.cat_boost import CatBoostEncoder
 from lightgbm import LGBMClassifier, LGBMRegressor
 from numpy import log1p
-from pandas.api.types import is_numeric_dtype
+from pandas.api.types import is_numeric_dtype, is_integer_dtype, is_float_dtype
 from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
-from sklearn.preprocessing import OrdinalEncoder

+from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
 from upgini.utils.features_validator import FeaturesValidator
 from upgini.utils.sklearn_ext import cross_validate
-from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit

 try:
     from sklearn.metrics import get_scorer_names
@@ -31,12 +32,15 @@ except ImportError:
     available_scorers = SCORERS
 from sklearn.metrics import mean_squared_error
 from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
-from sklearn.model_selection import
+from sklearn.model_selection import (  # , TimeSeriesSplit
+    BaseCrossValidator,
+    TimeSeriesSplit,
+)

 from upgini.errors import ValidationError
 from upgini.metadata import ModelTaskType
 from upgini.resource_bundle import bundle
-from upgini.utils.target_utils import
+from upgini.utils.target_utils import prepare_target

 DEFAULT_RANDOM_STATE = 42

@@ -87,20 +91,9 @@ CATBOOST_MULTICLASS_PARAMS = {

 LIGHTGBM_REGRESSION_PARAMS = {
     "random_state": DEFAULT_RANDOM_STATE,
-    "min_gain_to_split": 0.001,
     "n_estimators": 275,
-    "max_depth": 5,
-    "max_cat_threshold": 80,
-    "min_data_per_group": 25,
-    "cat_l2": 10,
-    "cat_smooth": 12,
-    "learning_rate": 0.05,
     "feature_fraction": 1.0,
-    "min_sum_hessian_in_leaf": 0.01,
-    "objective": "huber",
     "deterministic": "true",
-    "force_col_wise": "true",
-    "force_row_wise": "true",
     "verbosity": -1,
 }

@@ -115,13 +108,10 @@ LIGHTGBM_MULTICLASS_PARAMS = {
     "cat_smooth": 18,
     "cat_l2": 8,
     "objective": "multiclass",
-    # "class_weight": "balanced",
     "use_quantized_grad": "true",
     "num_grad_quant_bins": "8",
     "stochastic_rounding": "true",
     "deterministic": "true",
-    "force_col_wise": "true",
-    "force_row_wise": "true",
     "verbosity": -1,
 }

@@ -132,14 +122,11 @@ LIGHTGBM_BINARY_PARAMS = {
     "max_depth": 5,
     "learning_rate": 0.05,
     "objective": "binary",
-    # "class_weight": "balanced",
     "max_cat_threshold": 80,
     "min_data_per_group": 20,
     "cat_smooth": 18,
     "cat_l2": 8,
     "deterministic": "true",
-    "force_col_wise": "true",
-    "force_row_wise": "true",
     "verbosity": -1,
 }

@@ -148,34 +135,6 @@ LIGHTGBM_EARLY_STOPPING_ROUNDS = 20
 N_FOLDS = 5
 BLOCKED_TS_TEST_SIZE = 0.2

-NA_VALUES = [
-    "",
-    " ",
-    " ",
-    "#n/a",
-    "#n/a n/a",
-    "#na",
-    "-1.#ind",
-    "-1.#qnan",
-    "-nan",
-    "1.#ind",
-    "1.#qnan",
-    "n/a",
-    "na",
-    "null",
-    "nan",
-    "n/a",
-    "nan",
-    "none",
-    "-",
-    "undefined",
-    "[[unknown]]",
-    "[not provided]",
-    "[unknown]",
-]
-
-NA_REPLACEMENT = "NA"
-
 SUPPORTED_CATBOOST_METRICS = {
     s.upper(): s
     for s in (
@@ -285,11 +244,55 @@ class _CrossValResults:
         return f"{self.metric:.3f} ± {self.metric_std:.3f}"


+def is_numeric_object(x: pd.Series) -> bool:
+    try:
+        pd.to_numeric(x, errors="raise")
+        return True
+    except (ValueError, TypeError):
+        return False
+
+
+def is_valid_numeric_array_data(data: pd.Series) -> bool:
+    data_without_na = data.dropna()
+    if data_without_na.empty:
+        return False
+
+    first_element = data_without_na.iloc[0]
+
+    # numpy.ndarray with numeric types
+    if isinstance(first_element, np.ndarray):
+        return np.issubdtype(first_element.dtype, np.number)
+
+    # DataFrame with all numeric columns
+    elif isinstance(first_element, pd.DataFrame):
+        return all(np.issubdtype(dtype, np.number) for dtype in first_element.dtypes)
+
+    # list or list of lists with numeric types
+    elif isinstance(first_element, list):
+        try:
+            # flat list
+            if all(isinstance(x, (int, float, np.number)) or pd.isna(x) for x in first_element):
+                return True
+            # list of lists
+            elif all(
+                isinstance(x, list) and all(isinstance(y, (int, float, np.number)) or pd.isna(y) for y in x)
+                for x in first_element
+            ):
+                return True
+        except Exception:
+            return False
+
+    return False
+
+
 class EstimatorWrapper:
+    default_estimator: Literal["catboost", "lightgbm"] = "catboost"
+
     def __init__(
         self,
         estimator,
         scorer: Callable,
+        cat_features: Optional[List[str]],
         metric_name: str,
         multiplier: int,
         cv: BaseCrossValidator,
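Note: the new `is_valid_numeric_array_data` helper inspects only the first non-null element of a column to decide whether it holds numeric array-like data; it is used further down to let embedding columns bypass the string/numeric conversion rules. A minimal sketch of its behavior, assuming the function exactly as added above:

    import numpy as np
    import pandas as pd

    embeddings = pd.Series([np.array([0.1, 0.2]), np.array([0.3, 0.4])])
    lists = pd.Series([[1, 2], [3, 4]])
    strings = pd.Series(["a", "b"])

    is_valid_numeric_array_data(embeddings)         # True: ndarray with numeric dtype
    is_valid_numeric_array_data(lists)              # True: flat list of numbers
    is_valid_numeric_array_data(strings)            # False: str is not array-like
    is_valid_numeric_array_data(pd.Series([None]))  # False: empty after dropna()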
@@ -301,9 +304,8 @@ class EstimatorWrapper:
     ):
         self.estimator = estimator
         self.scorer = scorer
-        self.
-
-        )
+        self.cat_features = cat_features
+        self.metric_name = metric_name
         self.multiplier = multiplier
         self.cv = cv
         self.target_type = target_type
@@ -312,6 +314,10 @@ class EstimatorWrapper:
         self.groups = groups
         self.text_features = text_features
         self.logger = logger or logging.getLogger()
+        self.droped_features = []
+        self.converted_to_int = []
+        self.converted_to_str = []
+        self.converted_to_numeric = []

     def fit(self, x: pd.DataFrame, y: np.ndarray, **kwargs):
         x, y, _, fit_params = self._prepare_to_fit(x, y)
@@ -319,22 +325,13 @@ class EstimatorWrapper:
         self.estimator.fit(x, y, **kwargs)
         return self

-    def predict(self, **kwargs):
-
-
-    def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
-        x, y, groups = self._prepare_data(x, y, groups=self.groups)
-        return x, y, groups, {}
+    def predict(self, x: pd.DataFrame, **kwargs):
+        x, _, _ = self._prepare_to_calculate(x, None)
+        return self.estimator.predict(x, **kwargs)

     def _prepare_data(
         self, x: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
     ) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
-        self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
-        for c in x.columns:
-            if is_numeric_dtype(x[c]):
-                x[c] = x[c].astype(float)
-            elif not x[c].dtype == "category":
-                x[c] = x[c].astype(str)

         if not isinstance(y, pd.Series):
             raise Exception(bundle.get("metrics_unsupported_target_type").format(type(y)))
@@ -348,6 +345,8 @@ class EstimatorWrapper:
         else:
             x, y = self._remove_empty_target_rows(x, y)

+        y = prepare_target(y, self.target_type)
+
         self.logger.info(f"After preparing data columns: {x.columns.to_list()}")
         return x, y, groups

@@ -360,8 +359,84 @@ class EstimatorWrapper:

         return x, y

+    def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
+        x, y, groups = self._prepare_data(x, y, groups=self.groups)
+
+        self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
+        self.droped_features = []
+        self.converted_to_int = []
+        self.converted_to_str = []
+        self.converted_to_numeric = []
+        for c in x.columns:
+
+            if _get_unique_count(x[c]) < 2:
+                self.logger.warning(f"Remove feature {c} because it has less than 2 unique values")
+                if c in self.cat_features:
+                    self.cat_features.remove(c)
+                x.drop(columns=[c], inplace=True)
+                self.droped_features.append(c)
+            elif self.text_features is not None and c in self.text_features:
+                x[c] = x[c].astype(str)
+                self.converted_to_str.append(c)
+            elif c in self.cat_features:
+                if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
+                    x[c] = x[c].astype(np.int64)
+                    self.converted_to_int.append(c)
+                elif x[c].dtype == "category" and is_integer_dtype(x[c].cat.categories):
+                    self.logger.info(
+                        f"Convert categorical feature {c} with integer categories"
+                        " to int64 and remove from cat_features"
+                    )
+                    x[c] = x[c].astype(np.int64)
+                    self.converted_to_int.append(c)
+                    self.cat_features.remove(c)
+                elif is_float_dtype(x[c]) or (x[c].dtype == "category" and is_float_dtype(x[c].cat.categories)):
+                    self.logger.info(
+                        f"Convert float cat feature {c} to string"
+                    )
+                    x[c] = x[c].astype(str)
+                    self.converted_to_str.append(c)
+                elif x[c].dtype not in ["category", "int64"]:
+                    x[c] = x[c].astype(str)
+                    self.converted_to_str.append(c)
+            else:
+                if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
+                    self.logger.info(f"Convert bool feature {c} to int64")
+                    x[c] = x[c].astype(np.int64)
+                    self.converted_to_int.append(c)
+                elif not is_valid_numeric_array_data(x[c]) and not is_numeric_dtype(x[c]):
+                    try:
+                        x[c] = pd.to_numeric(x[c], errors="raise")
+                        self.converted_to_numeric.append(c)
+                    except (ValueError, TypeError):
+                        self.logger.warning(f"Remove feature {c} because it is not numeric and not in cat_features")
+                        x.drop(columns=[c], inplace=True)
+                        self.droped_features.append(c)
+
+        return x, y, groups, {}
+
     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
         x, y, _ = self._prepare_data(x, y)
+
+        if self.droped_features:
+            self.logger.info(f"Drop features on calculate metrics: {self.droped_features}")
+            x = x.drop(columns=self.droped_features)
+
+        if self.converted_to_int:
+            self.logger.info(f"Convert to int features on calculate metrics: {self.converted_to_int}")
+            for c in self.converted_to_int:
+                x[c] = x[c].astype(np.int64)
+
+        if self.converted_to_str:
+            self.logger.info(f"Convert to str features on calculate metrics: {self.converted_to_str}")
+            for c in self.converted_to_str:
+                x[c] = x[c].astype(str)
+
+        if self.converted_to_numeric:
+            self.logger.info(f"Convert to numeric features on calculate metrics: {self.converted_to_numeric}")
+            for c in self.converted_to_numeric:
+                x[c] = pd.to_numeric(x[c], errors="coerce")
+
         return x, y, {}

     def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
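Note: `_prepare_to_fit` now normalizes dtypes once at fit time and records every decision in `droped_features`, `converted_to_int`, `converted_to_str`, and `converted_to_numeric`; `_prepare_to_calculate` replays exactly those decisions on the evaluation frame, so train and eval schemas stay in sync. A hypothetical illustration of the rules (column names are made up):

    import pandas as pd

    df = pd.DataFrame({
        "const": [1, 1, 1],             # < 2 unique values -> dropped
        "flag": [True, False, True],    # bool -> int64
        "num_as_str": ["1", "2", "3"],  # object parseable as numeric -> converted
        "free_text": ["a", "b", "c"],   # not numeric, not in cat_features -> dropped
    })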
@@ -381,7 +456,10 @@ class EstimatorWrapper:
         if baseline_score_column is not None and self.metric_name == "GINI":
             self.logger.info("Calculate baseline GINI on passed baseline_score_column and target")
             metric = roc_auc_score(y, x[baseline_score_column])
+            metric_std = None
+            average_shap_values = None
         else:
+            self.logger.info(f"Cross validate with estimeator: {self.estimator}")
             cv_results = cross_validate(
                 estimator=self.estimator,
                 x=x,
@@ -412,7 +490,6 @@ class EstimatorWrapper:
             shaps = self.calculate_shap(cv_x, cv_y, estimator)
             if shaps is not None:
                 for feature, shap_value in shaps.items():
-                    # shap_values_all_folds[feature] = shap_values_all_folds.get(feature, []) + shap_value.tolist()
                     shap_values_all_folds[feature].append(shap_value)

         if shap_values_all_folds:
@@ -468,7 +545,7 @@ class EstimatorWrapper:
         logger: logging.Logger,
         target_type: ModelTaskType,
         cv: BaseCrossValidator,
-
+        *,
         scoring: Union[Callable, str, None] = None,
         cat_features: Optional[List[str]] = None,
         text_features: Optional[List[str]] = None,
@@ -476,9 +553,10 @@ class EstimatorWrapper:
         groups: Optional[List[str]] = None,
         has_date: Optional[bool] = None,
     ) -> EstimatorWrapper:
-        scorer, metric_name, multiplier =
+        scorer, metric_name, multiplier = define_scorer(target_type, scoring)
         kwargs = {
             "scorer": scorer,
+            "cat_features": cat_features,
             "metric_name": metric_name,
             "multiplier": multiplier,
             "cv": cv,
@@ -488,22 +566,43 @@ class EstimatorWrapper:
             "logger": logger,
         }
         if estimator is None:
-
-
-            params =
-
-
-
-
-
-
-
-
-
-
-
+            if EstimatorWrapper.default_estimator == "catboost":
+                logger.info("Using CatBoost as default estimator")
+                params = {"has_time": has_date}
+                if target_type == ModelTaskType.MULTICLASS:
+                    params = _get_add_params(params, CATBOOST_MULTICLASS_PARAMS)
+                    params = _get_add_params(params, add_params)
+                    estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
+                elif target_type == ModelTaskType.BINARY:
+                    params = _get_add_params(params, CATBOOST_BINARY_PARAMS)
+                    params = _get_add_params(params, add_params)
+                    estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
+                elif target_type == ModelTaskType.REGRESSION:
+                    params = _get_add_params(params, CATBOOST_REGRESSION_PARAMS)
+                    params = _get_add_params(params, add_params)
+                    estimator = CatBoostWrapper(CatBoostRegressor(**params), **kwargs)
+                else:
+                    raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
+            elif EstimatorWrapper.default_estimator == "lightgbm":
+                logger.info("Using LightGBM as default estimator")
+                params = {"random_state": DEFAULT_RANDOM_STATE, "verbose": -1}
+                if target_type == ModelTaskType.MULTICLASS:
+                    params = _get_add_params(params, LIGHTGBM_MULTICLASS_PARAMS)
+                    params = _get_add_params(params, add_params)
+                    estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
+                elif target_type == ModelTaskType.BINARY:
+                    params = _get_add_params(params, LIGHTGBM_BINARY_PARAMS)
+                    params = _get_add_params(params, add_params)
+                    estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
+                elif target_type == ModelTaskType.REGRESSION:
+                    if not isinstance(cv, TimeSeriesSplit) and not isinstance(cv, BlockedTimeSeriesSplit):
+                        params = _get_add_params(params, LIGHTGBM_REGRESSION_PARAMS)
+                    params = _get_add_params(params, add_params)
+                    estimator = LightGBMWrapper(LGBMRegressor(**params), **kwargs)
+                else:
+                    raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
             else:
-                raise Exception(
+                raise Exception("Unsupported default_estimator. Available: catboost, lightgbm")
         else:
             if hasattr(estimator, "copy"):
                 estimator_copy = estimator.copy()
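Note: the built-in fallback estimator is now selected through the class-level `EstimatorWrapper.default_estimator` flag instead of being hard-wired; `"catboost"` is the shipped default. A usage sketch (a module-level toggle, affecting all wrappers created afterwards):

    from upgini.metrics import EstimatorWrapper

    EstimatorWrapper.default_estimator = "lightgbm"  # opt in to the LightGBM fallback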
@@ -511,19 +610,12 @@ class EstimatorWrapper:
             estimator_copy = deepcopy(estimator)
         kwargs["estimator"] = estimator_copy
         if is_catboost_estimator(estimator):
-            if
-
-            if cat_feature not in x.columns:
-                logger.error(
-                    f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
-                )
-            estimator_copy.set_params(cat_features=cat_features, has_time=has_date)
+            if has_date is not None:
+                estimator_copy.set_params(has_time=has_date)
             estimator = CatBoostWrapper(**kwargs)
         else:
             if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
                 estimator = LightGBMWrapper(**kwargs)
-            elif is_catboost_estimator(estimator):
-                estimator = CatBoostWrapper(**kwargs)
             else:
                 logger.warning(
                     f"Unexpected estimator is used for metrics: {estimator}. "
@@ -539,6 +631,7 @@ class CatBoostWrapper(EstimatorWrapper):
         self,
         estimator,
         scorer: Callable,
+        cat_features: Optional[List[str]],
         metric_name: str,
         multiplier: int,
         cv: BaseCrossValidator,
@@ -550,6 +643,7 @@ class CatBoostWrapper(EstimatorWrapper):
         super(CatBoostWrapper, self).__init__(
             estimator,
             scorer,
+            cat_features,
             metric_name,
             multiplier,
             cv,
@@ -558,10 +652,8 @@ class CatBoostWrapper(EstimatorWrapper):
             text_features=text_features,
             logger=logger,
         )
-        self.cat_features = None
         self.emb_features = None
         self.grouped_embedding_features = None
-        self.exclude_features = []

     def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
         x, y, groups, params = super()._prepare_to_fit(x, y)
@@ -570,76 +662,60 @@ class CatBoostWrapper(EstimatorWrapper):
         import catboost
         from catboost import CatBoostClassifier

-        if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
+        if not hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
+            self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")
+        else:
             emb_pattern = r"(.+)_emb\d+"
             self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
-
-
-                    "Embedding features count more than 3, so group them into one vector for CatBoost: "
-                    f"{self.emb_features}"
-                )
-                x, self.grouped_embedding_features = self.group_embeddings(x)
+            x, self.grouped_embedding_features = self.group_embeddings(x)
+            if len(self.grouped_embedding_features) > 0:
                 params["embedding_features"] = self.grouped_embedding_features
-            else:
-                self.logger.info(f"Embedding features count less than 3, so use them separately: {self.emb_features}")
-                self.grouped_embedding_features = None
-        else:
-            self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")

         # Find text features from passed in generate_features
-        if hasattr(CatBoostClassifier, "get_text_feature_indices"):
+        if not hasattr(CatBoostClassifier, "get_text_feature_indices"):
+            self.text_features = None
+            self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
+        else:
             if self.text_features is not None:
                 self.logger.info(f"Passed text features for CatBoost: {self.text_features}")
                 self.text_features = [f for f in self.text_features if f in x.columns and not is_numeric_dtype(x[f])]
                 self.logger.info(f"Rest text features after checks: {self.text_features}")
                 params["text_features"] = self.text_features
-        else:
-            self.text_features = None
-            self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")

         # Find rest categorical features
-        self.cat_features =
-
-
-
-
-
-
-
-        x
-
-
-        if (
-            hasattr(self.estimator, "get_param")
-            and hasattr(self.estimator, "_init_params")
-            and self.estimator.get_param("cat_features") is not None
-        ):
-            estimator_cat_features = self.estimator.get_param("cat_features")
-            if all([isinstance(c, int) for c in estimator_cat_features]):
-                cat_features_idx = {x.columns.get_loc(c) for c in self.cat_features}
-                cat_features_idx.update(estimator_cat_features)
-                self.cat_features = [x.columns[idx] for idx in cat_features_idx]
-            elif all([isinstance(c, str) for c in estimator_cat_features]):
-                self.cat_features = list(set(self.cat_features + estimator_cat_features))
-            else:
-                print(f"WARNING: Unsupported type of cat_features in CatBoost estimator: {estimator_cat_features}")
-
-            del self.estimator._init_params["cat_features"]
-
-        self.logger.info(f"Selected categorical features: {self.cat_features}")
-        params["cat_features"] = self.cat_features
+        self.cat_features = [
+            f
+            for f in self.cat_features
+            if f not in (self.text_features or []) and f not in (self.grouped_embedding_features or [])
+        ]
+        if self.cat_features:
+            for c in self.cat_features:
+                if is_numeric_dtype(x[c]):
+                    x[c] = x[c].fillna(np.nan)
+                elif x[c].dtype != "category":
+                    x[c] = x[c].fillna("NA")
+            params["cat_features"] = self.cat_features

         return x, y, groups, params

     def group_embeddings(self, df: pd.DataFrame):
-
-
-
-
-
-
-
+        embeddings_columns = []
+        if len(self.emb_features) > 3:
+            self.logger.info(
+                "Embedding features count more than 3, so group them into one vector for CatBoost: "
+                f"{self.emb_features}"
+            )
+            emb_name = "__grouped_embeddings"
+            df = df.copy()
+            df[self.emb_features] = df[self.emb_features].fillna(0.0)
+            embeddings_series = pd.Series(df[self.emb_features].values.tolist(), index=df.index)
+            df = pd.concat([df.drop(columns=self.emb_features), pd.DataFrame({emb_name: embeddings_series})], axis=1)
+            embeddings_columns.append(emb_name)
+        for c in df.columns:
+            if is_valid_numeric_array_data(df[c]):
+                embeddings_columns.append(c)
+
+        return df, embeddings_columns

     def process_shap_values(self, shap_values: Dict[str, float]) -> Dict[str, float]:
         if "__grouped_embeddings" in shap_values:
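Note: `group_embeddings` now unconditionally returns the (possibly rewritten) frame plus the list of array-valued columns. When more than three `*_emb\d+` columns are present they are packed into a single `__grouped_embeddings` column of per-row float lists for CatBoost's `embedding_features`; any column that already passes `is_valid_numeric_array_data` is kept as an embedding column too. A sketch with hypothetical column names:

    import pandas as pd

    df = pd.DataFrame({
        "f_emb0": [0.1, 0.2], "f_emb1": [0.3, 0.4],
        "f_emb2": [0.5, 0.6], "f_emb3": [0.7, 0.8],  # 4 > 3 -> grouped
        "price": [10.0, 20.0],
    })
    # group_embeddings would yield columns ["price", "__grouped_embeddings"],
    # where each __grouped_embeddings cell is a list like [0.1, 0.3, 0.5, 0.7]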
@@ -649,16 +725,19 @@ class CatBoostWrapper(EstimatorWrapper):
         return shap_values

     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
-        if self.exclude_features:
-            x = x.drop(columns=self.exclude_features)
         x, y, params = super()._prepare_to_calculate(x, y)
         if self.text_features:
             params["text_features"] = self.text_features
         if self.grouped_embedding_features:
             x, emb_columns = self.group_embeddings(x)
             params["embedding_features"] = emb_columns
+
         if self.cat_features:
-
+            for c in self.cat_features:
+                if is_numeric_dtype(x[c]):
+                    x[c] = x[c].fillna(np.nan)
+                elif x[c].dtype != "category":
+                    x[c] = x[c].fillna("NA")
             params["cat_features"] = self.cat_features

         return x, y, params
@@ -684,7 +763,7 @@ class CatBoostWrapper(EstimatorWrapper):
             )
             for f in high_cardinality_features:
                 self.text_features.remove(f)
-                self.
+                self.droped_features.append(f)
                 x = x.drop(columns=f, errors="ignore")
             return super().cross_val_predict(x, y, baseline_score_column)
         else:
@@ -703,23 +782,29 @@ class CatBoostWrapper(EstimatorWrapper):
                 embedding_features=self.grouped_embedding_features,
             )

-
-            shap_values_fold = estimator.get_feature_importance(data=fold_pool, type="ShapValues")
+            shap_values = estimator.get_feature_importance(data=fold_pool, type="ShapValues")

-            # Remove last columns (base value) and flatten
             if self.target_type == ModelTaskType.MULTICLASS:
-
-
+                # For multiclass, shap_values has shape (n_samples, n_classes, n_features + 1)
+                # Last column is bias term
+                shap_values = shap_values[:, :, :-1]  # Remove bias term
+                # Average SHAP values across classes
+                shap_values = np.mean(np.abs(shap_values), axis=1)
             else:
-
-
+                # For binary/regression, shap_values has shape (n_samples, n_features + 1)
+                # Last column is bias term
+                shap_values = shap_values[:, :-1]  # Remove bias term
+                # Take absolute values
+                shap_values = np.abs(shap_values)

-
+            feature_importance = {}
+            for i, col in enumerate(x.columns):
+                feature_importance[col] = np.mean(np.abs(shap_values[:, i]))

-            return
+            return feature_importance

-        except Exception:
-            self.logger.exception("Failed to recalculate new SHAP values")
+        except Exception as e:
+            self.logger.exception(f"Failed to recalculate new SHAP values: {str(e)}")
             return None

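Note: CatBoost's `get_feature_importance(type="ShapValues")` appends a bias column, so the raw output is `(n_samples, n_classes, n_features + 1)` for multiclass and `(n_samples, n_features + 1)` otherwise; the rewritten branch strips the bias term and reduces to one mean-absolute value per feature. The shape bookkeeping, with made-up sizes:

    import numpy as np

    n_samples, n_classes, n_features = 100, 3, 10
    shap_values = np.zeros((n_samples, n_classes, n_features + 1))  # raw multiclass output
    shap_values = shap_values[:, :, :-1]                # drop bias term -> (100, 3, 10)
    shap_values = np.mean(np.abs(shap_values), axis=1)  # average over classes -> (100, 10)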
@@ -728,6 +813,7 @@ class LightGBMWrapper(EstimatorWrapper):
         self,
         estimator,
         scorer: Callable,
+        cat_features: Optional[List[str]],
         metric_name: str,
         multiplier: int,
         cv: BaseCrossValidator,
@@ -739,6 +825,7 @@ class LightGBMWrapper(EstimatorWrapper):
         super(LightGBMWrapper, self).__init__(
             estimator,
             scorer,
+            cat_features,
             metric_name,
             multiplier,
             cv,
@@ -747,7 +834,6 @@ class LightGBMWrapper(EstimatorWrapper):
             text_features=text_features,
             logger=logger,
         )
-        self.cat_features = None
         self.cat_encoder = None
         self.n_classes = None

@@ -759,30 +845,23 @@ class LightGBMWrapper(EstimatorWrapper):
         if self.target_type == ModelTaskType.BINARY:
             params["eval_metric"] = "auc"
             params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
-        self.cat_features = _get_cat_features(x)
         if self.cat_features:
-
-
-            encoded = pd.DataFrame(
-                encoder.fit_transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
-            )
+            encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, cols=self.cat_features, return_df=True)
+            encoded = encoder.fit_transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
             x[self.cat_features] = encoded
             self.cat_encoder = encoder
-
-
-
+        for c in x.columns:
+            if x[c].dtype not in ["category", "int64", "float64", "bool"]:
+                self.logger.warning(f"Feature {c} is not numeric and will be dropped")
+                self.droped_features.append(c)
+                x = x.drop(columns=c, errors="ignore")
         return x, y_numpy, groups, params

     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
         x, y_numpy, params = super()._prepare_to_calculate(x, y)
-        if self.cat_features is not None:
-
-
-            x[self.cat_features] = pd.DataFrame(
-                self.cat_encoder.transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
-            )
-        if not is_numeric_dtype(y):
-            y_numpy = correct_string_target(y_numpy)
+        if self.cat_features is not None and self.cat_encoder is not None:
+            encoded = self.cat_encoder.transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
+            x[self.cat_features] = encoded
         return x, y_numpy, params

     def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
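Note: categorical features for LightGBM are now target-encoded with the `category_encoders` CatBoostEncoder (fitted in `_prepare_to_fit`, reused by `_prepare_to_calculate`) rather than the previously imported OrdinalEncoder. A standalone sketch of the encode-then-reuse pattern, with toy data:

    import pandas as pd
    from category_encoders.cat_boost import CatBoostEncoder

    X, y = pd.DataFrame({"city": ["a", "b", "a", "c"]}), [1, 0, 1, 0]
    encoder = CatBoostEncoder(random_state=42, cols=["city"], return_df=True)
    train_enc = encoder.fit_transform(X[["city"]].astype("object"), y).astype("category")
    eval_enc = encoder.transform(X[["city"]].astype("object")).astype("category")  # replay on eval data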
@@ -808,20 +887,6 @@ class LightGBMWrapper(EstimatorWrapper):
             for i, col in enumerate(x.columns):
                 feature_importance[col] = np.mean(np.abs(shap_matrix[:, i]))

-            # # exclude last column (base value)
-            # shap_values_only = shap_values[:, :-1]
-            # mean_abs_shap = np.mean(np.abs(shap_values_only), axis=0)
-
-            # # For classification, shap_values is returned as a list for each class
-            # # Take values for the positive class
-            # if isinstance(shap_values, list):
-            #     shap_values = shap_values[1]
-
-            # # Calculate mean absolute SHAP value for each feature
-            # feature_importance = {}
-            # for i, col in enumerate(x.columns):
-            #     feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
-
             return feature_importance

         except Exception as e:
@@ -834,6 +899,7 @@ class OtherEstimatorWrapper(EstimatorWrapper):
         self,
         estimator,
         scorer: Callable,
+        cat_features: Optional[List[str]],
         metric_name: str,
         multiplier: int,
         cv: BaseCrossValidator,
@@ -845,6 +911,7 @@ class OtherEstimatorWrapper(EstimatorWrapper):
         super(OtherEstimatorWrapper, self).__init__(
             estimator,
             scorer,
+            cat_features,
             metric_name,
             multiplier,
             cv,
@@ -853,33 +920,33 @@ class OtherEstimatorWrapper(EstimatorWrapper):
             text_features=text_features,
             logger=logger,
         )
-        self.cat_features = None

     def _prepare_to_fit(self, x: pd.DataFrame, y: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
-        x,
-        self.cat_features = _get_cat_features(x)
+        x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
         num_features = [col for col in x.columns if col not in self.cat_features]
         x[num_features] = x[num_features].fillna(-999)
-
-
-
-        x[
-
-
-
+        if self.cat_features:
+            encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, return_df=True)
+            encoded = encoder.fit_transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
+            x[self.cat_features] = encoded
+            self.cat_encoder = encoder
+        for c in x.columns:
+            if x[c].dtype not in ["category", "int64", "float64", "bool"]:
+                self.logger.warning(f"Feature {c} is not numeric and will be dropped")
+                self.droped_features.append(c)
+                x = x.drop(columns=c, errors="ignore")
+        return x, y_numpy, groups, params

     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
-        x,
+        x, y_numpy, params = super()._prepare_to_calculate(x, y)
         if self.cat_features is not None:
             num_features = [col for col in x.columns if col not in self.cat_features]
             x[num_features] = x[num_features].fillna(-999)
-
-
-
-
-
-            y = correct_string_target(y)
-        return x, y, params
+            if self.cat_features and self.cat_encoder is not None:
+                x[self.cat_features] = self.cat_encoder.transform(
+                    x[self.cat_features].astype("object"), y_numpy
+                ).astype("category")
+        return x, y_numpy, params


 def validate_scoring_argument(scoring: Union[Callable, str, None]):
@@ -941,7 +1008,7 @@ def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
     return scoring, metric_name, multiplier


-def
+def define_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None]) -> Tuple[Callable, str, int]:
     if scoring is None:
         if target_type == ModelTaskType.BINARY:
             scoring = "roc_auc"
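Note: the scorer factory is now the named helper `define_scorer`, returning the `(scorer, metric_name, multiplier)` triple; with `scoring=None` it falls back to a per-task default. A usage sketch:

    from upgini.metadata import ModelTaskType

    scorer, metric_name, multiplier = define_scorer(ModelTaskType.BINARY, None)
    # binary default is roc_auc; the next hunk renames it to "GINI" in reports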
@@ -960,16 +1027,9 @@ def _get_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None])
     else:
         metric_name = str(scoring)

-
-
+    metric_name = "GINI" if metric_name.upper() == "ROC_AUC" and target_type == ModelTaskType.BINARY else metric_name

-
-    x: pd.DataFrame, text_features: Optional[List[str]] = None, emb_features: Optional[List[str]] = None
-) -> List[str]:
-    text_features = text_features or []
-    emb_features = emb_features or []
-    exclude_features = text_features + emb_features
-    return [c for c in x.columns if c not in exclude_features and not is_numeric_dtype(x[c])]
+    return scoring, metric_name, multiplier


 def _get_add_params(input_params, add_params):
@@ -1059,10 +1119,8 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutp
     return mse if squared else np.sqrt(mse)


-def
-
-
-
-
-        df.loc[na_filter, c] = NA_REPLACEMENT
-    return df
+def _get_unique_count(series: pd.Series) -> int:
+    try:
+        return series.nunique(dropna=False)
+    except TypeError:
+        return series.astype(str).nunique(dropna=False)