upgini 1.2.79a1__py3-none-any.whl → 1.2.81a3832.dev1__py3-none-any.whl
This diff shows the content of two publicly released versions of the package as they appear in a supported public registry. It is provided for informational purposes only and reflects the changes between those published versions.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +54 -35
- upgini/metrics.py +118 -138
- upgini/utils/target_utils.py +9 -6
- {upgini-1.2.79a1.dist-info → upgini-1.2.81a3832.dev1.dist-info}/METADATA +2 -1
- {upgini-1.2.79a1.dist-info → upgini-1.2.81a3832.dev1.dist-info}/RECORD +8 -8
- {upgini-1.2.79a1.dist-info → upgini-1.2.81a3832.dev1.dist-info}/WHEEL +0 -0
- {upgini-1.2.79a1.dist-info → upgini-1.2.81a3832.dev1.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
-__version__ = "1.2.79a1"
+__version__ = "1.2.81a3832.dev1"
upgini/features_enricher.py
CHANGED
@@ -63,7 +63,7 @@ from upgini.metadata import (
     RuntimeParameters,
     SearchKey,
 )
-from upgini.metrics import EstimatorWrapper, validate_scoring_argument
+from upgini.metrics import EstimatorWrapper, define_scorer, validate_scoring_argument
 from upgini.normalizer.normalize_utils import Normalizer
 from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
 from upgini.search_task import SearchTask
@@ -957,7 +957,7 @@ class FeaturesEnricher(TransformerMixin):
             self.__display_support_link(msg)
             return None
 
-
+        client_cat_features, search_keys_for_metrics = self._get_client_cat_features(
             estimator, validated_X, self.search_keys
         )
         search_keys_for_metrics.extend([c for c in self.id_columns or [] if c not in search_keys_for_metrics])
@@ -976,7 +976,7 @@ class FeaturesEnricher(TransformerMixin):
            search_keys_for_metrics=search_keys_for_metrics,
            progress_bar=progress_bar,
            progress_callback=progress_callback,
-           cat_features=
+           cat_features=client_cat_features,
        )
        if prepared_data is None:
            return None
@@ -994,11 +994,19 @@ class FeaturesEnricher(TransformerMixin):
         ) = prepared_data
 
         # rename cat_features
-        if
+        if client_cat_features:
             for new_c, old_c in columns_renaming.items():
-                if old_c in
-
-
+                if old_c in client_cat_features:
+                    client_cat_features.remove(old_c)
+                    client_cat_features.append(new_c)
+            for cat_feature in client_cat_features:
+                if cat_feature not in fitting_X.columns:
+                    self.logger.error(
+                        f"Client cat_feature `{cat_feature}` not found in"
+                        f" x columns: {fitting_X.columns.to_list()}"
+                    )
+        else:
+            client_cat_features = []
 
         gc.collect()
 
@@ -1019,20 +1027,17 @@ class FeaturesEnricher(TransformerMixin):
 
         has_date = self._get_date_column(search_keys) is not None
         model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
+        cat_features_from_backend = self.__get_categorical_features()
+        cat_features = list(set(client_cat_features + cat_features_from_backend))
+        baseline_cat_features = [f for f in cat_features if f in fitting_X.columns]
+        enriched_cat_features = [f for f in cat_features if f in fitting_enriched_X.columns]
+        if len(enriched_cat_features) < len(cat_features):
+            missing_cat_features = [f for f in cat_features if f not in fitting_enriched_X.columns]
+            self.logger.warning(
+                f"Some cat_features were not found in enriched_X: {missing_cat_features}"
+            )
 
-
-            estimator,
-            self.logger,
-            model_task_type,
-            _cv,
-            fitting_enriched_X,
-            scoring,
-            groups=groups,
-            text_features=text_features,
-            has_date=has_date,
-        )
-        metric = wrapper.metric_name
-        multiplier = wrapper.multiplier
+        _, metric, multiplier = define_scorer(model_task_type, scoring)
 
         # 1 If client features are presented - fit and predict with KFold estimator
         # on etalon features and calculate baseline metric
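Note: the hunk above replaces an EstimatorWrapper-based lookup of `metric_name` and `multiplier` with a direct call to the new `define_scorer` helper. A minimal usage sketch, assuming only the signature visible in this diff, `define_scorer(target_type, scoring) -> (scorer, metric_name, multiplier)`:

    from upgini.metadata import ModelTaskType
    from upgini.metrics import define_scorer

    # With scoring=None and a binary task, define_scorer falls back to
    # "roc_auc", and the displayed metric name is rewritten to "GINI"
    # (per the metrics.py hunks below).
    scorer, metric_name, multiplier = define_scorer(ModelTaskType.BINARY, None)
    print(metric_name)  # "GINI"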
@@ -1050,9 +1055,8 @@ class FeaturesEnricher(TransformerMixin):
             self.logger,
             model_task_type,
             _cv,
-
-
-            cat_features,
+            scoring=scoring,
+            cat_features=baseline_cat_features,
             add_params=custom_loss_add_params,
             groups=groups,
             text_features=text_features,
@@ -1085,9 +1089,8 @@ class FeaturesEnricher(TransformerMixin):
             self.logger,
             model_task_type,
             _cv,
-
-
-            cat_features,
+            scoring=scoring,
+            cat_features=enriched_cat_features,
             add_params=custom_loss_add_params,
             groups=groups,
             text_features=text_features,
@@ -1119,7 +1122,7 @@ class FeaturesEnricher(TransformerMixin):
             self.bundle.get("quality_metrics_rows_header"): _num_samples(fitting_X),
         }
         if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
-
+            y_sorted
         ):
             train_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
                 # np.mean(validated_y), 4
@@ -1197,7 +1200,7 @@ class FeaturesEnricher(TransformerMixin):
             # self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
         }
         if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
-
+            eval_y_sorted
         ):
             eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
                 # np.mean(validated_eval_set[idx][1]), 4
@@ -1428,12 +1431,20 @@ class FeaturesEnricher(TransformerMixin):
         if (
             estimator is not None
             and hasattr(estimator, "get_param")
+            and hasattr(estimator, "_init_params")
             and estimator.get_param("cat_features") is not None
         ):
-
-            if
-
-
+            estimator_cat_features = estimator.get_param("cat_features")
+            if all([isinstance(c, int) for c in estimator_cat_features]):
+                cat_features = [X.columns[idx] for idx in estimator_cat_features]
+            elif all([isinstance(c, str) for c in estimator_cat_features]):
+                cat_features = estimator_cat_features
+            else:
+                print(f"WARNING: Unsupported type of cat_features in CatBoost estimator: {estimator_cat_features}")
+
+            del estimator._init_params["cat_features"]
+
+        if cat_features:
             self.logger.info(f"Collected categorical features {cat_features} from user estimator")
             for cat_feature in cat_features:
                 if cat_feature in search_keys:
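Note: CatBoost accepts `cat_features` either as column indices or as column names, which is why the new branch above normalizes both forms to names before reuse. A standalone toy run of that normalization (made-up data):

    import pandas as pd

    X = pd.DataFrame({"country": ["US", "DE"], "age": [30, 40]})
    estimator_cat_features = [0]  # CatBoost allows indices or names here

    if all(isinstance(c, int) for c in estimator_cat_features):
        cat_features = [X.columns[idx] for idx in estimator_cat_features]
    elif all(isinstance(c, str) for c in estimator_cat_features):
        cat_features = list(estimator_cat_features)

    print(cat_features)  # ['country']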
@@ -3855,6 +3866,13 @@ if response.status_code == 200:
 
         return importances
 
+    def __get_categorical_features(self) -> List[str]:
+        features_meta = self._search_task.get_all_features_metadata_v2()
+        if features_meta is None:
+            raise Exception(self.bundle.get("missing_features_meta"))
+
+        return [f.name for f in features_meta if f.type == "categorical"]
+
     def __prepare_feature_importances(
         self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
     ):
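Note: the new private helper filters the search task's feature metadata down to categorical column names. A self-contained toy equivalent, assuming only that metadata items expose `.name` and `.type` as used in the hunk above (the `FeatureMeta` stand-in below is hypothetical):

    from dataclasses import dataclass

    @dataclass
    class FeatureMeta:  # hypothetical stand-in for upgini's feature metadata
        name: str
        type: str

    features_meta = [FeatureMeta("country", "categorical"), FeatureMeta("age", "numeric")]
    print([f.name for f in features_meta if f.type == "categorical"])  # ['country']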
@@ -3886,9 +3904,10 @@ if response.status_code == 200:
             if updated_shaps is not None:
                 updating_shap = updated_shaps.get(feature_meta.name)
                 if updating_shap is None:
-
-
-
+                    if feature_meta.shap_value != 0.0:
+                        self.logger.warning(
+                            f"WARNING: Shap value for feature {feature_meta.name} not found and will be set to 0.0"
+                        )
                     updating_shap = 0.0
                 feature_meta.shap_value = updating_shap
 
upgini/metrics.py
CHANGED
@@ -11,15 +11,15 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 import lightgbm as lgb
 import numpy as np
 import pandas as pd
+from category_encoders.cat_boost import CatBoostEncoder
 from lightgbm import LGBMClassifier, LGBMRegressor
 from numpy import log1p
 from pandas.api.types import is_numeric_dtype
 from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
-from sklearn.preprocessing import OrdinalEncoder
 
+from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
 from upgini.utils.features_validator import FeaturesValidator
 from upgini.utils.sklearn_ext import cross_validate
-from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
 
 try:
     from sklearn.metrics import get_scorer_names
@@ -36,7 +36,7 @@ from sklearn.model_selection import BaseCrossValidator, TimeSeriesSplit
 from upgini.errors import ValidationError
 from upgini.metadata import ModelTaskType
 from upgini.resource_bundle import bundle
-from upgini.utils.target_utils import
+from upgini.utils.target_utils import prepare_target
 
 DEFAULT_RANDOM_STATE = 42
 
@@ -99,8 +99,7 @@ LIGHTGBM_REGRESSION_PARAMS = {
     "min_sum_hessian_in_leaf": 0.01,
     "objective": "huber",
     "deterministic": "true",
-    "force_col_wise": "true",
-    "force_row_wise": "true",
+    # "force_col_wise": "true",
     "verbosity": -1,
 }
 
@@ -120,8 +119,7 @@ LIGHTGBM_MULTICLASS_PARAMS = {
     "num_grad_quant_bins": "8",
     "stochastic_rounding": "true",
     "deterministic": "true",
-    "force_col_wise": "true",
-    "force_row_wise": "true",
+    # "force_col_wise": "true",
     "verbosity": -1,
 }
 
@@ -138,8 +136,7 @@ LIGHTGBM_BINARY_PARAMS = {
     "cat_smooth": 18,
     "cat_l2": 8,
     "deterministic": "true",
-    "force_col_wise": "true",
-    "force_row_wise": "true",
+    # "force_col_wise": "true",
     "verbosity": -1,
 }
 
@@ -148,33 +145,33 @@ LIGHTGBM_EARLY_STOPPING_ROUNDS = 20
 N_FOLDS = 5
 BLOCKED_TS_TEST_SIZE = 0.2
 
-NA_VALUES = [
-    "",
-    " ",
-    " ",
-    "#n/a",
-    "#n/a n/a",
-    "#na",
-    "-1.#ind",
-    "-1.#qnan",
-    "-nan",
-    "1.#ind",
-    "1.#qnan",
-    "n/a",
-    "na",
-    "null",
-    "nan",
-    "n/a",
-    "nan",
-    "none",
-    "-",
-    "undefined",
-    "[[unknown]]",
-    "[not provided]",
-    "[unknown]",
-]
-
-NA_REPLACEMENT = "NA"
+# NA_VALUES = [
+#     "",
+#     " ",
+#     " ",
+#     "#n/a",
+#     "#n/a n/a",
+#     "#na",
+#     "-1.#ind",
+#     "-1.#qnan",
+#     "-nan",
+#     "1.#ind",
+#     "1.#qnan",
+#     "n/a",
+#     "na",
+#     "null",
+#     "nan",
+#     "n/a",
+#     "nan",
+#     "none",
+#     "-",
+#     "undefined",
+#     "[[unknown]]",
+#     "[not provided]",
+#     "[unknown]",
+# ]
+
+# NA_REPLACEMENT = "NA"
 
 SUPPORTED_CATBOOST_METRICS = {
     s.upper(): s
@@ -290,6 +287,7 @@ class EstimatorWrapper:
         self,
         estimator,
         scorer: Callable,
+        cat_features: Optional[List[str]],
         metric_name: str,
         multiplier: int,
         cv: BaseCrossValidator,
@@ -301,9 +299,8 @@ class EstimatorWrapper:
     ):
         self.estimator = estimator
         self.scorer = scorer
-        self.
-
-        )
+        self.cat_features = cat_features
+        self.metric_name = metric_name
         self.multiplier = multiplier
         self.cv = cv
         self.target_type = target_type
@@ -348,6 +345,8 @@ class EstimatorWrapper:
         else:
             x, y = self._remove_empty_target_rows(x, y)
 
+        y = prepare_target(y, self.target_type)
+
         self.logger.info(f"After preparing data columns: {x.columns.to_list()}")
         return x, y, groups
 
@@ -468,7 +467,7 @@ class EstimatorWrapper:
         logger: logging.Logger,
         target_type: ModelTaskType,
         cv: BaseCrossValidator,
-
+        *,
         scoring: Union[Callable, str, None] = None,
         cat_features: Optional[List[str]] = None,
         text_features: Optional[List[str]] = None,
@@ -476,9 +475,10 @@ class EstimatorWrapper:
         groups: Optional[List[str]] = None,
         has_date: Optional[bool] = None,
     ) -> EstimatorWrapper:
-        scorer, metric_name, multiplier =
+        scorer, metric_name, multiplier = define_scorer(target_type, scoring)
         kwargs = {
             "scorer": scorer,
+            "cat_features": cat_features,
             "metric_name": metric_name,
             "multiplier": multiplier,
             "cv": cv,
@@ -512,11 +512,6 @@ class EstimatorWrapper:
             kwargs["estimator"] = estimator_copy
             if is_catboost_estimator(estimator):
                 if cat_features is not None:
-                    for cat_feature in cat_features:
-                        if cat_feature not in x.columns:
-                            logger.error(
-                                f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
-                            )
                     estimator_copy.set_params(cat_features=cat_features, has_time=has_date)
                 estimator = CatBoostWrapper(**kwargs)
             else:
@@ -539,6 +534,7 @@ class CatBoostWrapper(EstimatorWrapper):
         self,
         estimator,
         scorer: Callable,
+        cat_features: Optional[List[str]],
         metric_name: str,
         multiplier: int,
         cv: BaseCrossValidator,
@@ -550,6 +546,7 @@ class CatBoostWrapper(EstimatorWrapper):
         super(CatBoostWrapper, self).__init__(
             estimator,
             scorer,
+            cat_features,
             metric_name,
             multiplier,
             cv,
@@ -558,10 +555,9 @@ class CatBoostWrapper(EstimatorWrapper):
             text_features=text_features,
             logger=logger,
         )
-        self.cat_features = None
         self.emb_features = None
         self.grouped_embedding_features = None
-        self.
+        self.drop_cat_features = []
 
     def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
         x, y, groups, params = super()._prepare_to_fit(x, y)
@@ -598,36 +594,9 @@ class CatBoostWrapper(EstimatorWrapper):
             self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
 
         # Find rest categorical features
-        self.cat_features
-
-
-        for name in self.cat_features:
-            # Remove constant categorical features
-            if x[name].nunique() > 1:
-                unique_cat_features.append(name)
-            else:
-                self.logger.info(f"Drop column {name} on preparing data for fit")
-                x = x.drop(columns=name)
-                self.exclude_features.append(name)
-        self.cat_features = unique_cat_features
-        if (
-            hasattr(self.estimator, "get_param")
-            and hasattr(self.estimator, "_init_params")
-            and self.estimator.get_param("cat_features") is not None
-        ):
-            estimator_cat_features = self.estimator.get_param("cat_features")
-            if all([isinstance(c, int) for c in estimator_cat_features]):
-                cat_features_idx = {x.columns.get_loc(c) for c in self.cat_features}
-                cat_features_idx.update(estimator_cat_features)
-                self.cat_features = [x.columns[idx] for idx in cat_features_idx]
-            elif all([isinstance(c, str) for c in estimator_cat_features]):
-                self.cat_features = list(set(self.cat_features + estimator_cat_features))
-            else:
-                print(f"WARNING: Unsupported type of cat_features in CatBoost estimator: {estimator_cat_features}")
-
-            del self.estimator._init_params["cat_features"]
-
-        self.logger.info(f"Selected categorical features: {self.cat_features}")
+        self.cat_features, self.features_to_encode, self.exclude_features = _get_cat_features(
+            self.logger, x, self.cat_features, self.text_features, self.grouped_embedding_features
+        )
         params["cat_features"] = self.cat_features
 
         return x, y, groups, params
@@ -658,7 +627,6 @@ class CatBoostWrapper(EstimatorWrapper):
             x, emb_columns = self.group_embeddings(x)
             params["embedding_features"] = emb_columns
         if self.cat_features:
-            # x = fill_na_cat_features(x, self.cat_features)
             params["cat_features"] = self.cat_features
 
         return x, y, params
@@ -728,6 +696,7 @@ class LightGBMWrapper(EstimatorWrapper):
         self,
         estimator,
         scorer: Callable,
+        cat_features: Optional[List[str]],
         metric_name: str,
         multiplier: int,
         cv: BaseCrossValidator,
@@ -739,6 +708,7 @@ class LightGBMWrapper(EstimatorWrapper):
         super(LightGBMWrapper, self).__init__(
             estimator,
             scorer,
+            cat_features,
             metric_name,
             multiplier,
             cv,
@@ -747,9 +717,10 @@ class LightGBMWrapper(EstimatorWrapper):
             text_features=text_features,
             logger=logger,
         )
-        self.cat_features = None
         self.cat_encoder = None
         self.n_classes = None
+        self.exclude_features = []
+        self.features_to_encode = []
 
     def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
         x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
@@ -759,30 +730,25 @@ class LightGBMWrapper(EstimatorWrapper):
         if self.target_type == ModelTaskType.BINARY:
             params["eval_metric"] = "auc"
         params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
-        self.cat_features = _get_cat_features(
-
-
-
-
-
-            x[self.cat_features] = encoded
+        self.cat_features, self.features_to_encode, self.exclude_features = _get_cat_features(
+            self.logger, x, self.cat_features
+        )
+        if self.features_to_encode:
+            encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, return_df=True)
+            encoded = encoder.fit_transform(x[self.features_to_encode].astype("object"), y_numpy).astype("category")
+            x[self.features_to_encode] = encoded
             self.cat_encoder = encoder
-        if not is_numeric_dtype(y_numpy):
-            y_numpy = correct_string_target(y_numpy)
 
         return x, y_numpy, groups, params
 
     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
+        if self.exclude_features:
+            x = x.drop(columns=self.exclude_features)
         x, y_numpy, params = super()._prepare_to_calculate(x, y)
-        if self.
-            x =
-
-
-                self.cat_encoder.transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
-            )
-        if not is_numeric_dtype(y):
-            y_numpy = correct_string_target(y_numpy)
+        if self.features_to_encode is not None and self.cat_encoder is not None:
+            x[self.features_to_encode] = self.cat_encoder.transform(x[self.features_to_encode].astype("object")).astype(
+                "category"
+            )
         return x, y_numpy, params
 
     def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
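Note: the LightGBM path now target-encodes string categoricals with category_encoders' CatBoostEncoder (the new dependency declared in METADATA below) instead of ordinal-encoding them: the encoder is fitted with the target on training data and only applied at calculation time. A minimal standalone sketch of that round trip on toy data, mirroring the astype("object")/astype("category") steps in the hunk:

    import pandas as pd
    from category_encoders.cat_boost import CatBoostEncoder

    x_train = pd.DataFrame({"city": ["a", "b", "a", "c"]})
    y_train = pd.Series([1, 0, 1, 0])
    x_eval = pd.DataFrame({"city": ["b", "c"]})
    cols = ["city"]

    encoder = CatBoostEncoder(random_state=42, return_df=True)
    # Fit on train with the target, as in _prepare_to_fit ...
    x_train[cols] = encoder.fit_transform(x_train[cols].astype("object"), y_train).astype("category")
    # ... then transform only, as in _prepare_to_calculate.
    x_eval[cols] = encoder.transform(x_eval[cols].astype("object")).astype("category")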
@@ -808,20 +774,6 @@ class LightGBMWrapper(EstimatorWrapper):
             for i, col in enumerate(x.columns):
                 feature_importance[col] = np.mean(np.abs(shap_matrix[:, i]))
 
-            # # exclude last column (base value)
-            # shap_values_only = shap_values[:, :-1]
-            # mean_abs_shap = np.mean(np.abs(shap_values_only), axis=0)
-
-            # # For classification, shap_values is returned as a list for each class
-            # # Take values for the positive class
-            # if isinstance(shap_values, list):
-            #     shap_values = shap_values[1]
-
-            # # Calculate mean absolute SHAP value for each feature
-            # feature_importance = {}
-            # for i, col in enumerate(x.columns):
-            #     feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
-
             return feature_importance
 
         except Exception as e:
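Note: the surviving code above reduces the SHAP matrix to one importance per feature by averaging absolute contributions over rows; the commented-out variant it deletes did the same on a per-class list. A toy run of the kept reduction:

    import numpy as np

    shap_matrix = np.array([[0.5, -1.0],
                            [-0.5, 2.0]])
    columns = ["f1", "f2"]
    feature_importance = {col: np.mean(np.abs(shap_matrix[:, i])) for i, col in enumerate(columns)}
    print(feature_importance)  # f1 -> 0.5, f2 -> 1.5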
@@ -834,6 +786,7 @@ class OtherEstimatorWrapper(EstimatorWrapper):
         self,
         estimator,
         scorer: Callable,
+        cat_features: Optional[List[str]],
         metric_name: str,
         multiplier: int,
         cv: BaseCrossValidator,
@@ -845,6 +798,7 @@ class OtherEstimatorWrapper(EstimatorWrapper):
         super(OtherEstimatorWrapper, self).__init__(
             estimator,
             scorer,
+            cat_features,
             metric_name,
             multiplier,
             cv,
@@ -853,32 +807,32 @@ class OtherEstimatorWrapper(EstimatorWrapper):
             text_features=text_features,
             logger=logger,
         )
-        self.cat_features = None
 
     def _prepare_to_fit(self, x: pd.DataFrame, y: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
-        x,
-        self.cat_features = _get_cat_features(
+        x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
+        self.cat_features, self.features_to_encode, self.exclude_features = _get_cat_features(
+            self.logger, x, self.cat_features
+        )
         num_features = [col for col in x.columns if col not in self.cat_features]
         x[num_features] = x[num_features].fillna(-999)
-
-
-
-        x[
-
-
-        return x, y, groups, params
+        if self.cat_features:
+            encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, return_df=True)
+            encoded = encoder.fit_transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
+            x[self.cat_features] = encoded
+            self.cat_encoder = encoder
+        return x, y_numpy, groups, params
 
     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
+        if self.exclude_features:
+            x = x.drop(columns=self.exclude_features)
         x, y, params = super()._prepare_to_calculate(x, y)
         if self.cat_features is not None:
             num_features = [col for col in x.columns if col not in self.cat_features]
             x[num_features] = x[num_features].fillna(-999)
-
-
-
-
-        if not is_numeric_dtype(y):
-            y = correct_string_target(y)
+        if self.features_to_encode and self.cat_encoder is not None:
+            x[self.features_to_encode] = self.cat_encoder.transform(x[self.features_to_encode].astype("object")).astype(
+                "category"
+            )
         return x, y, params
 
 
@@ -941,7 +895,7 @@ def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
     return scoring, metric_name, multiplier
 
 
-def _get_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None]) -> Tuple[Callable, str, int]:
+def define_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None]) -> Tuple[Callable, str, int]:
     if scoring is None:
         if target_type == ModelTaskType.BINARY:
             scoring = "roc_auc"
@@ -960,16 +914,42 @@ def _get_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None])
     else:
         metric_name = str(scoring)
 
+    metric_name = "GINI" if metric_name.upper() == "ROC_AUC" and target_type == ModelTaskType.BINARY else metric_name
+
     return scoring, metric_name, multiplier
 
 
 def _get_cat_features(
-
+    logger: logging.Logger,
+    x: pd.DataFrame,
+    cat_features: Optional[List[str]],
+    text_features: Optional[List[str]] = None,
+    emb_features: Optional[List[str]] = None,
 ) -> List[str]:
+    cat_features = cat_features or []
     text_features = text_features or []
     emb_features = emb_features or []
     exclude_features = text_features + emb_features
-
+    cat_features = [c for c in cat_features if c not in exclude_features]
+    unique_cat_features = []
+    drop_cat_features = []
+    for name in cat_features:
+        # Remove constant categorical features
+        if x[name].nunique() > 1:
+            unique_cat_features.append(name)
+        else:
+            logger.info(f"Drop column {name} on preparing data for fit")
+            x = x.drop(columns=name)
+            drop_cat_features.append(name)
+    cat_features = unique_cat_features
+
+    logger.info(f"Selected categorical features: {cat_features}")
+
+    features_to_encode = list(set(x.select_dtypes(exclude=[np.number, np.datetime64, pd.CategoricalDtype]).columns))
+
+    logger.info(f"Features to encode: {features_to_encode}")
+
+    return cat_features, features_to_encode, drop_cat_features
 
 
 def _get_add_params(input_params, add_params):
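Note: `_get_cat_features` now returns a triple: the surviving categorical columns, the subset that still needs target encoding, and the constant columns it dropped. A toy run of that selection logic on a made-up frame, reusing the exact `select_dtypes` filter from the hunk above:

    import numpy as np
    import pandas as pd

    x = pd.DataFrame(
        {
            "country": ["US", "DE", "FR"],    # varies -> kept as categorical
            "source": ["web", "web", "web"],  # constant -> dropped
            "age": [30, 40, 50],              # numeric -> never encoded
        }
    )
    cat_features, drop_cat_features = [], []
    for name in ["country", "source"]:
        if x[name].nunique() > 1:
            cat_features.append(name)
        else:
            x = x.drop(columns=name)
            drop_cat_features.append(name)

    features_to_encode = list(set(x.select_dtypes(exclude=[np.number, np.datetime64, pd.CategoricalDtype]).columns))
    print(cat_features, features_to_encode, drop_cat_features)
    # ['country'] ['country'] ['source']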
@@ -1059,10 +1039,10 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutp
     return mse if squared else np.sqrt(mse)
 
 
-def fill_na_cat_features(df: pd.DataFrame, cat_features: List[str]) -> pd.DataFrame:
-    for c in cat_features:
-        if c in df.columns:
-            df[c] = df[c].astype("string").fillna(NA_REPLACEMENT).astype(str)
-            na_filter = df[c].str.lower().isin(NA_VALUES)
-            df.loc[na_filter, c] = NA_REPLACEMENT
-    return df
+# def fill_na_cat_features(df: pd.DataFrame, cat_features: List[str]) -> pd.DataFrame:
+#     for c in cat_features:
+#         if c in df.columns:
+#             df[c] = df[c].astype("string").fillna(NA_REPLACEMENT).astype(str)
+#             na_filter = df[c].str.lower().isin(NA_VALUES)
+#             df.loc[na_filter, c] = NA_REPLACEMENT
+#     return df
upgini/utils/target_utils.py
CHANGED
@@ -3,7 +3,7 @@ from typing import Callable, List, Optional, Union
 
 import numpy as np
 import pandas as pd
-from pandas.api.types import
+from pandas.api.types import is_bool_dtype, is_datetime64_any_dtype, is_numeric_dtype
 
 from upgini.errors import ValidationError
 from upgini.metadata import SYSTEM_RECORD_ID, CVType, ModelTaskType
@@ -14,11 +14,14 @@ from upgini.utils.ts_utils import get_most_frequent_time_unit, trunc_datetime
 TS_MIN_DIFFERENT_IDS_RATIO = 0.2
 
 
-def
-    if
-
-
-
+def prepare_target(y: Union[pd.Series, np.ndarray], target_type: ModelTaskType) -> Union[pd.Series, np.ndarray]:
+    if target_type != ModelTaskType.REGRESSION or (not is_numeric_dtype(y) and not is_datetime64_any_dtype(y)):
+        if isinstance(y, pd.Series):
+            y = y.astype(str).astype("category").cat.codes
+        elif isinstance(y, np.ndarray):
+            y = pd.Series(y).astype(str).astype("category").cat.codes.values
+
+    return y
 
 
 def define_task(
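Note: `prepare_target` (added above, and called from `EstimatorWrapper._prepare_to_fit` in metrics.py) label-encodes the target whenever the task is not regression, or the target is neither numeric nor datetime. A toy illustration of the transform it applies to a Series:

    import pandas as pd

    y = pd.Series(["cat", "dog", "cat"])
    print(y.astype(str).astype("category").cat.codes.tolist())  # [0, 1, 0]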
{upgini-1.2.79a1.dist-info → upgini-1.2.81a3832.dev1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.79a1
+Version: 1.2.81a3832.dev1
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
@@ -22,6 +22,7 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Requires-Python: <3.12,>=3.10
+Requires-Dist: category-encoders>=2.8.1
 Requires-Dist: fastparquet>=0.8.1
 Requires-Dist: ipywidgets>=8.1.0
 Requires-Dist: jarowinkler>=2.0.0
{upgini-1.2.79a1.dist-info → upgini-1.2.81a3832.dev1.dist-info}/RECORD
CHANGED
@@ -1,12 +1,12 @@
-upgini/__about__.py,sha256=
+upgini/__about__.py,sha256=-WSXUS5Ith33qArTnDO4LmrI0wUaXbJ8bIzoMZvAsWU,33
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=
+upgini/features_enricher.py,sha256=qtrQJwF2QbKdQ8Tqk5RQj3aAqOzDgygD6nIHrco3AzE,209728
 upgini/http.py,sha256=UH7nswcZ221un3O_VW9limCBO5oRsyg1eKUHiVslRPs,43737
 upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
-upgini/metrics.py,sha256=
+upgini/metrics.py,sha256=95sK1Kr3dYxqQcdkkoNFDe9OZY7OhgLjYwe3bhMQd38,38087
 upgini/search_task.py,sha256=RcvAE785yksWTsTNWuZFVNlk32jHElMoEna1T_C5N8Q,17823
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -66,11 +66,11 @@ upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml
 upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
 upgini/utils/sklearn_ext.py,sha256=HpaNQaKJisgNE7IZ71n7uswxTj7kbPglU2G3s1sORAc,45042
 upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
-upgini/utils/target_utils.py,sha256=
+upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,16832
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.79a1.dist-info/METADATA,sha256=
-upgini-1.2.79a1.dist-info/WHEEL,sha256=
-upgini-1.2.79a1.dist-info/licenses/LICENSE,sha256=
-upgini-1.2.79a1.dist-info/RECORD,,
+upgini-1.2.81a3832.dev1.dist-info/METADATA,sha256=ShIRi8EeeujsKBJ0byR2XWJ6DKFka2vrViq9d5VwjzU,49141
+upgini-1.2.81a3832.dev1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.81a3832.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.81a3832.dev1.dist-info/RECORD,,
{upgini-1.2.79a1.dist-info → upgini-1.2.81a3832.dev1.dist-info}/WHEEL
File without changes

{upgini-1.2.79a1.dist-info → upgini-1.2.81a3832.dev1.dist-info}/licenses/LICENSE
File without changes