upgini 1.2.80__py3-none-any.whl → 1.2.81a3832.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +48 -30
- upgini/metrics.py +81 -98
- upgini/utils/target_utils.py +9 -6
- {upgini-1.2.80.dist-info → upgini-1.2.81a3832.dev1.dist-info}/METADATA +2 -1
- {upgini-1.2.80.dist-info → upgini-1.2.81a3832.dev1.dist-info}/RECORD +8 -8
- {upgini-1.2.80.dist-info → upgini-1.2.81a3832.dev1.dist-info}/WHEEL +0 -0
- {upgini-1.2.80.dist-info → upgini-1.2.81a3832.dev1.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.80"
|
1
|
+
__version__ = "1.2.81a3832.dev1"
|
upgini/features_enricher.py
CHANGED
@@ -63,7 +63,7 @@ from upgini.metadata import (
|
|
63
63
|
RuntimeParameters,
|
64
64
|
SearchKey,
|
65
65
|
)
|
66
|
-
from upgini.metrics import EstimatorWrapper, validate_scoring_argument
|
66
|
+
from upgini.metrics import EstimatorWrapper, define_scorer, validate_scoring_argument
|
67
67
|
from upgini.normalizer.normalize_utils import Normalizer
|
68
68
|
from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
|
69
69
|
from upgini.search_task import SearchTask
|
@@ -957,7 +957,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
957
957
|
self.__display_support_link(msg)
|
958
958
|
return None
|
959
959
|
|
960
|
-
|
960
|
+
client_cat_features, search_keys_for_metrics = self._get_client_cat_features(
|
961
961
|
estimator, validated_X, self.search_keys
|
962
962
|
)
|
963
963
|
search_keys_for_metrics.extend([c for c in self.id_columns or [] if c not in search_keys_for_metrics])
|
@@ -976,7 +976,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
976
976
|
search_keys_for_metrics=search_keys_for_metrics,
|
977
977
|
progress_bar=progress_bar,
|
978
978
|
progress_callback=progress_callback,
|
979
|
-
cat_features=
|
979
|
+
cat_features=client_cat_features,
|
980
980
|
)
|
981
981
|
if prepared_data is None:
|
982
982
|
return None
|
@@ -994,11 +994,19 @@ class FeaturesEnricher(TransformerMixin):
|
|
994
994
|
) = prepared_data
|
995
995
|
|
996
996
|
# rename cat_features
|
997
|
-
if
|
997
|
+
if client_cat_features:
|
998
998
|
for new_c, old_c in columns_renaming.items():
|
999
|
-
if old_c in
|
1000
|
-
|
1001
|
-
|
999
|
+
if old_c in client_cat_features:
|
1000
|
+
client_cat_features.remove(old_c)
|
1001
|
+
client_cat_features.append(new_c)
|
1002
|
+
for cat_feature in client_cat_features:
|
1003
|
+
if cat_feature not in fitting_X.columns:
|
1004
|
+
self.logger.error(
|
1005
|
+
f"Client cat_feature `{cat_feature}` not found in"
|
1006
|
+
f" x columns: {fitting_X.columns.to_list()}"
|
1007
|
+
)
|
1008
|
+
else:
|
1009
|
+
client_cat_features = []
|
1002
1010
|
|
1003
1011
|
gc.collect()
|
1004
1012
|
|
@@ -1019,20 +1027,17 @@ class FeaturesEnricher(TransformerMixin):
|
|
1019
1027
|
|
1020
1028
|
has_date = self._get_date_column(search_keys) is not None
|
1021
1029
|
model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
|
1030
|
+
cat_features_from_backend = self.__get_categorical_features()
|
1031
|
+
cat_features = list(set(client_cat_features + cat_features_from_backend))
|
1032
|
+
baseline_cat_features = [f for f in cat_features if f in fitting_X.columns]
|
1033
|
+
enriched_cat_features = [f for f in cat_features if f in fitting_enriched_X.columns]
|
1034
|
+
if len(enriched_cat_features) < len(cat_features):
|
1035
|
+
missing_cat_features = [f for f in cat_features if f not in fitting_enriched_X.columns]
|
1036
|
+
self.logger.warning(
|
1037
|
+
f"Some cat_features were not found in enriched_X: {missing_cat_features}"
|
1038
|
+
)
|
1022
1039
|
|
1023
|
-
|
1024
|
-
estimator,
|
1025
|
-
self.logger,
|
1026
|
-
model_task_type,
|
1027
|
-
_cv,
|
1028
|
-
fitting_enriched_X,
|
1029
|
-
scoring,
|
1030
|
-
groups=groups,
|
1031
|
-
text_features=text_features,
|
1032
|
-
has_date=has_date,
|
1033
|
-
)
|
1034
|
-
metric = wrapper.metric_name
|
1035
|
-
multiplier = wrapper.multiplier
|
1040
|
+
_, metric, multiplier = define_scorer(model_task_type, scoring)
|
1036
1041
|
|
1037
1042
|
# 1 If client features are presented - fit and predict with KFold estimator
|
1038
1043
|
# on etalon features and calculate baseline metric
|
@@ -1050,9 +1055,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
1050
1055
|
self.logger,
|
1051
1056
|
model_task_type,
|
1052
1057
|
_cv,
|
1053
|
-
|
1054
|
-
|
1055
|
-
cat_features,
|
1058
|
+
scoring=scoring,
|
1059
|
+
cat_features=baseline_cat_features,
|
1056
1060
|
add_params=custom_loss_add_params,
|
1057
1061
|
groups=groups,
|
1058
1062
|
text_features=text_features,
|
@@ -1085,9 +1089,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
1085
1089
|
self.logger,
|
1086
1090
|
model_task_type,
|
1087
1091
|
_cv,
|
1088
|
-
|
1089
|
-
|
1090
|
-
cat_features,
|
1092
|
+
scoring=scoring,
|
1093
|
+
cat_features=enriched_cat_features,
|
1091
1094
|
add_params=custom_loss_add_params,
|
1092
1095
|
groups=groups,
|
1093
1096
|
text_features=text_features,
|
@@ -1428,12 +1431,20 @@ class FeaturesEnricher(TransformerMixin):
|
|
1428
1431
|
if (
|
1429
1432
|
estimator is not None
|
1430
1433
|
and hasattr(estimator, "get_param")
|
1434
|
+
and hasattr(estimator, "_init_params")
|
1431
1435
|
and estimator.get_param("cat_features") is not None
|
1432
1436
|
):
|
1433
|
-
|
1434
|
-
if
|
1435
|
-
|
1436
|
-
|
1437
|
+
estimator_cat_features = estimator.get_param("cat_features")
|
1438
|
+
if all([isinstance(c, int) for c in estimator_cat_features]):
|
1439
|
+
cat_features = [X.columns[idx] for idx in estimator_cat_features]
|
1440
|
+
elif all([isinstance(c, str) for c in estimator_cat_features]):
|
1441
|
+
cat_features = estimator_cat_features
|
1442
|
+
else:
|
1443
|
+
print(f"WARNING: Unsupported type of cat_features in CatBoost estimator: {estimator_cat_features}")
|
1444
|
+
|
1445
|
+
del estimator._init_params["cat_features"]
|
1446
|
+
|
1447
|
+
if cat_features:
|
1437
1448
|
self.logger.info(f"Collected categorical features {cat_features} from user estimator")
|
1438
1449
|
for cat_feature in cat_features:
|
1439
1450
|
if cat_feature in search_keys:
|
@@ -3855,6 +3866,13 @@ if response.status_code == 200:
|
|
3855
3866
|
|
3856
3867
|
return importances
|
3857
3868
|
|
3869
|
+
def __get_categorical_features(self) -> List[str]:
|
3870
|
+
features_meta = self._search_task.get_all_features_metadata_v2()
|
3871
|
+
if features_meta is None:
|
3872
|
+
raise Exception(self.bundle.get("missing_features_meta"))
|
3873
|
+
|
3874
|
+
return [f.name for f in features_meta if f.type == "categorical"]
|
3875
|
+
|
3858
3876
|
def __prepare_feature_importances(
|
3859
3877
|
self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
|
3860
3878
|
):
|
upgini/metrics.py
CHANGED
@@ -11,15 +11,15 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
11
11
|
import lightgbm as lgb
|
12
12
|
import numpy as np
|
13
13
|
import pandas as pd
|
14
|
+
from category_encoders.cat_boost import CatBoostEncoder
|
14
15
|
from lightgbm import LGBMClassifier, LGBMRegressor
|
15
16
|
from numpy import log1p
|
16
17
|
from pandas.api.types import is_numeric_dtype
|
17
18
|
from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
|
18
|
-
from sklearn.preprocessing import OrdinalEncoder
|
19
19
|
|
20
|
+
from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
|
20
21
|
from upgini.utils.features_validator import FeaturesValidator
|
21
22
|
from upgini.utils.sklearn_ext import cross_validate
|
22
|
-
from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
|
23
23
|
|
24
24
|
try:
|
25
25
|
from sklearn.metrics import get_scorer_names
|
@@ -36,7 +36,7 @@ from sklearn.model_selection import BaseCrossValidator, TimeSeriesSplit
|
|
36
36
|
from upgini.errors import ValidationError
|
37
37
|
from upgini.metadata import ModelTaskType
|
38
38
|
from upgini.resource_bundle import bundle
|
39
|
-
from upgini.utils.target_utils import correct_string_target
|
39
|
+
from upgini.utils.target_utils import prepare_target
|
40
40
|
|
41
41
|
DEFAULT_RANDOM_STATE = 42
|
42
42
|
|
@@ -287,6 +287,7 @@ class EstimatorWrapper:
|
|
287
287
|
self,
|
288
288
|
estimator,
|
289
289
|
scorer: Callable,
|
290
|
+
cat_features: Optional[List[str]],
|
290
291
|
metric_name: str,
|
291
292
|
multiplier: int,
|
292
293
|
cv: BaseCrossValidator,
|
@@ -298,9 +299,8 @@ class EstimatorWrapper:
|
|
298
299
|
):
|
299
300
|
self.estimator = estimator
|
300
301
|
self.scorer = scorer
|
301
|
-
self.
|
302
|
-
|
303
|
-
)
|
302
|
+
self.cat_features = cat_features
|
303
|
+
self.metric_name = metric_name
|
304
304
|
self.multiplier = multiplier
|
305
305
|
self.cv = cv
|
306
306
|
self.target_type = target_type
|
@@ -345,6 +345,8 @@ class EstimatorWrapper:
|
|
345
345
|
else:
|
346
346
|
x, y = self._remove_empty_target_rows(x, y)
|
347
347
|
|
348
|
+
y = prepare_target(y, self.target_type)
|
349
|
+
|
348
350
|
self.logger.info(f"After preparing data columns: {x.columns.to_list()}")
|
349
351
|
return x, y, groups
|
350
352
|
|
@@ -465,7 +467,7 @@ class EstimatorWrapper:
|
|
465
467
|
logger: logging.Logger,
|
466
468
|
target_type: ModelTaskType,
|
467
469
|
cv: BaseCrossValidator,
|
468
|
-
|
470
|
+
*,
|
469
471
|
scoring: Union[Callable, str, None] = None,
|
470
472
|
cat_features: Optional[List[str]] = None,
|
471
473
|
text_features: Optional[List[str]] = None,
|
@@ -473,9 +475,10 @@ class EstimatorWrapper:
|
|
473
475
|
groups: Optional[List[str]] = None,
|
474
476
|
has_date: Optional[bool] = None,
|
475
477
|
) -> EstimatorWrapper:
|
476
|
-
scorer, metric_name, multiplier = _get_scorer(target_type, scoring)
|
478
|
+
scorer, metric_name, multiplier = define_scorer(target_type, scoring)
|
477
479
|
kwargs = {
|
478
480
|
"scorer": scorer,
|
481
|
+
"cat_features": cat_features,
|
479
482
|
"metric_name": metric_name,
|
480
483
|
"multiplier": multiplier,
|
481
484
|
"cv": cv,
|
@@ -509,11 +512,6 @@ class EstimatorWrapper:
|
|
509
512
|
kwargs["estimator"] = estimator_copy
|
510
513
|
if is_catboost_estimator(estimator):
|
511
514
|
if cat_features is not None:
|
512
|
-
for cat_feature in cat_features:
|
513
|
-
if cat_feature not in x.columns:
|
514
|
-
logger.error(
|
515
|
-
f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
|
516
|
-
)
|
517
515
|
estimator_copy.set_params(cat_features=cat_features, has_time=has_date)
|
518
516
|
estimator = CatBoostWrapper(**kwargs)
|
519
517
|
else:
|
@@ -536,6 +534,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
536
534
|
self,
|
537
535
|
estimator,
|
538
536
|
scorer: Callable,
|
537
|
+
cat_features: Optional[List[str]],
|
539
538
|
metric_name: str,
|
540
539
|
multiplier: int,
|
541
540
|
cv: BaseCrossValidator,
|
@@ -547,6 +546,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
547
546
|
super(CatBoostWrapper, self).__init__(
|
548
547
|
estimator,
|
549
548
|
scorer,
|
549
|
+
cat_features,
|
550
550
|
metric_name,
|
551
551
|
multiplier,
|
552
552
|
cv,
|
@@ -555,10 +555,9 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
555
555
|
text_features=text_features,
|
556
556
|
logger=logger,
|
557
557
|
)
|
558
|
-
self.cat_features = None
|
559
558
|
self.emb_features = None
|
560
559
|
self.grouped_embedding_features = None
|
561
|
-
self.
|
560
|
+
self.drop_cat_features = []
|
562
561
|
|
563
562
|
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
564
563
|
x, y, groups, params = super()._prepare_to_fit(x, y)
|
@@ -595,36 +594,9 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
595
594
|
self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
|
596
595
|
|
597
596
|
# Find rest categorical features
|
598
|
-
self.cat_features
|
599
|
-
|
600
|
-
|
601
|
-
for name in self.cat_features:
|
602
|
-
# Remove constant categorical features
|
603
|
-
if x[name].nunique() > 1:
|
604
|
-
unique_cat_features.append(name)
|
605
|
-
else:
|
606
|
-
self.logger.info(f"Drop column {name} on preparing data for fit")
|
607
|
-
x = x.drop(columns=name)
|
608
|
-
self.exclude_features.append(name)
|
609
|
-
self.cat_features = unique_cat_features
|
610
|
-
if (
|
611
|
-
hasattr(self.estimator, "get_param")
|
612
|
-
and hasattr(self.estimator, "_init_params")
|
613
|
-
and self.estimator.get_param("cat_features") is not None
|
614
|
-
):
|
615
|
-
estimator_cat_features = self.estimator.get_param("cat_features")
|
616
|
-
if all([isinstance(c, int) for c in estimator_cat_features]):
|
617
|
-
cat_features_idx = {x.columns.get_loc(c) for c in self.cat_features}
|
618
|
-
cat_features_idx.update(estimator_cat_features)
|
619
|
-
self.cat_features = [x.columns[idx] for idx in cat_features_idx]
|
620
|
-
elif all([isinstance(c, str) for c in estimator_cat_features]):
|
621
|
-
self.cat_features = list(set(self.cat_features + estimator_cat_features))
|
622
|
-
else:
|
623
|
-
print(f"WARNING: Unsupported type of cat_features in CatBoost estimator: {estimator_cat_features}")
|
624
|
-
|
625
|
-
del self.estimator._init_params["cat_features"]
|
626
|
-
|
627
|
-
self.logger.info(f"Selected categorical features: {self.cat_features}")
|
597
|
+
self.cat_features, self.features_to_encode, self.exclude_features = _get_cat_features(
|
598
|
+
self.logger, x, self.cat_features, self.text_features, self.grouped_embedding_features
|
599
|
+
)
|
628
600
|
params["cat_features"] = self.cat_features
|
629
601
|
|
630
602
|
return x, y, groups, params
|
@@ -655,7 +627,6 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
655
627
|
x, emb_columns = self.group_embeddings(x)
|
656
628
|
params["embedding_features"] = emb_columns
|
657
629
|
if self.cat_features:
|
658
|
-
# x = fill_na_cat_features(x, self.cat_features)
|
659
630
|
params["cat_features"] = self.cat_features
|
660
631
|
|
661
632
|
return x, y, params
|
@@ -725,6 +696,7 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
725
696
|
self,
|
726
697
|
estimator,
|
727
698
|
scorer: Callable,
|
699
|
+
cat_features: Optional[List[str]],
|
728
700
|
metric_name: str,
|
729
701
|
multiplier: int,
|
730
702
|
cv: BaseCrossValidator,
|
@@ -736,6 +708,7 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
736
708
|
super(LightGBMWrapper, self).__init__(
|
737
709
|
estimator,
|
738
710
|
scorer,
|
711
|
+
cat_features,
|
739
712
|
metric_name,
|
740
713
|
multiplier,
|
741
714
|
cv,
|
@@ -744,9 +717,10 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
744
717
|
text_features=text_features,
|
745
718
|
logger=logger,
|
746
719
|
)
|
747
|
-
self.cat_features = None
|
748
720
|
self.cat_encoder = None
|
749
721
|
self.n_classes = None
|
722
|
+
self.exclude_features = []
|
723
|
+
self.features_to_encode = []
|
750
724
|
|
751
725
|
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
|
752
726
|
x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
|
@@ -756,30 +730,25 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
756
730
|
if self.target_type == ModelTaskType.BINARY:
|
757
731
|
params["eval_metric"] = "auc"
|
758
732
|
params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
|
759
|
-
self.cat_features = _get_cat_features(
|
760
|
-
|
761
|
-
|
762
|
-
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
x[self.cat_features] = encoded
|
733
|
+
self.cat_features, self.features_to_encode, self.exclude_features = _get_cat_features(
|
734
|
+
self.logger, x, self.cat_features
|
735
|
+
)
|
736
|
+
if self.features_to_encode:
|
737
|
+
encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, return_df=True)
|
738
|
+
encoded = encoder.fit_transform(x[self.features_to_encode].astype("object"), y_numpy).astype("category")
|
739
|
+
x[self.features_to_encode] = encoded
|
767
740
|
self.cat_encoder = encoder
|
768
|
-
if not is_numeric_dtype(y_numpy):
|
769
|
-
y_numpy = correct_string_target(y_numpy)
|
770
741
|
|
771
742
|
return x, y_numpy, groups, params
|
772
743
|
|
773
744
|
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
745
|
+
if self.exclude_features:
|
746
|
+
x = x.drop(columns=self.exclude_features)
|
774
747
|
x, y_numpy, params = super()._prepare_to_calculate(x, y)
|
775
|
-
if self.
|
776
|
-
|
777
|
-
|
778
|
-
|
779
|
-
self.cat_encoder.transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
|
780
|
-
)
|
781
|
-
if not is_numeric_dtype(y):
|
782
|
-
y_numpy = correct_string_target(y_numpy)
|
748
|
+
if self.features_to_encode is not None and self.cat_encoder is not None:
|
749
|
+
x[self.features_to_encode] = self.cat_encoder.transform(x[self.features_to_encode].astype("object")).astype(
|
750
|
+
"category"
|
751
|
+
)
|
783
752
|
return x, y_numpy, params
|
784
753
|
|
785
754
|
def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
|
@@ -805,20 +774,6 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
805
774
|
for i, col in enumerate(x.columns):
|
806
775
|
feature_importance[col] = np.mean(np.abs(shap_matrix[:, i]))
|
807
776
|
|
808
|
-
# # exclude last column (base value)
|
809
|
-
# shap_values_only = shap_values[:, :-1]
|
810
|
-
# mean_abs_shap = np.mean(np.abs(shap_values_only), axis=0)
|
811
|
-
|
812
|
-
# # For classification, shap_values is returned as a list for each class
|
813
|
-
# # Take values for the positive class
|
814
|
-
# if isinstance(shap_values, list):
|
815
|
-
# shap_values = shap_values[1]
|
816
|
-
|
817
|
-
# # Calculate mean absolute SHAP value for each feature
|
818
|
-
# feature_importance = {}
|
819
|
-
# for i, col in enumerate(x.columns):
|
820
|
-
# feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
|
821
|
-
|
822
777
|
return feature_importance
|
823
778
|
|
824
779
|
except Exception as e:
|
@@ -831,6 +786,7 @@ class OtherEstimatorWrapper(EstimatorWrapper):
|
|
831
786
|
self,
|
832
787
|
estimator,
|
833
788
|
scorer: Callable,
|
789
|
+
cat_features: Optional[List[str]],
|
834
790
|
metric_name: str,
|
835
791
|
multiplier: int,
|
836
792
|
cv: BaseCrossValidator,
|
@@ -842,6 +798,7 @@ class OtherEstimatorWrapper(EstimatorWrapper):
|
|
842
798
|
super(OtherEstimatorWrapper, self).__init__(
|
843
799
|
estimator,
|
844
800
|
scorer,
|
801
|
+
cat_features,
|
845
802
|
metric_name,
|
846
803
|
multiplier,
|
847
804
|
cv,
|
@@ -850,32 +807,32 @@ class OtherEstimatorWrapper(EstimatorWrapper):
|
|
850
807
|
text_features=text_features,
|
851
808
|
logger=logger,
|
852
809
|
)
|
853
|
-
self.cat_features = None
|
854
810
|
|
855
811
|
def _prepare_to_fit(self, x: pd.DataFrame, y: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
856
|
-
x,
|
857
|
-
self.cat_features = _get_cat_features(
|
812
|
+
x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
|
813
|
+
self.cat_features, self.features_to_encode, self.exclude_features = _get_cat_features(
|
814
|
+
self.logger, x, self.cat_features
|
815
|
+
)
|
858
816
|
num_features = [col for col in x.columns if col not in self.cat_features]
|
859
817
|
x[num_features] = x[num_features].fillna(-999)
|
860
|
-
|
861
|
-
|
862
|
-
|
863
|
-
x[
|
864
|
-
|
865
|
-
|
866
|
-
return x, y, groups, params
|
818
|
+
if self.cat_features:
|
819
|
+
encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, return_df=True)
|
820
|
+
encoded = encoder.fit_transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
|
821
|
+
x[self.cat_features] = encoded
|
822
|
+
self.cat_encoder = encoder
|
823
|
+
return x, y_numpy, groups, params
|
867
824
|
|
868
825
|
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
826
|
+
if self.exclude_features:
|
827
|
+
x = x.drop(columns=self.exclude_features)
|
869
828
|
x, y, params = super()._prepare_to_calculate(x, y)
|
870
829
|
if self.cat_features is not None:
|
871
830
|
num_features = [col for col in x.columns if col not in self.cat_features]
|
872
831
|
x[num_features] = x[num_features].fillna(-999)
|
873
|
-
|
874
|
-
|
875
|
-
|
876
|
-
|
877
|
-
if not is_numeric_dtype(y):
|
878
|
-
y = correct_string_target(y)
|
832
|
+
if self.features_to_encode and self.cat_encoder is not None:
|
833
|
+
x[self.features_to_encode] = self.cat_encoder.transform(x[self.features_to_encode].astype("object")).astype(
|
834
|
+
"category"
|
835
|
+
)
|
879
836
|
return x, y, params
|
880
837
|
|
881
838
|
|
@@ -938,7 +895,7 @@ def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
|
|
938
895
|
return scoring, metric_name, multiplier
|
939
896
|
|
940
897
|
|
941
|
-
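def _get_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None]) -> Tuple[Callable, str, int]: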
|
898
|
+
def define_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None]) -> Tuple[Callable, str, int]:
|
942
899
|
if scoring is None:
|
943
900
|
if target_type == ModelTaskType.BINARY:
|
944
901
|
scoring = "roc_auc"
|
@@ -957,16 +914,42 @@ def _get_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None])
|
|
957
914
|
else:
|
958
915
|
metric_name = str(scoring)
|
959
916
|
|
917
|
+
metric_name = "GINI" if metric_name.upper() == "ROC_AUC" and target_type == ModelTaskType.BINARY else metric_name
|
918
|
+
|
960
919
|
return scoring, metric_name, multiplier
|
961
920
|
|
962
921
|
|
963
922
|
def _get_cat_features(
|
964
|
-
|
923
|
+
logger: logging.Logger,
|
924
|
+
x: pd.DataFrame,
|
925
|
+
cat_features: Optional[List[str]],
|
926
|
+
text_features: Optional[List[str]] = None,
|
927
|
+
emb_features: Optional[List[str]] = None,
|
965
928
|
) -> List[str]:
|
929
|
+
cat_features = cat_features or []
|
966
930
|
text_features = text_features or []
|
967
931
|
emb_features = emb_features or []
|
968
932
|
exclude_features = text_features + emb_features
|
969
|
-
|
933
|
+
cat_features = [c for c in cat_features if c not in exclude_features]
|
934
|
+
unique_cat_features = []
|
935
|
+
drop_cat_features = []
|
936
|
+
for name in cat_features:
|
937
|
+
# Remove constant categorical features
|
938
|
+
if x[name].nunique() > 1:
|
939
|
+
unique_cat_features.append(name)
|
940
|
+
else:
|
941
|
+
logger.info(f"Drop column {name} on preparing data for fit")
|
942
|
+
x = x.drop(columns=name)
|
943
|
+
drop_cat_features.append(name)
|
944
|
+
cat_features = unique_cat_features
|
945
|
+
|
946
|
+
logger.info(f"Selected categorical features: {cat_features}")
|
947
|
+
|
948
|
+
features_to_encode = list(set(x.select_dtypes(exclude=[np.number, np.datetime64, pd.CategoricalDtype]).columns))
|
949
|
+
|
950
|
+
logger.info(f"Features to encode: {features_to_encode}")
|
951
|
+
|
952
|
+
return cat_features, features_to_encode, drop_cat_features
|
970
953
|
|
971
954
|
|
972
955
|
def _get_add_params(input_params, add_params):
|
upgini/utils/target_utils.py
CHANGED
@@ -3,7 +3,7 @@ from typing import Callable, List, Optional, Union
|
|
3
3
|
|
4
4
|
import numpy as np
|
5
5
|
import pandas as pd
|
6
|
-
from pandas.api.types import
|
6
|
+
from pandas.api.types import is_bool_dtype, is_datetime64_any_dtype, is_numeric_dtype
|
7
7
|
|
8
8
|
from upgini.errors import ValidationError
|
9
9
|
from upgini.metadata import SYSTEM_RECORD_ID, CVType, ModelTaskType
|
@@ -14,11 +14,14 @@ from upgini.utils.ts_utils import get_most_frequent_time_unit, trunc_datetime
|
|
14
14
|
TS_MIN_DIFFERENT_IDS_RATIO = 0.2
|
15
15
|
|
16
16
|
|
17
|
-
def
|
18
|
-
if
|
19
|
-
|
20
|
-
|
21
|
-
|
17
|
+
def prepare_target(y: Union[pd.Series, np.ndarray], target_type: ModelTaskType) -> Union[pd.Series, np.ndarray]:
|
18
|
+
if target_type != ModelTaskType.REGRESSION or (not is_numeric_dtype(y) and not is_datetime64_any_dtype(y)):
|
19
|
+
if isinstance(y, pd.Series):
|
20
|
+
y = y.astype(str).astype("category").cat.codes
|
21
|
+
elif isinstance(y, np.ndarray):
|
22
|
+
y = pd.Series(y).astype(str).astype("category").cat.codes.values
|
23
|
+
|
24
|
+
return y
|
22
25
|
|
23
26
|
|
24
27
|
def define_task(
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: upgini
|
3
|
-
Version: 1.2.80
|
3
|
+
Version: 1.2.81a3832.dev1
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
@@ -22,6 +22,7 @@ Classifier: Programming Language :: Python :: 3.11
|
|
22
22
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
23
23
|
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
24
24
|
Requires-Python: <3.12,>=3.10
|
25
|
+
Requires-Dist: category-encoders>=2.8.1
|
25
26
|
Requires-Dist: fastparquet>=0.8.1
|
26
27
|
Requires-Dist: ipywidgets>=8.1.0
|
27
28
|
Requires-Dist: jarowinkler>=2.0.0
|
@@ -1,12 +1,12 @@
|
|
1
|
-
upgini/__about__.py,sha256
|
1
|
+
upgini/__about__.py,sha256=-WSXUS5Ith33qArTnDO4LmrI0wUaXbJ8bIzoMZvAsWU,33
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
4
|
upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
6
|
+
upgini/features_enricher.py,sha256=qtrQJwF2QbKdQ8Tqk5RQj3aAqOzDgygD6nIHrco3AzE,209728
|
7
7
|
upgini/http.py,sha256=UH7nswcZ221un3O_VW9limCBO5oRsyg1eKUHiVslRPs,43737
|
8
8
|
upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
|
9
|
-
upgini/metrics.py,sha256=
|
9
|
+
upgini/metrics.py,sha256=95sK1Kr3dYxqQcdkkoNFDe9OZY7OhgLjYwe3bhMQd38,38087
|
10
10
|
upgini/search_task.py,sha256=RcvAE785yksWTsTNWuZFVNlk32jHElMoEna1T_C5N8Q,17823
|
11
11
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
12
12
|
upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
|
@@ -66,11 +66,11 @@ upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml
|
|
66
66
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
67
67
|
upgini/utils/sklearn_ext.py,sha256=HpaNQaKJisgNE7IZ71n7uswxTj7kbPglU2G3s1sORAc,45042
|
68
68
|
upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
|
69
|
-
upgini/utils/target_utils.py,sha256=
|
69
|
+
upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,16832
|
70
70
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
71
71
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
72
72
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
73
|
-
upgini-1.2.
|
74
|
-
upgini-1.2.
|
75
|
-
upgini-1.2.
|
76
|
-
upgini-1.2.
|
73
|
+
upgini-1.2.81a3832.dev1.dist-info/METADATA,sha256=ShIRi8EeeujsKBJ0byR2XWJ6DKFka2vrViq9d5VwjzU,49141
|
74
|
+
upgini-1.2.81a3832.dev1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
75
|
+
upgini-1.2.81a3832.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
76
|
+
upgini-1.2.81a3832.dev1.dist-info/RECORD,,
|
File without changes
|
File without changes
|