upgini 1.2.79a1__py3-none-any.whl → 1.2.81a3832.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
- __version__ = "1.2.79a1"
+ __version__ = "1.2.81a3832.dev1"
upgini/features_enricher.py CHANGED
@@ -63,7 +63,7 @@ from upgini.metadata import (
  RuntimeParameters,
  SearchKey,
  )
- from upgini.metrics import EstimatorWrapper, validate_scoring_argument
+ from upgini.metrics import EstimatorWrapper, define_scorer, validate_scoring_argument
  from upgini.normalizer.normalize_utils import Normalizer
  from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
  from upgini.search_task import SearchTask
@@ -957,7 +957,7 @@ class FeaturesEnricher(TransformerMixin):
  self.__display_support_link(msg)
  return None

- cat_features, search_keys_for_metrics = self._get_client_cat_features(
+ client_cat_features, search_keys_for_metrics = self._get_client_cat_features(
  estimator, validated_X, self.search_keys
  )
  search_keys_for_metrics.extend([c for c in self.id_columns or [] if c not in search_keys_for_metrics])
@@ -976,7 +976,7 @@ class FeaturesEnricher(TransformerMixin):
  search_keys_for_metrics=search_keys_for_metrics,
  progress_bar=progress_bar,
  progress_callback=progress_callback,
- cat_features=cat_features,
+ cat_features=client_cat_features,
  )
  if prepared_data is None:
  return None
@@ -994,11 +994,19 @@ class FeaturesEnricher(TransformerMixin):
  ) = prepared_data

  # rename cat_features
- if cat_features:
+ if client_cat_features:
  for new_c, old_c in columns_renaming.items():
- if old_c in cat_features:
- cat_features.remove(old_c)
- cat_features.append(new_c)
+ if old_c in client_cat_features:
+ client_cat_features.remove(old_c)
+ client_cat_features.append(new_c)
+ for cat_feature in client_cat_features:
+ if cat_feature not in fitting_X.columns:
+ self.logger.error(
+ f"Client cat_feature `{cat_feature}` not found in"
+ f" x columns: {fitting_X.columns.to_list()}"
+ )
+ else:
+ client_cat_features = []

  gc.collect()

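
Note on the hunk above: from the loop's unpacking, columns_renaming maps internal (renamed) column names to the original client names, so client-supplied cat_features are translated to the internal names before metrics are computed. A toy, self-contained illustration (the column names are invented):

    client_cat_features = ["country"]
    columns_renaming = {"country_a1b2": "country"}  # internal name -> original client name
    for new_c, old_c in columns_renaming.items():
        if old_c in client_cat_features:
            client_cat_features.remove(old_c)
            client_cat_features.append(new_c)
    # client_cat_features is now ["country_a1b2"]; names missing from
    # fitting_X.columns are then logged as errors rather than failing silently.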
@@ -1019,20 +1027,17 @@ class FeaturesEnricher(TransformerMixin):

  has_date = self._get_date_column(search_keys) is not None
  model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
+ cat_features_from_backend = self.__get_categorical_features()
+ cat_features = list(set(client_cat_features + cat_features_from_backend))
+ baseline_cat_features = [f for f in cat_features if f in fitting_X.columns]
+ enriched_cat_features = [f for f in cat_features if f in fitting_enriched_X.columns]
+ if len(enriched_cat_features) < len(cat_features):
+ missing_cat_features = [f for f in cat_features if f not in fitting_enriched_X.columns]
+ self.logger.warning(
+ f"Some cat_features were not found in enriched_X: {missing_cat_features}"
+ )

- wrapper = EstimatorWrapper.create(
- estimator,
- self.logger,
- model_task_type,
- _cv,
- fitting_enriched_X,
- scoring,
- groups=groups,
- text_features=text_features,
- has_date=has_date,
- )
- metric = wrapper.metric_name
- multiplier = wrapper.multiplier
+ _, metric, multiplier = define_scorer(model_task_type, scoring)

  # 1 If client features are presented - fit and predict with KFold estimator
  # on etalon features and calculate baseline metric
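
Note on the hunk above: the metric name and multiplier are now resolved directly from the scoring argument instead of constructing a full EstimatorWrapper against fitting_enriched_X. A minimal sketch of the define_scorer contract, assuming a binary task with default scoring:

    from upgini.metadata import ModelTaskType
    from upgini.metrics import define_scorer

    # Returns (scorer_callable, metric_name, multiplier).
    _, metric, multiplier = define_scorer(ModelTaskType.BINARY, None)
    # With scoring=None a binary task defaults to roc_auc, and metric is
    # reported as "GINI"; that renaming moved into define_scorer itself
    # (see the metrics.py hunk further down).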
@@ -1050,9 +1055,8 @@ class FeaturesEnricher(TransformerMixin):
  self.logger,
  model_task_type,
  _cv,
- fitting_enriched_X,
- scoring,
- cat_features,
+ scoring=scoring,
+ cat_features=baseline_cat_features,
  add_params=custom_loss_add_params,
  groups=groups,
  text_features=text_features,
@@ -1085,9 +1089,8 @@ class FeaturesEnricher(TransformerMixin):
  self.logger,
  model_task_type,
  _cv,
- fitting_enriched_X,
- scoring,
- cat_features,
+ scoring=scoring,
+ cat_features=enriched_cat_features,
  add_params=custom_loss_add_params,
  groups=groups,
  text_features=text_features,
@@ -1119,7 +1122,7 @@ class FeaturesEnricher(TransformerMixin):
  self.bundle.get("quality_metrics_rows_header"): _num_samples(fitting_X),
  }
  if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
- validated_y
+ y_sorted
  ):
  train_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
  # np.mean(validated_y), 4
@@ -1197,7 +1200,7 @@ class FeaturesEnricher(TransformerMixin):
  # self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
  }
  if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
- validated_eval_set[idx][1]
+ eval_y_sorted
  ):
  eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
  # np.mean(validated_eval_set[idx][1]), 4
@@ -1428,12 +1431,20 @@ class FeaturesEnricher(TransformerMixin):
  if (
  estimator is not None
  and hasattr(estimator, "get_param")
+ and hasattr(estimator, "_init_params")
  and estimator.get_param("cat_features") is not None
  ):
- cat_features = estimator.get_param("cat_features")
- if len(cat_features) > 0:
- if all([isinstance(f, int) for f in cat_features]):
- cat_features = [X.columns[i] for i in cat_features]
+ estimator_cat_features = estimator.get_param("cat_features")
+ if all([isinstance(c, int) for c in estimator_cat_features]):
+ cat_features = [X.columns[idx] for idx in estimator_cat_features]
+ elif all([isinstance(c, str) for c in estimator_cat_features]):
+ cat_features = estimator_cat_features
+ else:
+ print(f"WARNING: Unsupported type of cat_features in CatBoost estimator: {estimator_cat_features}")
+
+ del estimator._init_params["cat_features"]
+
+ if cat_features:
  self.logger.info(f"Collected categorical features {cat_features} from user estimator")
  for cat_feature in cat_features:
  if cat_feature in search_keys:
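
Note on the hunk above: integer cat_features from a user CatBoost estimator are normalized to column names, string lists are passed through, and anything else only triggers the warning. A toy illustration with invented data:

    import pandas as pd

    X = pd.DataFrame({"country": ["US"], "age": [30], "city": ["NY"]})
    estimator_cat_features = [0, 2]  # what get_param("cat_features") may return
    cat_features = [X.columns[idx] for idx in estimator_cat_features]
    # cat_features == ["country", "city"]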
@@ -3855,6 +3866,13 @@ if response.status_code == 200:

  return importances

+ def __get_categorical_features(self) -> List[str]:
+ features_meta = self._search_task.get_all_features_metadata_v2()
+ if features_meta is None:
+ raise Exception(self.bundle.get("missing_features_meta"))
+
+ return [f.name for f in features_meta if f.type == "categorical"]
+
  def __prepare_feature_importances(
  self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
  ):
@@ -3886,9 +3904,10 @@ if response.status_code == 200:
  if updated_shaps is not None:
  updating_shap = updated_shaps.get(feature_meta.name)
  if updating_shap is None:
- self.logger.warning(
- f"WARNING: Shap value for feature {feature_meta.name} not found and will be set to 0.0"
- )
+ if feature_meta.shap_value != 0.0:
+ self.logger.warning(
+ f"WARNING: Shap value for feature {feature_meta.name} not found and will be set to 0.0"
+ )
  updating_shap = 0.0
  feature_meta.shap_value = updating_shap

upgini/metrics.py CHANGED
@@ -11,15 +11,15 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
  import lightgbm as lgb
  import numpy as np
  import pandas as pd
+ from category_encoders.cat_boost import CatBoostEncoder
  from lightgbm import LGBMClassifier, LGBMRegressor
  from numpy import log1p
  from pandas.api.types import is_numeric_dtype
  from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
- from sklearn.preprocessing import OrdinalEncoder

+ from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
  from upgini.utils.features_validator import FeaturesValidator
  from upgini.utils.sklearn_ext import cross_validate
- from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit

  try:
  from sklearn.metrics import get_scorer_names
@@ -36,7 +36,7 @@ from sklearn.model_selection import BaseCrossValidator, TimeSeriesSplit
  from upgini.errors import ValidationError
  from upgini.metadata import ModelTaskType
  from upgini.resource_bundle import bundle
- from upgini.utils.target_utils import correct_string_target
+ from upgini.utils.target_utils import prepare_target

  DEFAULT_RANDOM_STATE = 42

@@ -99,8 +99,7 @@ LIGHTGBM_REGRESSION_PARAMS = {
  "min_sum_hessian_in_leaf": 0.01,
  "objective": "huber",
  "deterministic": "true",
- "force_col_wise": "true",
- "force_row_wise": "true",
+ # "force_col_wise": "true",
  "verbosity": -1,
  }

@@ -120,8 +119,7 @@ LIGHTGBM_MULTICLASS_PARAMS = {
  "num_grad_quant_bins": "8",
  "stochastic_rounding": "true",
  "deterministic": "true",
- "force_col_wise": "true",
- "force_row_wise": "true",
+ # "force_col_wise": "true",
  "verbosity": -1,
  }

@@ -138,8 +136,7 @@ LIGHTGBM_BINARY_PARAMS = {
  "cat_smooth": 18,
  "cat_l2": 8,
  "deterministic": "true",
- "force_col_wise": "true",
- "force_row_wise": "true",
+ # "force_col_wise": "true",
  "verbosity": -1,
  }

@@ -148,33 +145,33 @@ LIGHTGBM_EARLY_STOPPING_ROUNDS = 20
  N_FOLDS = 5
  BLOCKED_TS_TEST_SIZE = 0.2

- NA_VALUES = [
- "",
- " ",
- "  ",
- "#n/a",
- "#n/a n/a",
- "#na",
- "-1.#ind",
- "-1.#qnan",
- "-nan",
- "1.#ind",
- "1.#qnan",
- "n/a",
- "na",
- "null",
- "nan",
- "n/a",
- "nan",
- "none",
- "-",
- "undefined",
- "[[unknown]]",
- "[not provided]",
- "[unknown]",
- ]
-
- NA_REPLACEMENT = "NA"
+ # NA_VALUES = [
+ # "",
+ # " ",
+ # "  ",
+ # "#n/a",
+ # "#n/a n/a",
+ # "#na",
+ # "-1.#ind",
+ # "-1.#qnan",
+ # "-nan",
+ # "1.#ind",
+ # "1.#qnan",
+ # "n/a",
+ # "na",
+ # "null",
+ # "nan",
+ # "n/a",
+ # "nan",
+ # "none",
+ # "-",
+ # "undefined",
+ # "[[unknown]]",
+ # "[not provided]",
+ # "[unknown]",
+ # ]
+
+ # NA_REPLACEMENT = "NA"

  SUPPORTED_CATBOOST_METRICS = {
  s.upper(): s
@@ -290,6 +287,7 @@ class EstimatorWrapper:
  self,
  estimator,
  scorer: Callable,
+ cat_features: Optional[List[str]],
  metric_name: str,
  multiplier: int,
  cv: BaseCrossValidator,
@@ -301,9 +299,8 @@ class EstimatorWrapper:
  ):
  self.estimator = estimator
  self.scorer = scorer
- self.metric_name = (
- "GINI" if metric_name.upper() == "ROC_AUC" and target_type == ModelTaskType.BINARY else metric_name
- )
+ self.cat_features = cat_features
+ self.metric_name = metric_name
  self.multiplier = multiplier
  self.cv = cv
  self.target_type = target_type
@@ -348,6 +345,8 @@ class EstimatorWrapper:
  else:
  x, y = self._remove_empty_target_rows(x, y)

+ y = prepare_target(y, self.target_type)
+
  self.logger.info(f"After preparing data columns: {x.columns.to_list()}")
  return x, y, groups

@@ -468,7 +467,7 @@ class EstimatorWrapper:
  logger: logging.Logger,
  target_type: ModelTaskType,
  cv: BaseCrossValidator,
- x: pd.DataFrame,
+ *,
  scoring: Union[Callable, str, None] = None,
  cat_features: Optional[List[str]] = None,
  text_features: Optional[List[str]] = None,
@@ -476,9 +475,10 @@ class EstimatorWrapper:
  groups: Optional[List[str]] = None,
  has_date: Optional[bool] = None,
  ) -> EstimatorWrapper:
- scorer, metric_name, multiplier = _get_scorer(target_type, scoring)
+ scorer, metric_name, multiplier = define_scorer(target_type, scoring)
  kwargs = {
  "scorer": scorer,
+ "cat_features": cat_features,
  "metric_name": metric_name,
  "multiplier": multiplier,
  "cv": cv,
@@ -512,11 +512,6 @@ class EstimatorWrapper:
  kwargs["estimator"] = estimator_copy
  if is_catboost_estimator(estimator):
  if cat_features is not None:
- for cat_feature in cat_features:
- if cat_feature not in x.columns:
- logger.error(
- f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
- )
  estimator_copy.set_params(cat_features=cat_features, has_time=has_date)
  estimator = CatBoostWrapper(**kwargs)
  else:
@@ -539,6 +534,7 @@ class CatBoostWrapper(EstimatorWrapper):
  self,
  estimator,
  scorer: Callable,
+ cat_features: Optional[List[str]],
  metric_name: str,
  multiplier: int,
  cv: BaseCrossValidator,
@@ -550,6 +546,7 @@ class CatBoostWrapper(EstimatorWrapper):
  super(CatBoostWrapper, self).__init__(
  estimator,
  scorer,
+ cat_features,
  metric_name,
  multiplier,
  cv,
@@ -558,10 +555,9 @@ class CatBoostWrapper(EstimatorWrapper):
  text_features=text_features,
  logger=logger,
  )
- self.cat_features = None
  self.emb_features = None
  self.grouped_embedding_features = None
- self.exclude_features = []
+ self.drop_cat_features = []

  def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
  x, y, groups, params = super()._prepare_to_fit(x, y)
@@ -598,36 +594,9 @@ class CatBoostWrapper(EstimatorWrapper):
  self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")

  # Find rest categorical features
- self.cat_features = _get_cat_features(x, self.text_features, self.grouped_embedding_features)
- # x = fill_na_cat_features(x, self.cat_features)
- unique_cat_features = []
- for name in self.cat_features:
- # Remove constant categorical features
- if x[name].nunique() > 1:
- unique_cat_features.append(name)
- else:
- self.logger.info(f"Drop column {name} on preparing data for fit")
- x = x.drop(columns=name)
- self.exclude_features.append(name)
- self.cat_features = unique_cat_features
- if (
- hasattr(self.estimator, "get_param")
- and hasattr(self.estimator, "_init_params")
- and self.estimator.get_param("cat_features") is not None
- ):
- estimator_cat_features = self.estimator.get_param("cat_features")
- if all([isinstance(c, int) for c in estimator_cat_features]):
- cat_features_idx = {x.columns.get_loc(c) for c in self.cat_features}
- cat_features_idx.update(estimator_cat_features)
- self.cat_features = [x.columns[idx] for idx in cat_features_idx]
- elif all([isinstance(c, str) for c in estimator_cat_features]):
- self.cat_features = list(set(self.cat_features + estimator_cat_features))
- else:
- print(f"WARNING: Unsupported type of cat_features in CatBoost estimator: {estimator_cat_features}")
-
- del self.estimator._init_params["cat_features"]
-
- self.logger.info(f"Selected categorical features: {self.cat_features}")
+ self.cat_features, self.features_to_encode, self.exclude_features = _get_cat_features(
+ self.logger, x, self.cat_features, self.text_features, self.grouped_embedding_features
+ )
  params["cat_features"] = self.cat_features

  return x, y, groups, params
@@ -658,7 +627,6 @@ class CatBoostWrapper(EstimatorWrapper):
  x, emb_columns = self.group_embeddings(x)
  params["embedding_features"] = emb_columns
  if self.cat_features:
- # x = fill_na_cat_features(x, self.cat_features)
  params["cat_features"] = self.cat_features

  return x, y, params
@@ -728,6 +696,7 @@ class LightGBMWrapper(EstimatorWrapper):
  self,
  estimator,
  scorer: Callable,
+ cat_features: Optional[List[str]],
  metric_name: str,
  multiplier: int,
  cv: BaseCrossValidator,
@@ -739,6 +708,7 @@ class LightGBMWrapper(EstimatorWrapper):
  super(LightGBMWrapper, self).__init__(
  estimator,
  scorer,
+ cat_features,
  metric_name,
  multiplier,
  cv,
@@ -747,9 +717,10 @@ class LightGBMWrapper(EstimatorWrapper):
  text_features=text_features,
  logger=logger,
  )
- self.cat_features = None
  self.cat_encoder = None
  self.n_classes = None
+ self.exclude_features = []
+ self.features_to_encode = []

  def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
  x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
@@ -759,30 +730,25 @@ class LightGBMWrapper(EstimatorWrapper):
  if self.target_type == ModelTaskType.BINARY:
  params["eval_metric"] = "auc"
  params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
- self.cat_features = _get_cat_features(x)
- if self.cat_features:
- x = fill_na_cat_features(x, self.cat_features)
- encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
- encoded = pd.DataFrame(
- encoder.fit_transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
- )
- x[self.cat_features] = encoded
+ self.cat_features, self.features_to_encode, self.exclude_features = _get_cat_features(
+ self.logger, x, self.cat_features
+ )
+ if self.features_to_encode:
+ encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, return_df=True)
+ encoded = encoder.fit_transform(x[self.features_to_encode].astype("object"), y_numpy).astype("category")
+ x[self.features_to_encode] = encoded
  self.cat_encoder = encoder
- if not is_numeric_dtype(y_numpy):
- y_numpy = correct_string_target(y_numpy)

  return x, y_numpy, groups, params

  def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
+ if self.exclude_features:
+ x = x.drop(columns=self.exclude_features)
  x, y_numpy, params = super()._prepare_to_calculate(x, y)
- if self.cat_features is not None:
- x = fill_na_cat_features(x, self.cat_features)
- if self.cat_encoder is not None:
- x[self.cat_features] = pd.DataFrame(
- self.cat_encoder.transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
- )
- if not is_numeric_dtype(y):
- y_numpy = correct_string_target(y_numpy)
+ if self.features_to_encode is not None and self.cat_encoder is not None:
+ x[self.features_to_encode] = self.cat_encoder.transform(x[self.features_to_encode].astype("object")).astype(
+ "category"
+ )
  return x, y_numpy, params

  def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
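
Note on the hunks above: the per-column OrdinalEncoder is replaced with a target-based CatBoostEncoder from the new category-encoders dependency, and NA handling via fill_na_cat_features is dropped. A minimal standalone sketch with invented data, mirroring the calls in the hunk:

    import pandas as pd
    from category_encoders.cat_boost import CatBoostEncoder

    x = pd.DataFrame({"city": ["a", "b", "a", "c"]})
    y = [1, 0, 1, 0]

    encoder = CatBoostEncoder(random_state=42, return_df=True)
    # Unlike OrdinalEncoder, fitting requires the target:
    x[["city"]] = encoder.fit_transform(x[["city"]].astype("object"), y).astype("category")
    # At evaluation time only transform() is applied, reusing fitted statistics:
    # x_eval[["city"]] = encoder.transform(x_eval[["city"]].astype("object")).astype("category")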
@@ -808,20 +774,6 @@ class LightGBMWrapper(EstimatorWrapper):
  for i, col in enumerate(x.columns):
  feature_importance[col] = np.mean(np.abs(shap_matrix[:, i]))

- # # exclude last column (base value)
- # shap_values_only = shap_values[:, :-1]
- # mean_abs_shap = np.mean(np.abs(shap_values_only), axis=0)
-
- # # For classification, shap_values is returned as a list for each class
- # # Take values for the positive class
- # if isinstance(shap_values, list):
- # shap_values = shap_values[1]
-
- # # Calculate mean absolute SHAP value for each feature
- # feature_importance = {}
- # for i, col in enumerate(x.columns):
- # feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
-
  return feature_importance

  except Exception as e:
@@ -834,6 +786,7 @@ class OtherEstimatorWrapper(EstimatorWrapper):
  self,
  estimator,
  scorer: Callable,
+ cat_features: Optional[List[str]],
  metric_name: str,
  multiplier: int,
  cv: BaseCrossValidator,
@@ -845,6 +798,7 @@ class OtherEstimatorWrapper(EstimatorWrapper):
  super(OtherEstimatorWrapper, self).__init__(
  estimator,
  scorer,
+ cat_features,
  metric_name,
  multiplier,
  cv,
@@ -853,32 +807,32 @@ class OtherEstimatorWrapper(EstimatorWrapper):
  text_features=text_features,
  logger=logger,
  )
- self.cat_features = None

  def _prepare_to_fit(self, x: pd.DataFrame, y: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
- x, y, groups, params = super()._prepare_to_fit(x, y)
- self.cat_features = _get_cat_features(x)
+ x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
+ self.cat_features, self.features_to_encode, self.exclude_features = _get_cat_features(
+ self.logger, x, self.cat_features
+ )
  num_features = [col for col in x.columns if col not in self.cat_features]
  x[num_features] = x[num_features].fillna(-999)
- x = fill_na_cat_features(x, self.cat_features)
- # TODO use one-hot encoding if cardinality is less 50
- for feature in self.cat_features:
- x[feature] = x[feature].astype("category").cat.codes
- if not is_numeric_dtype(y):
- y = correct_string_target(y)
- return x, y, groups, params
+ if self.cat_features:
+ encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, return_df=True)
+ encoded = encoder.fit_transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
+ x[self.cat_features] = encoded
+ self.cat_encoder = encoder
+ return x, y_numpy, groups, params

  def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
+ if self.exclude_features:
+ x = x.drop(columns=self.exclude_features)
  x, y, params = super()._prepare_to_calculate(x, y)
  if self.cat_features is not None:
  num_features = [col for col in x.columns if col not in self.cat_features]
  x[num_features] = x[num_features].fillna(-999)
- x = fill_na_cat_features(x, self.cat_features)
- # TODO use one-hot encoding if cardinality is less 50
- for feature in self.cat_features:
- x[feature] = x[feature].astype("category").cat.codes
- if not is_numeric_dtype(y):
- y = correct_string_target(y)
+ if self.features_to_encode and self.cat_encoder is not None:
+ x[self.features_to_encode] = self.cat_encoder.transform(x[self.features_to_encode].astype("object")).astype(
+ "category"
+ )
  return x, y, params


@@ -941,7 +895,7 @@ def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
  return scoring, metric_name, multiplier


- def _get_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None]) -> Tuple[Callable, str, int]:
+ def define_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None]) -> Tuple[Callable, str, int]:
  if scoring is None:
  if target_type == ModelTaskType.BINARY:
  scoring = "roc_auc"
@@ -960,16 +914,42 @@ def _get_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None])
  else:
  metric_name = str(scoring)

+ metric_name = "GINI" if metric_name.upper() == "ROC_AUC" and target_type == ModelTaskType.BINARY else metric_name
+
  return scoring, metric_name, multiplier


  def _get_cat_features(
- x: pd.DataFrame, text_features: Optional[List[str]] = None, emb_features: Optional[List[str]] = None
+ logger: logging.Logger,
+ x: pd.DataFrame,
+ cat_features: Optional[List[str]],
+ text_features: Optional[List[str]] = None,
+ emb_features: Optional[List[str]] = None,
  ) -> List[str]:
+ cat_features = cat_features or []
  text_features = text_features or []
  emb_features = emb_features or []
  exclude_features = text_features + emb_features
- return [c for c in x.columns if c not in exclude_features and not is_numeric_dtype(x[c])]
+ cat_features = [c for c in cat_features if c not in exclude_features]
+ unique_cat_features = []
+ drop_cat_features = []
+ for name in cat_features:
+ # Remove constant categorical features
+ if x[name].nunique() > 1:
+ unique_cat_features.append(name)
+ else:
+ logger.info(f"Drop column {name} on preparing data for fit")
+ x = x.drop(columns=name)
+ drop_cat_features.append(name)
+ cat_features = unique_cat_features
+
+ logger.info(f"Selected categorical features: {cat_features}")
+
+ features_to_encode = list(set(x.select_dtypes(exclude=[np.number, np.datetime64, pd.CategoricalDtype]).columns))
+
+ logger.info(f"Features to encode: {features_to_encode}")
+
+ return cat_features, features_to_encode, drop_cat_features


  def _get_add_params(input_params, add_params):
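
Note on the hunk above: _get_cat_features now receives the client cat_features list, drops constant columns, and separately reports which columns need encoding. A hedged illustration calling the module-private helper directly, with toy data:

    import logging

    import pandas as pd

    from upgini.metrics import _get_cat_features

    x = pd.DataFrame({
        "age": [1, 2],        # numeric: never encoded
        "city": ["a", "b"],   # object dtype: selected for encoding
        "const": ["k", "k"],  # constant: dropped from cat_features
    })
    cats, to_encode, dropped = _get_cat_features(logging.getLogger(), x, ["city", "const"])
    # cats == ["city"], to_encode == ["city"], dropped == ["const"].
    # The caller's x is not mutated (the drop happens on a local copy), which is
    # why the wrappers drop exclude_features themselves in _prepare_to_calculate.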
@@ -1059,10 +1039,10 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutp
  return mse if squared else np.sqrt(mse)


- def fill_na_cat_features(df: pd.DataFrame, cat_features: List[str]) -> pd.DataFrame:
- for c in cat_features:
- if c in df.columns:
- df[c] = df[c].astype("string").fillna(NA_REPLACEMENT).astype(str)
- na_filter = df[c].str.lower().isin(NA_VALUES)
- df.loc[na_filter, c] = NA_REPLACEMENT
- return df
+ # def fill_na_cat_features(df: pd.DataFrame, cat_features: List[str]) -> pd.DataFrame:
+ #     for c in cat_features:
+ #         if c in df.columns:
+ #             df[c] = df[c].astype("string").fillna(NA_REPLACEMENT).astype(str)
+ #             na_filter = df[c].str.lower().isin(NA_VALUES)
+ #             df.loc[na_filter, c] = NA_REPLACEMENT
+ #     return df
upgini/utils/target_utils.py CHANGED
@@ -3,7 +3,7 @@ from typing import Callable, List, Optional, Union

  import numpy as np
  import pandas as pd
- from pandas.api.types import is_numeric_dtype, is_bool_dtype
+ from pandas.api.types import is_bool_dtype, is_datetime64_any_dtype, is_numeric_dtype

  from upgini.errors import ValidationError
  from upgini.metadata import SYSTEM_RECORD_ID, CVType, ModelTaskType
@@ -14,11 +14,14 @@ from upgini.utils.ts_utils import get_most_frequent_time_unit, trunc_datetime
  TS_MIN_DIFFERENT_IDS_RATIO = 0.2


- def correct_string_target(y: Union[pd.Series, np.ndarray]) -> Union[pd.Series, np.ndarray]:
- if isinstance(y, pd.Series):
- return y.astype(str).astype("category").cat.codes
- elif isinstance(y, np.ndarray):
- return pd.Series(y).astype(str).astype("category").cat.codes.values
+ def prepare_target(y: Union[pd.Series, np.ndarray], target_type: ModelTaskType) -> Union[pd.Series, np.ndarray]:
+ if target_type != ModelTaskType.REGRESSION or (not is_numeric_dtype(y) and not is_datetime64_any_dtype(y)):
+ if isinstance(y, pd.Series):
+ y = y.astype(str).astype("category").cat.codes
+ elif isinstance(y, np.ndarray):
+ y = pd.Series(y).astype(str).astype("category").cat.codes.values
+
+ return y


  def define_task(
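
Note on the hunk above: prepare_target generalizes correct_string_target. Targets are re-coded to integer category codes for every non-regression task, and for regression only when the target is neither numeric nor datetime; numeric regression targets pass through unchanged. A toy sketch:

    import pandas as pd
    from upgini.metadata import ModelTaskType
    from upgini.utils.target_utils import prepare_target

    prepare_target(pd.Series(["no", "yes", "no"]), ModelTaskType.BINARY)
    # -> category codes 0, 1, 0
    prepare_target(pd.Series([1.5, 2.5]), ModelTaskType.REGRESSION)
    # -> returned unchanged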
{upgini-1.2.79a1.dist-info → upgini-1.2.81a3832.dev1.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: upgini
- Version: 1.2.79a1
+ Version: 1.2.81a3832.dev1
  Summary: Intelligent data search & enrichment for Machine Learning
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
  Project-URL: Homepage, https://upgini.com/
@@ -22,6 +22,7 @@ Classifier: Programming Language :: Python :: 3.11
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Classifier: Topic :: Scientific/Engineering :: Information Analysis
  Requires-Python: <3.12,>=3.10
+ Requires-Dist: category-encoders>=2.8.1
  Requires-Dist: fastparquet>=0.8.1
  Requires-Dist: ipywidgets>=8.1.0
  Requires-Dist: jarowinkler>=2.0.0
{upgini-1.2.79a1.dist-info → upgini-1.2.81a3832.dev1.dist-info}/RECORD RENAMED
@@ -1,12 +1,12 @@
- upgini/__about__.py,sha256=h3pHSW7QFH3c863fq8fxK5FCQiwhFda3blWAzoxplSE,25
+ upgini/__about__.py,sha256=-WSXUS5Ith33qArTnDO4LmrI0wUaXbJ8bIzoMZvAsWU,33
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
  upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
- upgini/features_enricher.py,sha256=_UkJS35uGaYtI7dR6Xd9Q28nmiPzTjhK3y8v3IjJTfQ,208245
+ upgini/features_enricher.py,sha256=qtrQJwF2QbKdQ8Tqk5RQj3aAqOzDgygD6nIHrco3AzE,209728
  upgini/http.py,sha256=UH7nswcZ221un3O_VW9limCBO5oRsyg1eKUHiVslRPs,43737
  upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
- upgini/metrics.py,sha256=l7r4cM-xrftcgOTv4uMQBHC_Sd820Z6umw5bIpP5wDI,39384
+ upgini/metrics.py,sha256=95sK1Kr3dYxqQcdkkoNFDe9OZY7OhgLjYwe3bhMQd38,38087
  upgini/search_task.py,sha256=RcvAE785yksWTsTNWuZFVNlk32jHElMoEna1T_C5N8Q,17823
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
  upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -66,11 +66,11 @@ upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
  upgini/utils/sklearn_ext.py,sha256=HpaNQaKJisgNE7IZ71n7uswxTj7kbPglU2G3s1sORAc,45042
  upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
- upgini/utils/target_utils.py,sha256=P0cCVRaakWLydYwFjk3TEaQfr0p0hfsJCvKRD8qcxiE,16650
+ upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,16832
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
- upgini-1.2.79a1.dist-info/METADATA,sha256=49MF6sCtAqdDrgL7s0hY2fm7T0ma0A5yeJQ6oIokZDo,49093
- upgini-1.2.79a1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
- upgini-1.2.79a1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
- upgini-1.2.79a1.dist-info/RECORD,,
+ upgini-1.2.81a3832.dev1.dist-info/METADATA,sha256=ShIRi8EeeujsKBJ0byR2XWJ6DKFka2vrViq9d5VwjzU,49141
+ upgini-1.2.81a3832.dev1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+ upgini-1.2.81a3832.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+ upgini-1.2.81a3832.dev1.dist-info/RECORD,,