upgini 1.2.80__py3-none-any.whl → 1.2.81a3832.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
-__version__ = "1.2.80"
+__version__ = "1.2.81a3832.dev1"
upgini/features_enricher.py CHANGED
@@ -63,7 +63,7 @@ from upgini.metadata import (
     RuntimeParameters,
     SearchKey,
 )
-from upgini.metrics import EstimatorWrapper, validate_scoring_argument
+from upgini.metrics import EstimatorWrapper, define_scorer, validate_scoring_argument
 from upgini.normalizer.normalize_utils import Normalizer
 from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
 from upgini.search_task import SearchTask
@@ -957,7 +957,7 @@ class FeaturesEnricher(TransformerMixin):
             self.__display_support_link(msg)
             return None
 
-        cat_features, search_keys_for_metrics = self._get_client_cat_features(
+        client_cat_features, search_keys_for_metrics = self._get_client_cat_features(
             estimator, validated_X, self.search_keys
         )
         search_keys_for_metrics.extend([c for c in self.id_columns or [] if c not in search_keys_for_metrics])
@@ -976,7 +976,7 @@ class FeaturesEnricher(TransformerMixin):
             search_keys_for_metrics=search_keys_for_metrics,
             progress_bar=progress_bar,
             progress_callback=progress_callback,
-            cat_features=cat_features,
+            cat_features=client_cat_features,
         )
         if prepared_data is None:
             return None
@@ -994,11 +994,19 @@ class FeaturesEnricher(TransformerMixin):
         ) = prepared_data
 
         # rename cat_features
-        if cat_features:
+        if client_cat_features:
             for new_c, old_c in columns_renaming.items():
-                if old_c in cat_features:
-                    cat_features.remove(old_c)
-                    cat_features.append(new_c)
+                if old_c in client_cat_features:
+                    client_cat_features.remove(old_c)
+                    client_cat_features.append(new_c)
+            for cat_feature in client_cat_features:
+                if cat_feature not in fitting_X.columns:
+                    self.logger.error(
+                        f"Client cat_feature `{cat_feature}` not found in"
+                        f" x columns: {fitting_X.columns.to_list()}"
+                    )
+        else:
+            client_cat_features = []
 
         gc.collect()
 
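Note: the rename loop above applies the {new_name: old_name} mapping in place, moving each renamed feature to the end of the client list. A minimal standalone sketch of that pattern (names hypothetical):

    columns_renaming = {"f1_a1b2": "f1"}  # new -> old, hypothetical mapping
    client_cat_features = ["f1", "f2"]

    for new_c, old_c in columns_renaming.items():
        if old_c in client_cat_features:
            client_cat_features.remove(old_c)
            client_cat_features.append(new_c)

    assert client_cat_features == ["f2", "f1_a1b2"]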
@@ -1019,20 +1027,17 @@ class FeaturesEnricher(TransformerMixin):
 
         has_date = self._get_date_column(search_keys) is not None
         model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
+        cat_features_from_backend = self.__get_categorical_features()
+        cat_features = list(set(client_cat_features + cat_features_from_backend))
+        baseline_cat_features = [f for f in cat_features if f in fitting_X.columns]
+        enriched_cat_features = [f for f in cat_features if f in fitting_enriched_X.columns]
+        if len(enriched_cat_features) < len(cat_features):
+            missing_cat_features = [f for f in cat_features if f not in fitting_enriched_X.columns]
+            self.logger.warning(
+                f"Some cat_features were not found in enriched_X: {missing_cat_features}"
+            )
 
-        wrapper = EstimatorWrapper.create(
-            estimator,
-            self.logger,
-            model_task_type,
-            _cv,
-            fitting_enriched_X,
-            scoring,
-            groups=groups,
-            text_features=text_features,
-            has_date=has_date,
-        )
-        metric = wrapper.metric_name
-        multiplier = wrapper.multiplier
+        _, metric, multiplier = define_scorer(model_task_type, scoring)
 
         # 1 If client features are presented - fit and predict with KFold estimator
         # on etalon features and calculate baseline metric
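Note: the full EstimatorWrapper was previously constructed here only to read metric_name and multiplier; define_scorer now returns both directly. A hedged sketch of the new contract, consistent with the define_scorer changes in metrics.py later in this diff (the ROC_AUC-to-GINI substitution now happens inside define_scorer):

    from upgini.metadata import ModelTaskType
    from upgini.metrics import define_scorer

    # Returns (scorer, metric_name, multiplier): a sklearn-style scorer callable,
    # a display name, and a factor applied to raw scores.
    scorer, metric, multiplier = define_scorer(ModelTaskType.BINARY, "roc_auc")
    assert metric == "GINI"  # per the GINI substitution shown in metrics.py below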
@@ -1050,9 +1055,8 @@ class FeaturesEnricher(TransformerMixin):
             self.logger,
             model_task_type,
             _cv,
-            fitting_enriched_X,
-            scoring,
-            cat_features,
+            scoring=scoring,
+            cat_features=baseline_cat_features,
             add_params=custom_loss_add_params,
             groups=groups,
             text_features=text_features,
@@ -1085,9 +1089,8 @@ class FeaturesEnricher(TransformerMixin):
             self.logger,
             model_task_type,
             _cv,
-            fitting_enriched_X,
-            scoring,
-            cat_features,
+            scoring=scoring,
+            cat_features=enriched_cat_features,
             add_params=custom_loss_add_params,
             groups=groups,
             text_features=text_features,
@@ -1428,12 +1431,20 @@ class FeaturesEnricher(TransformerMixin):
         if (
             estimator is not None
             and hasattr(estimator, "get_param")
+            and hasattr(estimator, "_init_params")
             and estimator.get_param("cat_features") is not None
         ):
-            cat_features = estimator.get_param("cat_features")
-            if len(cat_features) > 0:
-                if all([isinstance(f, int) for f in cat_features]):
-                    cat_features = [X.columns[i] for i in cat_features]
+            estimator_cat_features = estimator.get_param("cat_features")
+            if all([isinstance(c, int) for c in estimator_cat_features]):
+                cat_features = [X.columns[idx] for idx in estimator_cat_features]
+            elif all([isinstance(c, str) for c in estimator_cat_features]):
+                cat_features = estimator_cat_features
+            else:
+                print(f"WARNING: Unsupported type of cat_features in CatBoost estimator: {estimator_cat_features}")
+
+            del estimator._init_params["cat_features"]
+
+            if cat_features:
                 self.logger.info(f"Collected categorical features {cat_features} from user estimator")
                 for cat_feature in cat_features:
                     if cat_feature in search_keys:
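Note: CatBoost accepts cat_features either as column indices or as column names; the branch above normalizes both forms to names. A standalone sketch of that logic (hypothetical helper, pandas only):

    import pandas as pd

    def normalize_cat_features(X: pd.DataFrame, estimator_cat_features) -> list:
        # Indices -> column names; names pass through unchanged.
        if all(isinstance(c, int) for c in estimator_cat_features):
            return [X.columns[idx] for idx in estimator_cat_features]
        if all(isinstance(c, str) for c in estimator_cat_features):
            return list(estimator_cat_features)
        raise ValueError(f"Unsupported cat_features spec: {estimator_cat_features}")

    X = pd.DataFrame({"country": ["US", "DE"], "amount": [1.0, 2.0]})
    assert normalize_cat_features(X, [0]) == ["country"]
    assert normalize_cat_features(X, ["country"]) == ["country"]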
@@ -3855,6 +3866,13 @@ if response.status_code == 200:
 
         return importances
 
+    def __get_categorical_features(self) -> List[str]:
+        features_meta = self._search_task.get_all_features_metadata_v2()
+        if features_meta is None:
+            raise Exception(self.bundle.get("missing_features_meta"))
+
+        return [f.name for f in features_meta if f.type == "categorical"]
+
     def __prepare_feature_importances(
         self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
     ):
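Note: a minimal sketch of what the new __get_categorical_features helper consumes, assuming the backend feature metadata exposes name and type attributes (the stand-in class below is hypothetical):

    from dataclasses import dataclass

    @dataclass
    class FeatureMeta:  # hypothetical stand-in for get_all_features_metadata_v2() items
        name: str
        type: str

    features_meta = [FeatureMeta("country", "categorical"), FeatureMeta("amount", "numeric")]
    cat_features_from_backend = [f.name for f in features_meta if f.type == "categorical"]
    assert cat_features_from_backend == ["country"]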
upgini/metrics.py CHANGED
@@ -11,15 +11,15 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 import lightgbm as lgb
 import numpy as np
 import pandas as pd
+from category_encoders.cat_boost import CatBoostEncoder
 from lightgbm import LGBMClassifier, LGBMRegressor
 from numpy import log1p
 from pandas.api.types import is_numeric_dtype
 from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
-from sklearn.preprocessing import OrdinalEncoder
 
+from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
 from upgini.utils.features_validator import FeaturesValidator
 from upgini.utils.sklearn_ext import cross_validate
-from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
 
 try:
     from sklearn.metrics import get_scorer_names
@@ -36,7 +36,7 @@ from sklearn.model_selection import BaseCrossValidator, TimeSeriesSplit
 from upgini.errors import ValidationError
 from upgini.metadata import ModelTaskType
 from upgini.resource_bundle import bundle
-from upgini.utils.target_utils import correct_string_target
+from upgini.utils.target_utils import prepare_target
 
 DEFAULT_RANDOM_STATE = 42
 
@@ -287,6 +287,7 @@ class EstimatorWrapper:
         self,
         estimator,
         scorer: Callable,
+        cat_features: Optional[List[str]],
         metric_name: str,
         multiplier: int,
         cv: BaseCrossValidator,
@@ -298,9 +299,8 @@ class EstimatorWrapper:
     ):
         self.estimator = estimator
         self.scorer = scorer
-        self.metric_name = (
-            "GINI" if metric_name.upper() == "ROC_AUC" and target_type == ModelTaskType.BINARY else metric_name
-        )
+        self.cat_features = cat_features
+        self.metric_name = metric_name
         self.multiplier = multiplier
         self.cv = cv
         self.target_type = target_type
@@ -345,6 +345,8 @@ class EstimatorWrapper:
         else:
             x, y = self._remove_empty_target_rows(x, y)
 
+        y = prepare_target(y, self.target_type)
+
         self.logger.info(f"After preparing data columns: {x.columns.to_list()}")
         return x, y, groups
 
@@ -465,7 +467,7 @@ class EstimatorWrapper:
         logger: logging.Logger,
         target_type: ModelTaskType,
         cv: BaseCrossValidator,
-        x: pd.DataFrame,
+        *,
         scoring: Union[Callable, str, None] = None,
         cat_features: Optional[List[str]] = None,
         text_features: Optional[List[str]] = None,
@@ -473,9 +475,10 @@ class EstimatorWrapper:
         groups: Optional[List[str]] = None,
         has_date: Optional[bool] = None,
     ) -> EstimatorWrapper:
-        scorer, metric_name, multiplier = _get_scorer(target_type, scoring)
+        scorer, metric_name, multiplier = define_scorer(target_type, scoring)
         kwargs = {
             "scorer": scorer,
+            "cat_features": cat_features,
             "metric_name": metric_name,
             "multiplier": multiplier,
             "cv": cv,
@@ -509,11 +512,6 @@ class EstimatorWrapper:
             kwargs["estimator"] = estimator_copy
             if is_catboost_estimator(estimator):
                 if cat_features is not None:
-                    for cat_feature in cat_features:
-                        if cat_feature not in x.columns:
-                            logger.error(
-                                f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
-                            )
                     estimator_copy.set_params(cat_features=cat_features, has_time=has_date)
                 estimator = CatBoostWrapper(**kwargs)
             else:
@@ -536,6 +534,7 @@ class CatBoostWrapper(EstimatorWrapper):
         self,
         estimator,
         scorer: Callable,
+        cat_features: Optional[List[str]],
         metric_name: str,
         multiplier: int,
         cv: BaseCrossValidator,
@@ -547,6 +546,7 @@ class CatBoostWrapper(EstimatorWrapper):
         super(CatBoostWrapper, self).__init__(
             estimator,
             scorer,
+            cat_features,
             metric_name,
             multiplier,
             cv,
@@ -555,10 +555,9 @@ class CatBoostWrapper(EstimatorWrapper):
             text_features=text_features,
             logger=logger,
         )
-        self.cat_features = None
         self.emb_features = None
         self.grouped_embedding_features = None
-        self.exclude_features = []
+        self.drop_cat_features = []
 
     def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
         x, y, groups, params = super()._prepare_to_fit(x, y)
@@ -595,36 +594,9 @@ class CatBoostWrapper(EstimatorWrapper):
             self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
 
         # Find rest categorical features
-        self.cat_features = _get_cat_features(x, self.text_features, self.grouped_embedding_features)
-        # x = fill_na_cat_features(x, self.cat_features)
-        unique_cat_features = []
-        for name in self.cat_features:
-            # Remove constant categorical features
-            if x[name].nunique() > 1:
-                unique_cat_features.append(name)
-            else:
-                self.logger.info(f"Drop column {name} on preparing data for fit")
-                x = x.drop(columns=name)
-                self.exclude_features.append(name)
-        self.cat_features = unique_cat_features
-        if (
-            hasattr(self.estimator, "get_param")
-            and hasattr(self.estimator, "_init_params")
-            and self.estimator.get_param("cat_features") is not None
-        ):
-            estimator_cat_features = self.estimator.get_param("cat_features")
-            if all([isinstance(c, int) for c in estimator_cat_features]):
-                cat_features_idx = {x.columns.get_loc(c) for c in self.cat_features}
-                cat_features_idx.update(estimator_cat_features)
-                self.cat_features = [x.columns[idx] for idx in cat_features_idx]
-            elif all([isinstance(c, str) for c in estimator_cat_features]):
-                self.cat_features = list(set(self.cat_features + estimator_cat_features))
-            else:
-                print(f"WARNING: Unsupported type of cat_features in CatBoost estimator: {estimator_cat_features}")
-
-            del self.estimator._init_params["cat_features"]
-
-        self.logger.info(f"Selected categorical features: {self.cat_features}")
+        self.cat_features, self.features_to_encode, self.exclude_features = _get_cat_features(
+            self.logger, x, self.cat_features, self.text_features, self.grouped_embedding_features
+        )
         params["cat_features"] = self.cat_features
 
         return x, y, groups, params
@@ -655,7 +627,6 @@ class CatBoostWrapper(EstimatorWrapper):
             x, emb_columns = self.group_embeddings(x)
             params["embedding_features"] = emb_columns
         if self.cat_features:
-            # x = fill_na_cat_features(x, self.cat_features)
             params["cat_features"] = self.cat_features
 
         return x, y, params
@@ -725,6 +696,7 @@ class LightGBMWrapper(EstimatorWrapper):
         self,
         estimator,
         scorer: Callable,
+        cat_features: Optional[List[str]],
         metric_name: str,
         multiplier: int,
         cv: BaseCrossValidator,
@@ -736,6 +708,7 @@ class LightGBMWrapper(EstimatorWrapper):
         super(LightGBMWrapper, self).__init__(
             estimator,
             scorer,
+            cat_features,
             metric_name,
             multiplier,
             cv,
@@ -744,9 +717,10 @@ class LightGBMWrapper(EstimatorWrapper):
             text_features=text_features,
             logger=logger,
         )
-        self.cat_features = None
         self.cat_encoder = None
         self.n_classes = None
+        self.exclude_features = []
+        self.features_to_encode = []
 
     def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
         x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
@@ -756,30 +730,25 @@ class LightGBMWrapper(EstimatorWrapper):
         if self.target_type == ModelTaskType.BINARY:
             params["eval_metric"] = "auc"
             params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
-        self.cat_features = _get_cat_features(x)
-        if self.cat_features:
-            # x = fill_na_cat_features(x, self.cat_features)
-            encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)
-            encoded = pd.DataFrame(
-                encoder.fit_transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
-            )
-            x[self.cat_features] = encoded
+        self.cat_features, self.features_to_encode, self.exclude_features = _get_cat_features(
+            self.logger, x, self.cat_features
+        )
+        if self.features_to_encode:
+            encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, return_df=True)
+            encoded = encoder.fit_transform(x[self.features_to_encode].astype("object"), y_numpy).astype("category")
+            x[self.features_to_encode] = encoded
             self.cat_encoder = encoder
-        if not is_numeric_dtype(y_numpy):
-            y_numpy = correct_string_target(y_numpy)
 
         return x, y_numpy, groups, params
 
     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
+        if self.exclude_features:
+            x = x.drop(columns=self.exclude_features)
         x, y_numpy, params = super()._prepare_to_calculate(x, y)
-        if self.cat_features is not None:
-            # x = fill_na_cat_features(x, self.cat_features)
-            if self.cat_encoder is not None:
-                x[self.cat_features] = pd.DataFrame(
-                    self.cat_encoder.transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
-                )
-        if not is_numeric_dtype(y):
-            y_numpy = correct_string_target(y_numpy)
+        if self.features_to_encode is not None and self.cat_encoder is not None:
+            x[self.features_to_encode] = self.cat_encoder.transform(x[self.features_to_encode].astype("object")).astype(
+                "category"
+            )
         return x, y_numpy, params
 
     def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
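Note: the OrdinalEncoder used before is replaced by target-based CatBoost encoding from the category-encoders package (hence the new category-encoders>=2.8.1 dependency below). Because the encoding uses the target, fit_transform is applied to training data and plain transform to evaluation data. A minimal round-trip sketch with toy data:

    import pandas as pd
    from category_encoders.cat_boost import CatBoostEncoder

    x_train = pd.DataFrame({"country": ["US", "DE", "US", "FR"]})
    y_train = [1, 0, 1, 0]
    x_eval = pd.DataFrame({"country": ["DE", "FR"]})

    encoder = CatBoostEncoder(random_state=42, return_df=True)
    train_enc = encoder.fit_transform(x_train.astype("object"), y_train).astype("category")
    eval_enc = encoder.transform(x_eval.astype("object")).astype("category")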
@@ -805,20 +774,6 @@ class LightGBMWrapper(EstimatorWrapper):
             for i, col in enumerate(x.columns):
                 feature_importance[col] = np.mean(np.abs(shap_matrix[:, i]))
 
-            # # exclude last column (base value)
-            # shap_values_only = shap_values[:, :-1]
-            # mean_abs_shap = np.mean(np.abs(shap_values_only), axis=0)
-
-            # # For classification, shap_values is returned as a list for each class
-            # # Take values for the positive class
-            # if isinstance(shap_values, list):
-            #     shap_values = shap_values[1]
-
-            # # Calculate mean absolute SHAP value for each feature
-            # feature_importance = {}
-            # for i, col in enumerate(x.columns):
-            #     feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
-
             return feature_importance
 
         except Exception as e:
@@ -831,6 +786,7 @@ class OtherEstimatorWrapper(EstimatorWrapper):
         self,
         estimator,
         scorer: Callable,
+        cat_features: Optional[List[str]],
         metric_name: str,
         multiplier: int,
         cv: BaseCrossValidator,
@@ -842,6 +798,7 @@ class OtherEstimatorWrapper(EstimatorWrapper):
         super(OtherEstimatorWrapper, self).__init__(
             estimator,
             scorer,
+            cat_features,
             metric_name,
             multiplier,
             cv,
@@ -850,32 +807,32 @@ class OtherEstimatorWrapper(EstimatorWrapper):
             text_features=text_features,
             logger=logger,
         )
-        self.cat_features = None
 
     def _prepare_to_fit(self, x: pd.DataFrame, y: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
-        x, y, groups, params = super()._prepare_to_fit(x, y)
-        self.cat_features = _get_cat_features(x)
+        x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
+        self.cat_features, self.features_to_encode, self.exclude_features = _get_cat_features(
+            self.logger, x, self.cat_features
+        )
         num_features = [col for col in x.columns if col not in self.cat_features]
         x[num_features] = x[num_features].fillna(-999)
-        # x = fill_na_cat_features(x, self.cat_features)
-        # TODO use one-hot encoding if cardinality is less 50
-        for feature in self.cat_features:
-            x[feature] = x[feature].astype("category").cat.codes
-        if not is_numeric_dtype(y):
-            y = correct_string_target(y)
-        return x, y, groups, params
+        if self.cat_features:
+            encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, return_df=True)
+            encoded = encoder.fit_transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
+            x[self.cat_features] = encoded
+            self.cat_encoder = encoder
+        return x, y_numpy, groups, params
 
     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
+        if self.exclude_features:
+            x = x.drop(columns=self.exclude_features)
         x, y, params = super()._prepare_to_calculate(x, y)
         if self.cat_features is not None:
             num_features = [col for col in x.columns if col not in self.cat_features]
             x[num_features] = x[num_features].fillna(-999)
-            # x = fill_na_cat_features(x, self.cat_features)
-            # TODO use one-hot encoding if cardinality is less 50
-            for feature in self.cat_features:
-                x[feature] = x[feature].astype("category").cat.codes
-            if not is_numeric_dtype(y):
-                y = correct_string_target(y)
+            if self.features_to_encode and self.cat_encoder is not None:
+                x[self.features_to_encode] = self.cat_encoder.transform(x[self.features_to_encode].astype("object")).astype(
+                    "category"
+                )
         return x, y, params
 
@@ -938,7 +895,7 @@ def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
     return scoring, metric_name, multiplier
 
 
-def _get_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None]) -> Tuple[Callable, str, int]:
+def define_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None]) -> Tuple[Callable, str, int]:
     if scoring is None:
         if target_type == ModelTaskType.BINARY:
             scoring = "roc_auc"
@@ -957,16 +914,42 @@ def _get_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None])
     else:
         metric_name = str(scoring)
 
+    metric_name = "GINI" if metric_name.upper() == "ROC_AUC" and target_type == ModelTaskType.BINARY else metric_name
+
     return scoring, metric_name, multiplier
 
 
 def _get_cat_features(
-    x: pd.DataFrame, text_features: Optional[List[str]] = None, emb_features: Optional[List[str]] = None
+    logger: logging.Logger,
+    x: pd.DataFrame,
+    cat_features: Optional[List[str]],
+    text_features: Optional[List[str]] = None,
+    emb_features: Optional[List[str]] = None,
 ) -> List[str]:
+    cat_features = cat_features or []
     text_features = text_features or []
     emb_features = emb_features or []
     exclude_features = text_features + emb_features
-    return [c for c in x.columns if c not in exclude_features and not is_numeric_dtype(x[c])]
+    cat_features = [c for c in cat_features if c not in exclude_features]
+    unique_cat_features = []
+    drop_cat_features = []
+    for name in cat_features:
+        # Remove constant categorical features
+        if x[name].nunique() > 1:
+            unique_cat_features.append(name)
+        else:
+            logger.info(f"Drop column {name} on preparing data for fit")
+            x = x.drop(columns=name)
+            drop_cat_features.append(name)
+    cat_features = unique_cat_features
+
+    logger.info(f"Selected categorical features: {cat_features}")
+
+    features_to_encode = list(set(x.select_dtypes(exclude=[np.number, np.datetime64, pd.CategoricalDtype]).columns))
+
+    logger.info(f"Features to encode: {features_to_encode}")
+
+    return cat_features, features_to_encode, drop_cat_features
 
 
 def _get_add_params(input_params, add_params):
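Note: _get_cat_features now returns a 3-tuple (the -> List[str] annotation above survives the refactor unchanged) and derives features_to_encode from column dtypes rather than from the explicit cat_features list. A small demonstration of which columns that select_dtypes call picks up (toy frame, names hypothetical):

    import numpy as np
    import pandas as pd

    x = pd.DataFrame({
        "amount": [1.0, 2.0],                                # numeric: excluded
        "ts": pd.to_datetime(["2024-01-01", "2024-01-02"]),  # datetime: excluded
        "grade": pd.Series(["a", "b"], dtype="category"),    # already categorical: excluded
        "country": ["US", "DE"],                             # object: needs encoding
    })
    to_encode = list(set(x.select_dtypes(exclude=[np.number, np.datetime64, pd.CategoricalDtype]).columns))
    assert to_encode == ["country"]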
upgini/utils/target_utils.py CHANGED
@@ -3,7 +3,7 @@ from typing import Callable, List, Optional, Union
 
 import numpy as np
 import pandas as pd
-from pandas.api.types import is_numeric_dtype, is_bool_dtype
+from pandas.api.types import is_bool_dtype, is_datetime64_any_dtype, is_numeric_dtype
 
 from upgini.errors import ValidationError
 from upgini.metadata import SYSTEM_RECORD_ID, CVType, ModelTaskType
@@ -14,11 +14,14 @@ from upgini.utils.ts_utils import get_most_frequent_time_unit, trunc_datetime
 TS_MIN_DIFFERENT_IDS_RATIO = 0.2
 
 
-def correct_string_target(y: Union[pd.Series, np.ndarray]) -> Union[pd.Series, np.ndarray]:
-    if isinstance(y, pd.Series):
-        return y.astype(str).astype("category").cat.codes
-    elif isinstance(y, np.ndarray):
-        return pd.Series(y).astype(str).astype("category").cat.codes.values
+def prepare_target(y: Union[pd.Series, np.ndarray], target_type: ModelTaskType) -> Union[pd.Series, np.ndarray]:
+    if target_type != ModelTaskType.REGRESSION or (not is_numeric_dtype(y) and not is_datetime64_any_dtype(y)):
+        if isinstance(y, pd.Series):
+            y = y.astype(str).astype("category").cat.codes
+        elif isinstance(y, np.ndarray):
+            y = pd.Series(y).astype(str).astype("category").cat.codes.values
+
+    return y
 
 
 def define_task(
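Note: a usage sketch of the new helper. Non-regression targets, and non-numeric regression targets, are factorized to integer category codes; numeric regression targets pass through unchanged:

    import pandas as pd
    from upgini.metadata import ModelTaskType
    from upgini.utils.target_utils import prepare_target

    y_cls = pd.Series(["no", "yes", "no"])
    prepare_target(y_cls, ModelTaskType.BINARY).tolist()      # [0, 1, 0]

    y_reg = pd.Series([1.5, 2.5, 3.5])
    prepare_target(y_reg, ModelTaskType.REGRESSION).tolist()  # unchanged: [1.5, 2.5, 3.5]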
upgini-1.2.80.dist-info/METADATA → upgini-1.2.81a3832.dev1.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.80
+Version: 1.2.81a3832.dev1
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
@@ -22,6 +22,7 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Requires-Python: <3.12,>=3.10
+Requires-Dist: category-encoders>=2.8.1
 Requires-Dist: fastparquet>=0.8.1
 Requires-Dist: ipywidgets>=8.1.0
 Requires-Dist: jarowinkler>=2.0.0
upgini-1.2.80.dist-info/RECORD → upgini-1.2.81a3832.dev1.dist-info/RECORD RENAMED
@@ -1,12 +1,12 @@
-upgini/__about__.py,sha256=CoguueQtsTfVbd91MeGXrmsF-vGq7K1xnwf9nFL4qz0,23
+upgini/__about__.py,sha256=-WSXUS5Ith33qArTnDO4LmrI0wUaXbJ8bIzoMZvAsWU,33
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=MQZ01u-7jR8nSTlsyvMzUt-FvsbsBjds2TvQZG5F4vM,208296
+upgini/features_enricher.py,sha256=qtrQJwF2QbKdQ8Tqk5RQj3aAqOzDgygD6nIHrco3AzE,209728
 upgini/http.py,sha256=UH7nswcZ221un3O_VW9limCBO5oRsyg1eKUHiVslRPs,43737
 upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
-upgini/metrics.py,sha256=pv3LELb8QObiaKcUco5YUfM_rP2c7hseK2qtjKmjBGk,39378
+upgini/metrics.py,sha256=95sK1Kr3dYxqQcdkkoNFDe9OZY7OhgLjYwe3bhMQd38,38087
 upgini/search_task.py,sha256=RcvAE785yksWTsTNWuZFVNlk32jHElMoEna1T_C5N8Q,17823
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -66,11 +66,11 @@ upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml
 upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
 upgini/utils/sklearn_ext.py,sha256=HpaNQaKJisgNE7IZ71n7uswxTj7kbPglU2G3s1sORAc,45042
 upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
-upgini/utils/target_utils.py,sha256=P0cCVRaakWLydYwFjk3TEaQfr0p0hfsJCvKRD8qcxiE,16650
+upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,16832
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.80.dist-info/METADATA,sha256=szsz09LH3Kv4SMNG8Ogut33IDG0Tzqln2JsrLiEXPBc,49091
-upgini-1.2.80.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-upgini-1.2.80.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.80.dist-info/RECORD,,
+upgini-1.2.81a3832.dev1.dist-info/METADATA,sha256=ShIRi8EeeujsKBJ0byR2XWJ6DKFka2vrViq9d5VwjzU,49141
+upgini-1.2.81a3832.dev1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.81a3832.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.81a3832.dev1.dist-info/RECORD,,