upgini 1.2.80__py3-none-any.whl → 1.2.81__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.80"
1
+ __version__ = "1.2.81"
upgini/autofe/binary.py CHANGED
@@ -146,8 +146,8 @@ class Distance(PandasOperator):
146
146
 
147
147
  # row-wise dot product, handling None values
148
148
  def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
149
- left = left.apply(lambda x: np.array(x))
150
- right = right.apply(lambda x: np.array(x))
149
+ left = left.apply(lambda x: np.array(x).astype(np.float64))
150
+ right = right.apply(lambda x: np.array(x).astype(np.float64))
151
151
  res = (left.dropna() * right.dropna()).apply(np.sum)
152
152
  res = res.reindex(left.index.union(right.index))
153
153
  return res
@@ -60,12 +60,14 @@ class EWMAVolatility(VolatilityBase, ParametrizedOperator):
60
60
  return res
61
61
 
62
62
  def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
63
- return ts.apply(self._ewma_vol)
63
+ return ts.apply(self._ewma_vol).iloc[:, [-1]]
64
64
 
65
65
  def _ewma_vol(self, x):
66
- x = pd.DataFrame(x).iloc[:, -1]
67
- returns = self._get_returns(x, f"{self.step_size}{self.step_unit}")
68
- return returns.ewm(span=self.window_size).std()
66
+ return_series = isinstance(x, pd.Series)
67
+ x = pd.DataFrame(x)
68
+ returns = self._get_returns(x.iloc[:, -1], f"{self.step_size}{self.step_unit}")
69
+ x.iloc[:, -1] = returns.ewm(span=self.window_size).std()
70
+ return x.iloc[:, -1] if return_series else x
69
71
 
70
72
 
71
73
  class RollingVolBase(VolatilityBase):
@@ -63,7 +63,7 @@ from upgini.metadata import (
63
63
  RuntimeParameters,
64
64
  SearchKey,
65
65
  )
66
- from upgini.metrics import EstimatorWrapper, validate_scoring_argument
66
+ from upgini.metrics import EstimatorWrapper, define_scorer, validate_scoring_argument
67
67
  from upgini.normalizer.normalize_utils import Normalizer
68
68
  from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
69
69
  from upgini.search_task import SearchTask
@@ -310,6 +310,7 @@ class FeaturesEnricher(TransformerMixin):
310
310
  self._search_task = search_task.poll_result(trace_id, quiet=True, check_fit=True)
311
311
  file_metadata = self._search_task.get_file_metadata(trace_id)
312
312
  x_columns = [c.originalName or c.name for c in file_metadata.columns]
313
+ self.fit_columns_renaming = {c.name: c.originalName for c in file_metadata.columns}
313
314
  df = pd.DataFrame(columns=x_columns)
314
315
  self.__prepare_feature_importances(trace_id, df, silent=True)
315
316
  # TODO validate search_keys with search_keys from file_metadata
@@ -452,6 +453,7 @@ class FeaturesEnricher(TransformerMixin):
452
453
  """
453
454
  trace_id = str(uuid.uuid4())
454
455
  start_time = time.time()
456
+ auto_fe_parameters = AutoFEParameters() if auto_fe_parameters is None else auto_fe_parameters
455
457
  search_progress = SearchProgress(0.0, ProgressStage.START_FIT)
456
458
  if progress_callback is not None:
457
459
  progress_callback(search_progress)
@@ -476,7 +478,7 @@ class FeaturesEnricher(TransformerMixin):
476
478
  self.__validate_search_keys(self.search_keys)
477
479
 
478
480
  # Validate client estimator params
479
- self._get_client_cat_features(estimator, X, self.search_keys)
481
+ self._get_and_validate_client_cat_features(estimator, X, self.search_keys)
480
482
 
481
483
  try:
482
484
  self.X = X
@@ -606,6 +608,7 @@ class FeaturesEnricher(TransformerMixin):
606
608
  """
607
609
 
608
610
  self.warning_counter.reset()
611
+ auto_fe_parameters = AutoFEParameters() if auto_fe_parameters is None else auto_fe_parameters
609
612
  trace_id = str(uuid.uuid4())
610
613
  start_time = time.time()
611
614
  with MDC(trace_id=trace_id):
@@ -957,9 +960,17 @@ class FeaturesEnricher(TransformerMixin):
957
960
  self.__display_support_link(msg)
958
961
  return None
959
962
 
960
- cat_features, search_keys_for_metrics = self._get_client_cat_features(
963
+ cat_features_from_backend = self.__get_categorical_features()
964
+ client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
961
965
  estimator, validated_X, self.search_keys
962
966
  )
967
+ for cat_feature in cat_features_from_backend:
968
+ original_cat_feature = self.fit_columns_renaming.get(cat_feature)
969
+ if original_cat_feature in self.search_keys:
970
+ if self.search_keys[original_cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
971
+ search_keys_for_metrics.append(original_cat_feature)
972
+ else:
973
+ self.logger.warning(self.bundle.get("cat_feature_search_key").format(original_cat_feature))
963
974
  search_keys_for_metrics.extend([c for c in self.id_columns or [] if c not in search_keys_for_metrics])
964
975
  self.logger.info(f"Search keys for metrics: {search_keys_for_metrics}")
965
976
 
@@ -976,7 +987,7 @@ class FeaturesEnricher(TransformerMixin):
976
987
  search_keys_for_metrics=search_keys_for_metrics,
977
988
  progress_bar=progress_bar,
978
989
  progress_callback=progress_callback,
979
- cat_features=cat_features,
990
+ client_cat_features=client_cat_features,
980
991
  )
981
992
  if prepared_data is None:
982
993
  return None
@@ -994,11 +1005,25 @@ class FeaturesEnricher(TransformerMixin):
994
1005
  ) = prepared_data
995
1006
 
996
1007
  # rename cat_features
997
- if cat_features:
1008
+ if client_cat_features:
998
1009
  for new_c, old_c in columns_renaming.items():
999
- if old_c in cat_features:
1000
- cat_features.remove(old_c)
1001
- cat_features.append(new_c)
1010
+ if old_c in client_cat_features:
1011
+ client_cat_features.remove(old_c)
1012
+ client_cat_features.append(new_c)
1013
+ for cat_feature in client_cat_features:
1014
+ if cat_feature not in fitting_X.columns:
1015
+ self.logger.error(
1016
+ f"Client cat_feature `{cat_feature}` not found in"
1017
+ f" x columns: {fitting_X.columns.to_list()}"
1018
+ )
1019
+ else:
1020
+ client_cat_features = []
1021
+
1022
+ # rename baseline_score_column
1023
+ reversed_renaming = {v: k for k, v in columns_renaming.items()}
1024
+ baseline_score_column = self.baseline_score_column
1025
+ if baseline_score_column is not None:
1026
+ baseline_score_column = reversed_renaming[baseline_score_column]
1002
1027
 
1003
1028
  gc.collect()
1004
1029
 
@@ -1006,12 +1031,12 @@ class FeaturesEnricher(TransformerMixin):
1006
1031
  self.__log_warning(self.bundle.get("metrics_no_important_free_features"))
1007
1032
  return None
1008
1033
 
1009
- maybe_phone_column = self._get_phone_column(self.search_keys)
1010
- text_features = (
1011
- [f for f in self.generate_features if f != maybe_phone_column]
1012
- if self.generate_features is not None
1013
- else None
1014
- )
1034
+ text_features = self.generate_features.copy() if self.generate_features else None
1035
+ if text_features:
1036
+ for renamed, original in columns_renaming.items():
1037
+ if original in text_features:
1038
+ text_features.remove(original)
1039
+ text_features.append(renamed)
1015
1040
 
1016
1041
  print(self.bundle.get("metrics_start"))
1017
1042
  with Spinner():
@@ -1019,25 +1044,20 @@ class FeaturesEnricher(TransformerMixin):
1019
1044
 
1020
1045
  has_date = self._get_date_column(search_keys) is not None
1021
1046
  model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
1047
+ cat_features = list(set(client_cat_features + cat_features_from_backend))
1048
+ baseline_cat_features = [f for f in cat_features if f in fitting_X.columns]
1049
+ enriched_cat_features = [f for f in cat_features if f in fitting_enriched_X.columns]
1050
+ if len(enriched_cat_features) < len(cat_features):
1051
+ missing_cat_features = [f for f in cat_features if f not in fitting_enriched_X.columns]
1052
+ self.logger.warning(f"Some cat_features were not found in enriched_X: {missing_cat_features}")
1022
1053
 
1023
- wrapper = EstimatorWrapper.create(
1024
- estimator,
1025
- self.logger,
1026
- model_task_type,
1027
- _cv,
1028
- fitting_enriched_X,
1029
- scoring,
1030
- groups=groups,
1031
- text_features=text_features,
1032
- has_date=has_date,
1033
- )
1034
- metric = wrapper.metric_name
1035
- multiplier = wrapper.multiplier
1054
+ _, metric, multiplier = define_scorer(model_task_type, scoring)
1036
1055
 
1037
1056
  # 1 If client features are presented - fit and predict with KFold estimator
1038
1057
  # on etalon features and calculate baseline metric
1039
- etalon_metric = None
1058
+ baseline_metric = None
1040
1059
  baseline_estimator = None
1060
+ updating_shaps = None
1041
1061
  custom_loss_add_params = get_additional_params_custom_loss(
1042
1062
  self.loss, model_task_type, logger=self.logger
1043
1063
  )
@@ -1050,25 +1070,25 @@ class FeaturesEnricher(TransformerMixin):
1050
1070
  self.logger,
1051
1071
  model_task_type,
1052
1072
  _cv,
1053
- fitting_enriched_X,
1054
- scoring,
1055
- cat_features,
1073
+ scoring=scoring,
1074
+ cat_features=baseline_cat_features,
1056
1075
  add_params=custom_loss_add_params,
1057
1076
  groups=groups,
1058
1077
  text_features=text_features,
1059
1078
  has_date=has_date,
1060
1079
  )
1061
- etalon_cv_result = baseline_estimator.cross_val_predict(
1062
- fitting_X, y_sorted, self.baseline_score_column
1080
+ baseline_cv_result = baseline_estimator.cross_val_predict(
1081
+ fitting_X, y_sorted, baseline_score_column
1063
1082
  )
1064
- etalon_metric = etalon_cv_result.get_display_metric()
1065
- if etalon_metric is None:
1083
+ baseline_metric = baseline_cv_result.get_display_metric()
1084
+ if baseline_metric is None:
1066
1085
  self.logger.info(
1067
1086
  f"Baseline {metric} on train client features is None (maybe all features was removed)"
1068
1087
  )
1069
1088
  baseline_estimator = None
1070
1089
  else:
1071
- self.logger.info(f"Baseline {metric} on train client features: {etalon_metric}")
1090
+ self.logger.info(f"Baseline {metric} on train client features: {baseline_metric}")
1091
+ updating_shaps = baseline_cv_result.shap_values
1072
1092
 
1073
1093
  # 2 Fit and predict with KFold estimator on enriched tds
1074
1094
  # and calculate final metric (and uplift)
@@ -1085,9 +1105,8 @@ class FeaturesEnricher(TransformerMixin):
1085
1105
  self.logger,
1086
1106
  model_task_type,
1087
1107
  _cv,
1088
- fitting_enriched_X,
1089
- scoring,
1090
- cat_features,
1108
+ scoring=scoring,
1109
+ cat_features=enriched_cat_features,
1091
1110
  add_params=custom_loss_add_params,
1092
1111
  groups=groups,
1093
1112
  text_features=text_features,
@@ -1095,10 +1114,7 @@ class FeaturesEnricher(TransformerMixin):
1095
1114
  )
1096
1115
  enriched_cv_result = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
1097
1116
  enriched_metric = enriched_cv_result.get_display_metric()
1098
- enriched_shaps = enriched_cv_result.shap_values
1099
-
1100
- if enriched_shaps is not None:
1101
- self._update_shap_values(trace_id, fitting_X, enriched_shaps, silent=not internal_call)
1117
+ updating_shaps = enriched_cv_result.shap_values
1102
1118
 
1103
1119
  if enriched_metric is None:
1104
1120
  self.logger.warning(
@@ -1107,8 +1123,8 @@ class FeaturesEnricher(TransformerMixin):
1107
1123
  enriched_estimator = None
1108
1124
  else:
1109
1125
  self.logger.info(f"Enriched {metric} on train combined features: {enriched_metric}")
1110
- if etalon_metric is not None and enriched_metric is not None:
1111
- uplift = (enriched_cv_result.metric - etalon_cv_result.metric) * multiplier
1126
+ if baseline_metric is not None and enriched_metric is not None:
1127
+ uplift = (enriched_cv_result.metric - baseline_cv_result.metric) * multiplier
1112
1128
 
1113
1129
  train_metrics = {
1114
1130
  self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
@@ -1126,8 +1142,10 @@ class FeaturesEnricher(TransformerMixin):
1126
1142
  np.mean(y_sorted),
1127
1143
  4,
1128
1144
  )
1129
- if etalon_metric is not None:
1130
- train_metrics[self.bundle.get("quality_metrics_baseline_header").format(metric)] = etalon_metric
1145
+ if baseline_metric is not None:
1146
+ train_metrics[self.bundle.get("quality_metrics_baseline_header").format(metric)] = (
1147
+ baseline_metric
1148
+ )
1131
1149
  if enriched_metric is not None:
1132
1150
  train_metrics[self.bundle.get("quality_metrics_enriched_header").format(metric)] = (
1133
1151
  enriched_metric
@@ -1156,7 +1174,7 @@ class FeaturesEnricher(TransformerMixin):
1156
1174
  f"on client features: {eval_X_sorted.columns.to_list()}"
1157
1175
  )
1158
1176
  etalon_eval_results = baseline_estimator.calculate_metric(
1159
- eval_X_sorted, eval_y_sorted, self.baseline_score_column
1177
+ eval_X_sorted, eval_y_sorted, baseline_score_column
1160
1178
  )
1161
1179
  etalon_eval_metric = etalon_eval_results.get_display_metric()
1162
1180
  self.logger.info(
@@ -1218,6 +1236,9 @@ class FeaturesEnricher(TransformerMixin):
1218
1236
 
1219
1237
  metrics.append(eval_metrics)
1220
1238
 
1239
+ if updating_shaps is not None:
1240
+ self._update_shap_values(trace_id, fitting_X, updating_shaps, silent=not internal_call)
1241
+
1221
1242
  metrics_df = pd.DataFrame(metrics)
1222
1243
  mean_target_hdr = self.bundle.get("quality_metrics_mean_target_header")
1223
1244
  if mean_target_hdr in metrics_df.columns:
@@ -1268,6 +1289,7 @@ class FeaturesEnricher(TransformerMixin):
1268
1289
 
1269
1290
  def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float], silent: bool = False):
1270
1291
  renaming = self.fit_columns_renaming or {}
1292
+ self.logger.info(f"Updating SHAP values: {new_shaps}")
1271
1293
  new_shaps = {
1272
1294
  renaming.get(feature, feature): _round_shap_value(shap)
1273
1295
  for feature, shap in new_shaps.items()
@@ -1420,7 +1442,7 @@ class FeaturesEnricher(TransformerMixin):
1420
1442
 
1421
1443
  return _cv, groups
1422
1444
 
1423
- def _get_client_cat_features(
1445
+ def _get_and_validate_client_cat_features(
1424
1446
  self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
1425
1447
  ) -> Tuple[Optional[List[str]], List[str]]:
1426
1448
  cat_features = None
@@ -1428,12 +1450,20 @@ class FeaturesEnricher(TransformerMixin):
1428
1450
  if (
1429
1451
  estimator is not None
1430
1452
  and hasattr(estimator, "get_param")
1453
+ and hasattr(estimator, "_init_params")
1431
1454
  and estimator.get_param("cat_features") is not None
1432
1455
  ):
1433
- cat_features = estimator.get_param("cat_features")
1434
- if len(cat_features) > 0:
1435
- if all([isinstance(f, int) for f in cat_features]):
1436
- cat_features = [X.columns[i] for i in cat_features]
1456
+ estimator_cat_features = estimator.get_param("cat_features")
1457
+ if all([isinstance(c, int) for c in estimator_cat_features]):
1458
+ cat_features = [X.columns[idx] for idx in estimator_cat_features]
1459
+ elif all([isinstance(c, str) for c in estimator_cat_features]):
1460
+ cat_features = estimator_cat_features
1461
+ else:
1462
+ print(f"WARNING: Unsupported type of cat_features in CatBoost estimator: {estimator_cat_features}")
1463
+
1464
+ del estimator._init_params["cat_features"]
1465
+
1466
+ if cat_features:
1437
1467
  self.logger.info(f"Collected categorical features {cat_features} from user estimator")
1438
1468
  for cat_feature in cat_features:
1439
1469
  if cat_feature in search_keys:
@@ -1457,7 +1487,7 @@ class FeaturesEnricher(TransformerMixin):
1457
1487
  search_keys_for_metrics: Optional[List[str]] = None,
1458
1488
  progress_bar: Optional[ProgressBar] = None,
1459
1489
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
1460
- cat_features: Optional[List[str]] = None,
1490
+ client_cat_features: Optional[List[str]] = None,
1461
1491
  ):
1462
1492
  is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
1463
1493
  is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
@@ -1531,7 +1561,7 @@ class FeaturesEnricher(TransformerMixin):
1531
1561
 
1532
1562
  # Detect and drop high cardinality columns in train
1533
1563
  columns_with_high_cardinality = FeaturesValidator.find_high_cardinality(fitting_X)
1534
- non_excluding_columns = (self.generate_features or []) + (cat_features or [])
1564
+ non_excluding_columns = (self.generate_features or []) + (client_cat_features or [])
1535
1565
  columns_with_high_cardinality = [c for c in columns_with_high_cardinality if c not in non_excluding_columns]
1536
1566
  if len(columns_with_high_cardinality) > 0:
1537
1567
  self.logger.warning(
@@ -1751,10 +1781,10 @@ class FeaturesEnricher(TransformerMixin):
1751
1781
  df = generator.generate(df)
1752
1782
  generated_features.extend(generator.generated_features)
1753
1783
 
1754
- # normalizer = Normalizer(self.bundle, self.logger)
1755
- # df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
1756
- # columns_renaming = normalizer.columns_renaming
1757
- columns_renaming = {c: c for c in df.columns}
1784
+ normalizer = Normalizer(self.bundle, self.logger)
1785
+ df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
1786
+ columns_renaming = normalizer.columns_renaming
1787
+ # columns_renaming = {c: c for c in df.columns}
1758
1788
 
1759
1789
  df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
1760
1790
 
@@ -1886,6 +1916,13 @@ class FeaturesEnricher(TransformerMixin):
1886
1916
  enriched_eval_X = enriched_eval_sets[idx + 1][enriched_X_columns].copy()
1887
1917
  eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
1888
1918
 
1919
+ reversed_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
1920
+ X_sampled.rename(columns=reversed_renaming, inplace=True)
1921
+ enriched_X.rename(columns=reversed_renaming, inplace=True)
1922
+ for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
1923
+ eval_X_sampled.rename(columns=reversed_renaming, inplace=True)
1924
+ enriched_eval_X.rename(columns=reversed_renaming, inplace=True)
1925
+
1889
1926
  datasets_hash = hash_input(self.X, self.y, self.eval_set)
1890
1927
  return self.__cache_and_return_results(
1891
1928
  datasets_hash,
@@ -1942,6 +1979,14 @@ class FeaturesEnricher(TransformerMixin):
1942
1979
  enriched_df, x_columns, enriched_X.columns.tolist(), len(eval_set) if has_eval_set else 0
1943
1980
  )
1944
1981
 
1982
+ # Add hash-suffixes because output of transform has original names
1983
+ reversed_renaming = {v: k for k, v in columns_renaming.items()}
1984
+ X_sampled.rename(columns=reversed_renaming, inplace=True)
1985
+ enriched_X.rename(columns=reversed_renaming, inplace=True)
1986
+ for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
1987
+ eval_X_sampled.rename(columns=reversed_renaming, inplace=True)
1988
+ enriched_eval_X.rename(columns=reversed_renaming, inplace=True)
1989
+
1945
1990
  # Cache and return results
1946
1991
  datasets_hash = hash_input(validated_X, validated_y, eval_set)
1947
1992
  return self.__cache_and_return_results(
@@ -2069,10 +2114,12 @@ class FeaturesEnricher(TransformerMixin):
2069
2114
  search_keys: Dict,
2070
2115
  columns_renaming: Dict[str, str],
2071
2116
  ):
2117
+ # X_sampled - with hash-suffixes
2118
+ reversed_renaming = {v: k for k, v in columns_renaming.items()}
2072
2119
  search_keys = {
2073
- columns_renaming.get(k, k): v
2120
+ reversed_renaming.get(k, k): v
2074
2121
  for k, v in search_keys.items()
2075
- if columns_renaming.get(k, k) in X_sampled.columns.to_list()
2122
+ if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
2076
2123
  }
2077
2124
  return FeaturesEnricher._SampledDataForMetrics(
2078
2125
  X_sampled=X_sampled,
@@ -2334,6 +2381,25 @@ if response.status_code == 200:
2334
2381
  df[columns_for_system_record_id], index=False
2335
2382
  ).astype("float64")
2336
2383
 
2384
+ features_not_to_pass = []
2385
+ if add_fit_system_record_id:
2386
+ df = self.__add_fit_system_record_id(
2387
+ df,
2388
+ search_keys,
2389
+ SYSTEM_RECORD_ID,
2390
+ TARGET,
2391
+ columns_renaming,
2392
+ silent=True,
2393
+ )
2394
+ df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
2395
+ features_not_to_pass.append(SORT_ID)
2396
+
2397
+ system_columns_with_original_index = [ENTITY_SYSTEM_RECORD_ID] + generated_features
2398
+ if add_fit_system_record_id:
2399
+ system_columns_with_original_index.append(SORT_ID)
2400
+
2401
+ df_before_explode = df[system_columns_with_original_index].copy()
2402
+
2337
2403
  # Explode multiple search keys
2338
2404
  df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys, columns_renaming)
2339
2405
 
@@ -2381,25 +2447,13 @@ if response.status_code == 200:
2381
2447
  meaning_types.update({col: FileColumnMeaningType.FEATURE for col in features_for_transform})
2382
2448
  meaning_types.update({col: key.value for col, key in search_keys.items()})
2383
2449
 
2384
- features_not_to_pass = [
2450
+ features_not_to_pass.extend([
2385
2451
  c
2386
2452
  for c in df.columns
2387
2453
  if c not in search_keys.keys()
2388
2454
  and c not in features_for_transform
2389
2455
  and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
2390
- ]
2391
-
2392
- if add_fit_system_record_id:
2393
- df = self.__add_fit_system_record_id(
2394
- df,
2395
- search_keys,
2396
- SYSTEM_RECORD_ID,
2397
- TARGET,
2398
- columns_renaming,
2399
- silent=True,
2400
- )
2401
- df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
2402
- features_not_to_pass.append(SORT_ID)
2456
+ ])
2403
2457
 
2404
2458
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2405
2459
  df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
@@ -2415,10 +2469,6 @@ if response.status_code == 200:
2415
2469
  meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
2416
2470
 
2417
2471
  df = df.reset_index(drop=True)
2418
- system_columns_with_original_index = [SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID] + generated_features
2419
- if add_fit_system_record_id:
2420
- system_columns_with_original_index.append(SORT_ID)
2421
- df_with_original_index = df[system_columns_with_original_index].copy()
2422
2472
 
2423
2473
  combined_search_keys = combine_search_keys(search_keys.keys())
2424
2474
 
@@ -2526,7 +2576,7 @@ if response.status_code == 200:
2526
2576
  combined_df = pd.concat(
2527
2577
  [
2528
2578
  validated_Xy.reset_index(drop=True),
2529
- df_with_original_index.reset_index(drop=True),
2579
+ df_before_explode.reset_index(drop=True),
2530
2580
  ],
2531
2581
  axis=1,
2532
2582
  ).set_index(validated_Xy.index)
@@ -2683,7 +2733,7 @@ if response.status_code == 200:
2683
2733
  importance_threshold: Optional[float],
2684
2734
  max_features: Optional[int],
2685
2735
  remove_outliers_calc_metrics: Optional[bool],
2686
- auto_fe_parameters: Optional[AutoFEParameters] = None,
2736
+ auto_fe_parameters: AutoFEParameters,
2687
2737
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
2688
2738
  search_id_callback: Optional[Callable[[str], Any]] = None,
2689
2739
  ):
@@ -2729,7 +2779,9 @@ if response.status_code == 200:
2729
2779
  if self.id_columns is not None:
2730
2780
  for id_column in self.id_columns:
2731
2781
  if id_column not in validated_X.columns:
2732
- raise ValidationError(self.bundle.get("missing_id_column").format(id_column))
2782
+ raise ValidationError(
2783
+ self.bundle.get("missing_id_column").format(id_column, list(validated_X.columns))
2784
+ )
2733
2785
 
2734
2786
  validate_scoring_argument(scoring)
2735
2787
 
@@ -3071,7 +3123,7 @@ if response.status_code == 200:
3071
3123
  self.__show_selected_features(self.fit_search_keys)
3072
3124
 
3073
3125
  autofe_description = self.get_autofe_features_description()
3074
- if autofe_description is not None:
3126
+ if autofe_description is not None and len(autofe_description) > 0:
3075
3127
  self.logger.info(f"AutoFE descriptions: {autofe_description}")
3076
3128
  self.autofe_features_display_handle = display_html_dataframe(
3077
3129
  df=autofe_description,
@@ -3855,6 +3907,13 @@ if response.status_code == 200:
3855
3907
 
3856
3908
  return importances
3857
3909
 
3910
+ def __get_categorical_features(self) -> List[str]:
3911
+ features_meta = self._search_task.get_all_features_metadata_v2()
3912
+ if features_meta is None:
3913
+ raise Exception(self.bundle.get("missing_features_meta"))
3914
+
3915
+ return [f.name for f in features_meta if f.type == "categorical"]
3916
+
3858
3917
  def __prepare_feature_importances(
3859
3918
  self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
3860
3919
  ):
@@ -3883,7 +3942,10 @@ if response.status_code == 200:
3883
3942
  if feature_meta.name in original_names_dict.keys():
3884
3943
  feature_meta.name = original_names_dict[feature_meta.name]
3885
3944
 
3886
- if updated_shaps is not None:
3945
+ is_client_feature = feature_meta.name in df.columns
3946
+
3947
+ # Show and update shap values for client features only if select_features is True
3948
+ if updated_shaps is not None and (not is_client_feature or self.fit_select_features):
3887
3949
  updating_shap = updated_shaps.get(feature_meta.name)
3888
3950
  if updating_shap is None:
3889
3951
  if feature_meta.shap_value != 0.0:
@@ -3906,6 +3968,7 @@ if response.status_code == 200:
3906
3968
  continue
3907
3969
 
3908
3970
  # Use only important features
3971
+ # If select_features is False, we don't show etalon features in the report
3909
3972
  if (
3910
3973
  # feature_meta.name in self.fit_generated_features or
3911
3974
  feature_meta.name == COUNTRY # constant synthetic column
@@ -4216,7 +4279,7 @@ if response.status_code == 200:
4216
4279
  def __show_selected_features(self, search_keys: Dict[str, SearchKey]):
4217
4280
  search_key_names = [col for col, tpe in search_keys.items() if tpe != SearchKey.CUSTOM_KEY]
4218
4281
  if self.fit_columns_renaming:
4219
- search_key_names = [self.fit_columns_renaming.get(col, col) for col in search_key_names]
4282
+ search_key_names = sorted(set([self.fit_columns_renaming.get(col, col) for col in search_key_names]))
4220
4283
  msg = self.bundle.get("features_info_header").format(len(self.feature_names_), search_key_names)
4221
4284
 
4222
4285
  try:
@@ -4232,12 +4295,13 @@ if response.status_code == 200:
4232
4295
  display_id=f"features_info_{uuid.uuid4()}",
4233
4296
  )
4234
4297
 
4235
- self.data_sources_display_handle = display_html_dataframe(
4236
- self.relevant_data_sources,
4237
- self._relevant_data_sources_wo_links,
4238
- self.bundle.get("relevant_data_sources_header"),
4239
- display_id=f"data_sources_{uuid.uuid4()}",
4240
- )
4298
+ if len(self.relevant_data_sources) > 0:
4299
+ self.data_sources_display_handle = display_html_dataframe(
4300
+ self.relevant_data_sources,
4301
+ self._relevant_data_sources_wo_links,
4302
+ self.bundle.get("relevant_data_sources_header"),
4303
+ display_id=f"data_sources_{uuid.uuid4()}",
4304
+ )
4241
4305
  else:
4242
4306
  msg = self.bundle.get("features_info_zero_important_features")
4243
4307
  self.__log_warning(msg, show_support_link=True)