upgini 1.2.113a4__py3-none-any.whl → 1.2.113a3974.dev1__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registries.
@@ -112,7 +112,6 @@ except Exception:
  CustomFallbackProgressBar as ProgressBar,
  )

- from upgini.utils.psi import calculate_features_psi
  from upgini.utils.sample_utils import SampleColumns, SampleConfig, _num_samples, sample
  from upgini.utils.sort import sort_columns
  from upgini.utils.target_utils import calculate_psi, define_task
@@ -298,9 +297,7 @@ class FeaturesEnricher(TransformerMixin):
  self.feature_names_ = []
  self.external_source_feature_names = []
  self.zero_shap_client_features = []
- self.unstable_client_features = []
  self.feature_importances_ = []
- self.psi_values: Optional[Dict[str, float]] = None
  self.search_id = search_id
  self.disable_force_downsampling = disable_force_downsampling
  self.print_trace_id = print_trace_id
@@ -401,26 +398,13 @@ class FeaturesEnricher(TransformerMixin):
  @staticmethod
  def _check_eval_set(eval_set, X, bundle: ResourceBundle):
  checked_eval_set = []
- if eval_set is None:
- return checked_eval_set
- if isinstance(eval_set, tuple):
+ if eval_set is not None and isinstance(eval_set, tuple):
  eval_set = [eval_set]
- if not isinstance(eval_set, list):
+ if eval_set is not None and not isinstance(eval_set, list):
  raise ValidationError(bundle.get("unsupported_type_eval_set").format(type(eval_set)))
  for eval_pair in eval_set or []:
- # Handle OOT
- if isinstance(eval_pair, pd.DataFrame):
- empty_target = pd.Series([np.nan] * len(eval_pair), index=eval_pair.index)
- eval_pair = (eval_pair, empty_target)
- elif isinstance(eval_pair, tuple) and len(eval_pair) == 1:
- empty_target = pd.Series([np.nan] * len(eval_pair[0]), index=eval_pair[0].index)
- eval_pair = (eval_pair[0], empty_target)
-
  if not isinstance(eval_pair, tuple) or len(eval_pair) != 2:
  raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
- if eval_pair[1] is None:
- empty_target = pd.Series([np.nan] * len(eval_pair[0]), index=eval_pair[0].index)
- eval_pair = (eval_pair[0], empty_target)
  if not is_frames_equal(X, eval_pair[0], bundle):
  checked_eval_set.append(eval_pair)
  return checked_eval_set
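
Reviewer note: with this hunk, _check_eval_set no longer expands bare DataFrames, single-element tuples, or (X, None) pairs into out-of-time (OOT) pairs with an all-NaN target; every eval_set entry must now arrive as a labeled 2-tuple. A minimal sketch of the calling-convention change, assuming the public FeaturesEnricher.fit API (the frames and column names are illustrative only):

import pandas as pd

train_X = pd.DataFrame({"date": pd.date_range("2024-01-01", periods=8), "f": range(8)})
train_y = pd.Series([0, 1, 0, 1, 0, 1, 0, 1])
oot_X = pd.DataFrame({"date": pd.date_range("2024-03-01", periods=4), "f": range(4)})

# 1.2.113a4 accepted an unlabeled out-of-time frame and padded its target with NaN:
# enricher.fit(train_X, train_y, eval_set=[oot_X])

# After this change, each eval_set entry must be a labeled (X, y) pair:
oot_y = pd.Series([0, 1, 0, 1])
# enricher.fit(train_X, train_y, eval_set=[(oot_X, oot_y)])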
@@ -442,7 +426,6 @@ class FeaturesEnricher(TransformerMixin):
  search_id_callback: Optional[Callable[[str], Any]] = None,
  select_features: bool = True,
  auto_fe_parameters: Optional[AutoFEParameters] = None,
- stability_threshold: float = 0.15,
  **kwargs,
  ):
  """Fit to data.
@@ -532,7 +515,6 @@ class FeaturesEnricher(TransformerMixin):
  estimator=estimator,
  scoring=scoring,
  importance_threshold=importance_threshold,
- stability_threshold=stability_threshold,
  max_features=max_features,
  remove_outliers_calc_metrics=remove_outliers_calc_metrics,
  auto_fe_parameters=auto_fe_parameters,
@@ -592,7 +574,6 @@ class FeaturesEnricher(TransformerMixin):
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
  select_features: bool = True,
  auto_fe_parameters: Optional[AutoFEParameters] = None,
- stability_threshold: float = 0.15,
  **kwargs,
  ) -> pd.DataFrame:
  """Fit to data, then transform it.
@@ -637,10 +618,6 @@ class FeaturesEnricher(TransformerMixin):
  If True, return only selected features both from input and data sources.
  Otherwise, return all features from input and only selected features from data sources.

- stability_threshold: float, optional (default=0.15)
- Stability threshold for selected features PSI calculation. If PSI is less than this threshold,
- then feature will be dropped.
-
  Returns
  -------
  X_new: pandas.DataFrame of shape (n_samples, n_features_new)
@@ -697,7 +674,6 @@ class FeaturesEnricher(TransformerMixin):
  scoring=scoring,
  estimator=estimator,
  importance_threshold=importance_threshold,
- stability_threshold=stability_threshold,
  max_features=max_features,
  remove_outliers_calc_metrics=remove_outliers_calc_metrics,
  auto_fe_parameters=auto_fe_parameters,
@@ -965,7 +941,7 @@ class FeaturesEnricher(TransformerMixin):
  raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))

  validated_X, validated_y, validated_eval_set = self._validate_train_eval(
- effective_X, effective_y, effective_eval_set, silent=internal_call
+ effective_X, effective_y, effective_eval_set
  )

  if self.X is None:
@@ -1003,31 +979,29 @@ class FeaturesEnricher(TransformerMixin):
  return None

  cat_features_from_backend = self.__get_categorical_features()
- # Convert to original names
- cat_features_from_backend = [self.fit_columns_renaming.get(c, c) for c in cat_features_from_backend]
  client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
  estimator, validated_X, self.search_keys
  )
- # Exclude id columns from cat_features
  if self.id_columns and self.id_columns_encoder is not None:
  if cat_features_from_backend:
  cat_features_from_backend = [
  c
  for c in cat_features_from_backend
- if c not in self.id_columns_encoder.feature_names_in_
+ if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
  ]
  if client_cat_features:
  client_cat_features = [
  c
  for c in client_cat_features
- if c not in self.id_columns_encoder.feature_names_in_
+ if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
  ]
  for cat_feature in cat_features_from_backend:
- if cat_feature in self.search_keys:
- if self.search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
- search_keys_for_metrics.append(cat_feature)
+ original_cat_feature = self.fit_columns_renaming.get(cat_feature)
+ if original_cat_feature in self.search_keys:
+ if self.search_keys[original_cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
+ search_keys_for_metrics.append(original_cat_feature)
  else:
- self.logger.warning(self.bundle.get("cat_feature_search_key").format(cat_feature))
+ self.logger.warning(self.bundle.get("cat_feature_search_key").format(original_cat_feature))
  search_keys_for_metrics.extend([c for c in self.id_columns or [] if c not in search_keys_for_metrics])
  self.logger.info(f"Search keys for metrics: {search_keys_for_metrics}")

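Reviewer note: several hunks in this release hinge on the direction of fit_columns_renaming, which the new code reads as a map from internal hash-suffixed names back to the client's original names; the id-column filter above now translates each backend name before checking it against the encoder's feature_names_in_. A toy illustration of that lookup (all names hypothetical):

fit_columns_renaming = {"country_a1b2": "country"}  # internal -> original (direction as the new code assumes)
id_feature_names_in = ["country"]  # stand-in for id_columns_encoder.feature_names_in_

cat_features_from_backend = ["country_a1b2", "merchant_c3d4"]
filtered = [
    c for c in cat_features_from_backend
    if fit_columns_renaming.get(c, c) not in id_feature_names_in
]
print(filtered)  # ['merchant_c3d4'] -- the id column is excluded under its original name
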
@@ -1059,9 +1033,23 @@ class FeaturesEnricher(TransformerMixin):
  groups,
  _cv,
  columns_renaming,
- _,
  ) = prepared_data

+ # rename cat_features
+ if client_cat_features:
+ for new_c, old_c in columns_renaming.items():
+ if old_c in client_cat_features:
+ client_cat_features.remove(old_c)
+ client_cat_features.append(new_c)
+ for cat_feature in client_cat_features:
+ if cat_feature not in fitting_X.columns:
+ self.logger.error(
+ f"Client cat_feature `{cat_feature}` not found in"
+ f" x columns: {fitting_X.columns.to_list()}"
+ )
+ else:
+ client_cat_features = []
+
  # rename baseline_score_column
  reversed_renaming = {v: k for k, v in columns_renaming.items()}
  baseline_score_column = self.baseline_score_column
@@ -1086,9 +1074,9 @@ class FeaturesEnricher(TransformerMixin):
  self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)

  has_date = self._get_date_column(search_keys) is not None
+ has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
  model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
  cat_features = list(set(client_cat_features + cat_features_from_backend))
- has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
  baseline_cat_features = [f for f in cat_features if f in fitting_X.columns]
  enriched_cat_features = [f for f in cat_features if f in fitting_enriched_X.columns]
  if len(enriched_cat_features) < len(cat_features):
@@ -1208,6 +1196,8 @@ class FeaturesEnricher(TransformerMixin):
  # max_initial_eval_set_hit_rate = self._search_task.get_max_initial_eval_set_hit_rate_v2()
  if len(fitting_eval_set_dict) > 0:
  for idx in fitting_eval_set_dict.keys():
+ # eval_hit_rate = max_initial_eval_set_hit_rate[idx + 1]
+
  (
  eval_X_sorted,
  eval_y_sorted,
@@ -1215,10 +1205,6 @@ class FeaturesEnricher(TransformerMixin):
  enriched_eval_y_sorted,
  ) = fitting_eval_set_dict[idx]

- if eval_y_sorted.isna().all():
- # Skip OOT eval set
- continue
-
  if baseline_estimator is not None:
  self.logger.info(
  f"Calculate baseline {metric} on eval set {idx + 1} "
@@ -1261,14 +1247,17 @@ class FeaturesEnricher(TransformerMixin):
  "quality_metrics_eval_segment"
  ).format(idx + 1),
  self.bundle.get("quality_metrics_rows_header"): _num_samples(
+ # effective_eval_set[idx][0]
  # Use actually used for metrics dataset
  eval_X_sorted
  ),
+ # self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
  }
  if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
  eval_y_sorted
  ):
  eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
+ # np.mean(validated_eval_set[idx][1]), 4
  # Use actually used for metrics dataset
  np.mean(eval_y_sorted),
  4,
@@ -1290,7 +1279,7 @@ class FeaturesEnricher(TransformerMixin):
  metrics.append(eval_metrics)

  if updating_shaps is not None:
- decoded_X = self._decode_id_columns(fitting_X)
+ decoded_X = self._decode_id_columns(fitting_X, columns_renaming)
  self._update_shap_values(trace_id, decoded_X, updating_shaps, silent=not internal_call)

  metrics_df = pd.DataFrame(metrics)
@@ -1341,188 +1330,6 @@ class FeaturesEnricher(TransformerMixin):
  finally:
  self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")

- def _select_features_by_psi(
- self,
- trace_id: str,
- X: Union[pd.DataFrame, pd.Series, np.ndarray],
- y: Union[pd.DataFrame, pd.Series, np.ndarray, List],
- eval_set: Optional[Union[List[tuple], tuple]],
- stability_threshold: float,
- cv: Union[BaseCrossValidator, CVType, str, None] = None,
- estimator=None,
- exclude_features_sources: Optional[List[str]] = None,
- importance_threshold: Optional[float] = None,
- max_features: Optional[int] = None,
- progress_bar: bool = True,
- progress_callback: Optional[Callable] = None,
- ):
- search_keys = self.search_keys.copy()
- validated_X, _, validated_eval_set = self._validate_train_eval(X, y, eval_set, silent=True)
- if isinstance(X, np.ndarray):
- search_keys = {str(k): v for k, v in search_keys.items()}
-
- date_column = self._get_date_column(search_keys)
- has_date = date_column is not None
- if not has_date:
- self.logger.info("No date column for OOT PSI calculation")
- return
- if not validated_eval_set:
- self.logger.info("No eval set for OOT PSI calculation")
- return
- if validated_X[date_column].nunique() <= 1:
- self.logger.warning("Constant date for OOT PSI calculation")
- return
- if self.cv is not None and self.cv.is_time_series():
- self.logger.warning("Time series CV is not supported for OOT PSI calculation")
- return
-
- cat_features_from_backend = self.__get_categorical_features()
- cat_features_from_backend = [self.fit_columns_renaming.get(c, c) for c in cat_features_from_backend]
- client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
- estimator, validated_X, search_keys
- )
- if self.id_columns and self.id_columns_encoder is not None:
- if cat_features_from_backend:
- cat_features_from_backend = [
- c
- for c in cat_features_from_backend
- if c not in self.id_columns_encoder.feature_names_in_
- ]
- if client_cat_features:
- client_cat_features = [
- c
- for c in client_cat_features
- if c not in self.id_columns_encoder.feature_names_in_
- ]
-
- prepared_data = self._prepare_data_for_metrics(
- trace_id=trace_id,
- X=X,
- y=y,
- eval_set=eval_set,
- exclude_features_sources=exclude_features_sources,
- importance_threshold=importance_threshold,
- max_features=max_features,
- remove_outliers_calc_metrics=False,
- cv_override=cv,
- search_keys_for_metrics=search_keys_for_metrics,
- progress_bar=progress_bar,
- progress_callback=progress_callback,
- client_cat_features=client_cat_features,
- )
- if prepared_data is None:
- return None
-
- (
- validated_X,
- fitting_X,
- y_sorted,
- fitting_enriched_X,
- _,
- fitting_eval_set_dict,
- _,
- _,
- _,
- columns_renaming,
- eval_set_dates,
- ) = prepared_data
-
- model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
- cat_features = list(set(client_cat_features + cat_features_from_backend))
-
- # Drop unstable features
- unstable_features = self._check_stability(
- validated_X,
- validated_eval_set,
- fitting_eval_set_dict,
- eval_set_dates,
- search_keys,
- stability_threshold,
- cat_features,
- model_task_type,
- )
- client_features_df = self.df_with_original_index.rename(columns=columns_renaming)
- # decoded_X = self._decode_id_columns(fitting_X, columns_renaming)
- self._update_report_psi(trace_id, client_features_df)
-
- if unstable_features:
- msg = f"Some features are unstable: {unstable_features} and will be dropped"
- self.logger.warning(msg)
- print(msg)
- fitting_X = fitting_X.drop(columns=unstable_features, errors="ignore")
- fitting_enriched_X = fitting_enriched_X.drop(columns=unstable_features, errors="ignore")
- msg = f"Threre are {len(fitting_enriched_X.columns)} stable selected features left"
- self.logger.info(msg)
- print(msg)
- for idx, (
- eval_X,
- eval_y,
- eval_enriched_X,
- eval_enriched_y,
- ) in fitting_eval_set_dict.items():
- eval_X = eval_X.drop(columns=unstable_features, errors="ignore")
- eval_enriched_X = eval_enriched_X.drop(columns=unstable_features, errors="ignore")
- fitting_eval_set_dict[idx] = (eval_X, eval_y, eval_enriched_X, eval_enriched_y)
-
- def _check_stability(
- self,
- X: pd.DataFrame,
- eval_set: List[Tuple[pd.DataFrame, pd.Series]],
- enriched_eval_set: Dict,
- eval_set_dates: Dict[int, pd.Series],
- search_keys: Dict[str, SearchKey],
- stability_threshold: float,
- cat_features: List[str],
- model_task_type: ModelTaskType,
- ) -> List[str]:
- # Find latest eval set or earliest if all eval sets are before train set
- date_column = self._get_date_column(search_keys)
-
- # Get minimum date from main dataset X
- main_min_date = X[date_column].min()
-
- # Find minimum date for each eval_set and compare with main dataset
- eval_dates = []
- for i, (eval_x, _) in enumerate(eval_set):
- if date_column in eval_x.columns:
- eval_min_date = eval_x[date_column].min()
- eval_max_date = eval_x[date_column].max()
- eval_dates.append((i, eval_min_date, eval_max_date))
-
- if not eval_dates:
- return []
-
- # Check if any eval_set has minimum date >= main dataset minimum date
- later_eval_sets = [(i, min_date, max_date) for i, min_date, max_date in eval_dates if min_date >= main_min_date]
-
- if later_eval_sets:
- # If there are eval_sets with date >= main date, choose the one with highest maximum date
- selected_eval_set_idx = max(later_eval_sets, key=lambda x: x[2])[0]
- else:
- # If all eval_sets have dates < main date, choose the one with lowest minimux date
- selected_eval_set_idx = max(eval_dates, key=lambda x: x[1])[0]
-
- checking_eval_set = enriched_eval_set[selected_eval_set_idx]
-
- checking_eval_set_df = (
- checking_eval_set[2]
- if checking_eval_set[1] is None or checking_eval_set[1].isna().all()
- else pd.concat([checking_eval_set[2], checking_eval_set[1].to_frame(TARGET)], axis=1)
- )
- checking_eval_set_df = checking_eval_set_df.copy()
-
- checking_eval_set_df[date_column] = eval_set_dates[selected_eval_set_idx]
-
- psi_values = calculate_features_psi(
- checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
- )
-
- self.psi_values = {
- feature: psi_value for feature, psi_value in psi_values.items() if psi_value <= stability_threshold
- }
-
- return [feature for feature, psi in psi_values.items() if psi > stability_threshold]
-
  def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float], silent: bool = False):
  renaming = self.fit_columns_renaming or {}
  self.logger.info(f"Updating SHAP values: {new_shaps}")
@@ -1578,56 +1385,6 @@ class FeaturesEnricher(TransformerMixin):
  except (ImportError, NameError):
  pass

- def _update_report_psi(self, trace_id: str, clients_features_df: pd.DataFrame):
- self.__prepare_feature_importances(trace_id, clients_features_df)
-
- if self.features_info_display_handle is not None:
- try:
- _ = get_ipython() # type: ignore
-
- display_html_dataframe(
- self.features_info,
- self._features_info_without_links,
- self.bundle.get("relevant_features_header"),
- display_handle=self.features_info_display_handle,
- )
- except (ImportError, NameError):
- pass
-
- if self.data_sources_display_handle is not None:
- try:
- _ = get_ipython() # type: ignore
-
- display_html_dataframe(
- self.relevant_data_sources,
- self._relevant_data_sources_wo_links,
- self.bundle.get("relevant_data_sources_header"),
- display_handle=self.data_sources_display_handle,
- )
- except (ImportError, NameError):
- pass
-
- if self.autofe_features_display_handle is not None:
- try:
- _ = get_ipython() # type: ignore
- autofe_descriptions_df = self.get_autofe_features_description()
- if autofe_descriptions_df is not None:
- display_html_dataframe(
- df=autofe_descriptions_df,
- internal_df=autofe_descriptions_df,
- header=self.bundle.get("autofe_descriptions_header"),
- display_handle=self.autofe_features_display_handle,
- )
- except (ImportError, NameError):
- pass
- if self.report_button_handle is not None:
- try:
- _ = get_ipython() # type: ignore
-
- self.__show_report_button(display_handle=self.report_button_handle)
- except (ImportError, NameError):
- pass
-
  def _check_train_and_eval_target_distribution(self, y, eval_set_dict):
  uneven_distribution = False
  for eval_set in eval_set_dict.values():
@@ -1731,7 +1488,7 @@ class FeaturesEnricher(TransformerMixin):
  def _get_and_validate_client_cat_features(
  self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
  ) -> Tuple[Optional[List[str]], List[str]]:
- cat_features = []
+ cat_features = None
  search_keys_for_metrics = []
  if (
  estimator is not None
@@ -1778,7 +1535,7 @@ class FeaturesEnricher(TransformerMixin):
  is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
  is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
  checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
- validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set, silent=True)
+ validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set)

  sampled_data = self._get_enriched_for_metrics(
  trace_id,
@@ -1792,7 +1549,7 @@ class FeaturesEnricher(TransformerMixin):
  progress_bar,
  progress_callback,
  )
- (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming) = dataclasses.astuple(
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = dataclasses.astuple(
  sampled_data
  )

@@ -1815,7 +1572,8 @@ class FeaturesEnricher(TransformerMixin):
  or c in set(self.feature_names_).union(self.id_columns or [])
  or (self.fit_columns_renaming or {}).get(c, c) in set(self.feature_names_).union(self.id_columns or [])
  )
- and c not in (
+ and c
+ not in (
  excluding_search_keys
  + list(self.fit_dropped_features)
  + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
@@ -1900,7 +1658,7 @@ class FeaturesEnricher(TransformerMixin):
  fitting_X, y_sorted, search_keys, self.model_task_type, sort_all_columns=True, logger=self.logger
  )
  fitting_X = fitting_X[fitting_x_columns]
- fitting_X, _ = self._encode_id_columns(fitting_X)
+ fitting_X, _ = self._encode_id_columns(fitting_X, self.fit_columns_renaming)
  self.logger.info(f"Final sorted list of fitting X columns: {fitting_x_columns}")
  fitting_enriched_x_columns = fitting_enriched_X.columns.to_list()
  fitting_enriched_x_columns = sort_columns(
@@ -1912,18 +1670,14 @@ class FeaturesEnricher(TransformerMixin):
  logger=self.logger,
  )
  fitting_enriched_X = fitting_enriched_X[fitting_enriched_x_columns]
- fitting_enriched_X, _ = self._encode_id_columns(fitting_enriched_X)
+ fitting_enriched_X, _ = self._encode_id_columns(fitting_enriched_X, self.fit_columns_renaming)
  self.logger.info(f"Final sorted list of fitting enriched X columns: {fitting_enriched_x_columns}")
- date_column = self._get_date_column(search_keys)
- eval_set_dates = {}
  for idx, eval_tuple in eval_set_sampled_dict.items():
  eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
  eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
  enriched_eval_X_sorted, enriched_eval_y_sorted = self._sort_by_system_record_id(
  enriched_eval_X, eval_y_sampled, self.cv
  )
- if date_column is not None:
- eval_set_dates[idx] = eval_X_sorted[date_column]
  fitting_eval_X = eval_X_sorted[fitting_x_columns].copy()
  fitting_enriched_eval_X = enriched_eval_X_sorted[fitting_enriched_x_columns].copy()

@@ -1944,8 +1698,8 @@ class FeaturesEnricher(TransformerMixin):
  .astype(np.float64)
  )

- fitting_eval_X, unknown_dict = self._encode_id_columns(fitting_eval_X)
- fitting_enriched_eval_X, _ = self._encode_id_columns(fitting_enriched_eval_X)
+ fitting_eval_X, unknown_dict = self._encode_id_columns(fitting_eval_X, self.fit_columns_renaming)
+ fitting_enriched_eval_X, _ = self._encode_id_columns(fitting_enriched_eval_X, self.fit_columns_renaming)

  if len(unknown_dict) > 0:
  print(self.bundle.get("unknown_id_column_value_in_eval_set").format(unknown_dict))
@@ -1968,7 +1722,6 @@ class FeaturesEnricher(TransformerMixin):
  groups,
  cv,
  columns_renaming,
- eval_set_dates,
  )

  @dataclass
@@ -2131,16 +1884,14 @@ class FeaturesEnricher(TransformerMixin):
  remove_outliers_calc_metrics: Optional[bool],
  ) -> _EnrichedDataForMetrics:
  eval_set_sampled_dict = {}
- search_keys = self.fit_search_keys.copy()
+ search_keys = self.fit_search_keys

  rows_to_drop = None
  has_date = self._get_date_column(search_keys) is not None
  self.model_task_type = self.model_task_type or define_task(
  self.df_with_original_index[TARGET], has_date, self.logger, silent=True
  )
- if remove_outliers_calc_metrics is None:
- remove_outliers_calc_metrics = True
- if self.model_task_type == ModelTaskType.REGRESSION and remove_outliers_calc_metrics:
+ if self.model_task_type == ModelTaskType.REGRESSION:
  target_outliers_df = self._search_task.get_target_outliers(trace_id)
  if target_outliers_df is not None and len(target_outliers_df) > 0:
  outliers = pd.merge(
@@ -2150,8 +1901,11 @@ class FeaturesEnricher(TransformerMixin):
  how="inner",
  )
  top_outliers = outliers.sort_values(by=TARGET, ascending=False)[TARGET].head(3)
- rows_to_drop = outliers
- not_msg = ""
+ if remove_outliers_calc_metrics is None or remove_outliers_calc_metrics is True:
+ rows_to_drop = outliers
+ not_msg = ""
+ else:
+ not_msg = "not "
  msg = self.bundle.get("target_outliers_warning").format(len(target_outliers_df), top_outliers, not_msg)
  print(msg)
  self.logger.warning(msg)
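
Reviewer note: this hunk restores the tri-state meaning of remove_outliers_calc_metrics for regression tasks: None (the default) and True both drop target outliers before metrics, while an explicit False keeps them and only changes the warning text. A condensed sketch of the restored branching (hypothetical helper for illustration):

from typing import Optional

def outlier_rows_dropped(remove_outliers_calc_metrics: Optional[bool]) -> bool:
    # None and True drop outlier rows; only an explicit False keeps them,
    # in which case the warning message reads "not removed".
    return remove_outliers_calc_metrics is None or remove_outliers_calc_metrics is True

assert outlier_rows_dropped(None) and outlier_rows_dropped(True)
assert not outlier_rows_dropped(False)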
@@ -2209,13 +1963,12 @@ class FeaturesEnricher(TransformerMixin):
  enriched_eval_X = enriched_eval_sets[idx + 1][enriched_X_columns].copy()
  eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)

- # reversed_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
- X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
- enriched_X.rename(columns=self.fit_columns_renaming, inplace=True)
+ reversed_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
+ X_sampled.rename(columns=reversed_renaming, inplace=True)
+ enriched_X.rename(columns=reversed_renaming, inplace=True)
  for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
- eval_X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
- enriched_eval_X.rename(columns=self.fit_columns_renaming, inplace=True)
- search_keys = {self.fit_columns_renaming.get(k, k): v for k, v in search_keys.items()}
+ eval_X_sampled.rename(columns=reversed_renaming, inplace=True)
+ enriched_eval_X.rename(columns=reversed_renaming, inplace=True)

  datasets_hash = hash_input(self.X, self.y, self.eval_set)
  return self.__cache_and_return_results(
@@ -2359,7 +2112,7 @@ class FeaturesEnricher(TransformerMixin):

  def __extract_eval_data(
  self, enriched_df: pd.DataFrame, x_columns: List[str], enriched_X_columns: List[str], eval_set_len: int
- ) -> Tuple[Dict[int, Tuple], Dict[int, pd.Series]]:
+ ) -> Dict[int, Tuple]:
  eval_set_sampled_dict = {}

  for idx in range(eval_set_len):
@@ -2405,12 +2158,12 @@ class FeaturesEnricher(TransformerMixin):
  columns_renaming: Dict[str, str],
  ):
  # X_sampled - with hash-suffixes
- # reversed_renaming = {v: k for k, v in columns_renaming.items()}
- # search_keys = {
- # reversed_renaming.get(k, k): v
- # for k, v in search_keys.items()
- # if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
- # }
+ reversed_renaming = {v: k for k, v in columns_renaming.items()}
+ search_keys = {
+ reversed_renaming.get(k, k): v
+ for k, v in search_keys.items()
+ if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
+ }
  return FeaturesEnricher._EnrichedDataForMetrics(
  X_sampled=X_sampled,
  y_sampled=y_sampled,
@@ -2560,7 +2313,7 @@ if response.status_code == 200:
  self.logger.info("Start transform")

  validated_X, validated_y, validated_eval_set = self._validate_train_eval(
- X, y, eval_set=None, is_transform=True, silent=True
+ X, y, eval_set=None, is_transform=True
  )
  df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)

@@ -2662,7 +2415,7 @@ if response.status_code == 200:
  else:
  self.logger.info("Input dataset hasn't date column")
  if self.__should_add_date_column():
- df = self._add_current_date_as_key(df, search_keys, self.bundle, silent=True)
+ df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)

  email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
  if email_columns and self.generate_search_key_features:
@@ -2911,8 +2664,7 @@ if response.status_code == 200:
  selecting_columns = [
  c
  for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
- if (c not in self.zero_shap_client_features and c not in self.unstable_client_features)
- or c in (self.id_columns or [])
+ if c not in self.zero_shap_client_features or c in (self.id_columns or [])
  ]
  selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
  if add_fit_system_record_id:
@@ -3046,7 +2798,6 @@ if response.status_code == 200:
  scoring: Union[Callable, str, None],
  estimator: Optional[Any],
  importance_threshold: Optional[float],
- stability_threshold: float,
  max_features: Optional[int],
  remove_outliers_calc_metrics: Optional[bool],
  auto_fe_parameters: AutoFEParameters,
@@ -3061,7 +2812,6 @@ if response.status_code == 200:
  self.fit_columns_renaming = None
  self.fit_dropped_features = set()
  self.fit_generated_features = []
- self.psi_values = None

  validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, eval_set)

@@ -3158,7 +2908,7 @@ if response.status_code == 200:
  self.logger.info("Input dataset hasn't date column")
  # TODO remove when this logic will be implemented on the back
  if self.__should_add_date_column():
- df = self._add_current_date_as_key(df, self.fit_search_keys, self.bundle)
+ df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)

  email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
  if email_columns and self.generate_search_key_features:
@@ -3173,13 +2923,10 @@ if response.status_code == 200:
  except Exception:
  self.logger.exception("Failed to check dates distribution validity")

- self.__adjust_cv(df)
-
  if (
  is_numeric_dtype(df[self.TARGET_NAME])
  and self.model_task_type in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]
  and has_date
- and (self.cv is None or not self.cv.is_time_series())
  ):
  self._validate_PSI(df.sort_values(by=maybe_date_column))

@@ -3211,8 +2958,8 @@ if response.status_code == 200:

  self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]

- # Group columns should have normalized names
- self.__adjust_cv(df, force=True)
+ self.__adjust_cv(df)
+
  if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
  id_columns = self.__get_renamed_id_columns()
  if id_columns:
@@ -3417,21 +3164,6 @@ if response.status_code == 200:
  display_id=f"autofe_descriptions_{uuid.uuid4()}",
  )

- self._select_features_by_psi(
- trace_id=trace_id,
- X=X,
- y=y,
- eval_set=eval_set,
- stability_threshold=stability_threshold,
- cv=self.cv,
- estimator=estimator,
- exclude_features_sources=exclude_features_sources,
- importance_threshold=importance_threshold,
- max_features=max_features,
- progress_bar=progress_bar,
- progress_callback=progress_callback,
- )
-
  if self._has_paid_features(exclude_features_sources):
  if calculate_metrics is not None and calculate_metrics:
  msg = self.bundle.get("metrics_with_paid_features")
@@ -3517,21 +3249,19 @@ if response.status_code == 200:
  reverse_renaming = {v: k for k, v in renaming.items()}
  return None if self.id_columns is None else [reverse_renaming.get(c) or c for c in self.id_columns]

- def __adjust_cv(self, df: pd.DataFrame, force: bool = False):
- if self.cv is not None and not force:
- return
-
+ def __adjust_cv(self, df: pd.DataFrame):
  date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
  # Check Multivariate time series
  if (
- date_column
+ self.cv is None
+ and date_column
  and self.model_task_type == ModelTaskType.REGRESSION
  and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys())) == 0
  and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
  ):
  msg = self.bundle.get("multivariate_timeseries_detected")
  self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
- elif self.model_task_type != ModelTaskType.REGRESSION:
+ elif self.cv is None and self.model_task_type != ModelTaskType.REGRESSION:
  msg = self.bundle.get("group_k_fold_in_classification")
  self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
  group_columns = self._get_group_columns(df, self.fit_search_keys)
@@ -3559,42 +3289,48 @@ if response.status_code == 200:
  y: Optional[pd.Series] = None,
  eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]] = None,
  is_transform: bool = False,
- silent: bool = False,
  ) -> Tuple[pd.DataFrame, pd.Series, Optional[List[Tuple[pd.DataFrame, pd.Series]]]]:
  validated_X = self._validate_X(X, is_transform)
  validated_y = self._validate_y(validated_X, y, enforce_y=not is_transform)
- validated_eval_set = self._validate_eval_set(validated_X, eval_set, silent=silent)
+ validated_eval_set = self._validate_eval_set(validated_X, eval_set)
  return validated_X, validated_y, validated_eval_set

  def _encode_id_columns(
  self,
  X: pd.DataFrame,
+ columns_renaming: Optional[Dict[str, str]] = None,
  ) -> Tuple[pd.DataFrame, Dict[str, List[Any]]]:
+ columns_renaming = columns_renaming or {}
  unknown_dict = {}

  if self.id_columns and self.id_columns_encoder is not None:
- encoding_id_columns = [c for c in self.id_columns if c in X.columns]
- if len(encoding_id_columns) > 0:
- self.logger.info(f"Convert id columns to int: {encoding_id_columns}")
- encoded = self.id_columns_encoder.transform(X[encoding_id_columns])
- for i, c in enumerate(encoding_id_columns):
- unknown_values = X[encoded[:, i] == -1][c].unique().tolist()
- if len(unknown_values) > 0:
- unknown_dict[c] = unknown_values
- X[encoding_id_columns] = encoded
- X = X.loc[(X[encoding_id_columns] != -1).all(axis=1)]
-
- if len(unknown_dict) > 0:
- self.logger.warning(f"Unknown values in id columns: {unknown_dict}")
+ inverse_columns_renaming = {v: k for k, v in columns_renaming.items()}
+ renamed_id_columns = [
+ inverse_columns_renaming.get(col, col) for col in self.id_columns_encoder.feature_names_in_
+ ]
+ self.logger.info(f"Convert id columns to int: {renamed_id_columns}")
+ encoded = self.id_columns_encoder.transform(X[renamed_id_columns].rename(columns=columns_renaming))
+ for i, c in enumerate(renamed_id_columns):
+ unknown_values = X[encoded[:, i] == -1][c].unique().tolist()
+ if len(unknown_values) > 0:
+ unknown_dict[c] = unknown_values
+ X[renamed_id_columns] = encoded
+ X = X.loc[(X[renamed_id_columns] != -1).all(axis=1)]
+
+ if len(unknown_dict) > 0:
+ self.logger.warning(f"Unknown values in id columns: {unknown_dict}")

  return X, unknown_dict

- def _decode_id_columns(self, X: pd.DataFrame):
+ def _decode_id_columns(self, X: pd.DataFrame, columns_renaming: Dict[str, str]):
+ columns_renaming = columns_renaming or {}
  if self.id_columns and self.id_columns_encoder is not None:
- decoding_id_columns = [c for c in self.id_columns if c in X.columns]
- if len(decoding_id_columns) > 0:
- decoded = self.id_columns_encoder.inverse_transform(X[self.id_columns])
- X[self.id_columns] = decoded
+ inverse_columns_renaming = {v: k for k, v in columns_renaming.items()}
+ renamed_id_columns = [
+ inverse_columns_renaming.get(col, col) for col in self.id_columns_encoder.feature_names_in_
+ ]
+ decoded = self.id_columns_encoder.inverse_transform(X[renamed_id_columns].rename(columns=columns_renaming))
+ X[renamed_id_columns] = decoded

  return X

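Reviewer note: both id-column helpers now key off id_columns_encoder.feature_names_in_ plus a renaming map, instead of intersecting self.id_columns with X.columns. A self-contained sketch of the encode/drop-unknown round trip, assuming the encoder behaves like scikit-learn's OrdinalEncoder with unknown_value=-1 (an assumption; the diff only shows transform/inverse_transform and feature_names_in_):

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# Fitted on original (pre-rename) column names, as feature_names_in_ implies.
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
encoder.fit(pd.DataFrame({"store_id": ["a", "b", "c"]}))

# At metrics time, columns carry hash suffixes; the renaming map restores originals.
X = pd.DataFrame({"store_id_1a2b": ["b", "c", "z"]})
columns_renaming = {"store_id_1a2b": "store_id"}  # internal -> original (hypothetical names)
inverse = {v: k for k, v in columns_renaming.items()}
renamed_id_columns = [inverse.get(c, c) for c in encoder.feature_names_in_]

encoded = encoder.transform(X[renamed_id_columns].rename(columns=columns_renaming))
X[renamed_id_columns] = encoded  # "z" maps to -1 (unknown value)
X = X.loc[(X[renamed_id_columns] != -1).all(axis=1)]  # rows with unknown ids are dropped
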
@@ -3688,30 +3424,10 @@ if response.status_code == 200:

  return validated_y

- def _validate_eval_set(
- self, X: pd.DataFrame, eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]], silent: bool = False
- ):
+ def _validate_eval_set(self, X: pd.DataFrame, eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]]):
  if eval_set is None:
  return None
- validated_eval_set = []
- has_date = self._get_date_column(self.search_keys) is not None
- for idx, eval_pair in enumerate(eval_set):
- validated_pair = self._validate_eval_set_pair(X, eval_pair)
- if validated_pair[1].isna().all():
- if not has_date:
- msg = self.bundle.get("oot_without_date_not_supported").format(idx + 1)
- elif self.columns_for_online_api:
- msg = self.bundle.get("oot_with_online_sources_not_supported").format(idx + 1)
- else:
- msg = None
- if msg:
- if not silent:
- print(msg)
- self.logger.warning(msg)
- continue
- validated_eval_set.append(validated_pair)
-
- return validated_eval_set
+ return [self._validate_eval_set_pair(X, eval_pair) for eval_pair in eval_set]

  def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
  if len(eval_pair) != 2:
@@ -3786,18 +3502,16 @@ if response.status_code == 200:
  raise ValidationError(self.bundle.get("unsupported_y_type_eval_set").format(type(eval_y)))

  eval_y_nunique = validated_eval_y.nunique()
- is_oot = validated_eval_y.isna().all()
- if not is_oot and eval_y_nunique < 2:
+ if eval_y_nunique < 2:
  raise ValidationError(self.bundle.get("y_is_constant_eval_set"))

- if not is_oot and self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
+ if self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
  raise ValidationError(self.bundle.get("binary_target_eval_unique_count_not_2").format(eval_y_nunique))

- if not is_oot:
- # Check for duplicates between train and eval sets by comparing all values
- train_eval_intersection = pd.merge(X, validated_eval_X, how="inner")
- if len(train_eval_intersection) > 0:
- raise ValidationError(self.bundle.get("eval_x_has_train_samples"))
+ # Check for duplicates between train and eval sets by comparing all values
+ train_eval_intersection = pd.merge(X, validated_eval_X, how="inner")
+ if len(train_eval_intersection) > 0:
+ raise ValidationError(self.bundle.get("eval_x_has_train_samples"))

  return validated_eval_X, validated_eval_y

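Reviewer note: with the is_oot guards removed, every eval pair is validated unconditionally, so an all-NaN target now fails the constant-y check and the train/eval overlap check always runs. The overlap check is an inner merge across all shared columns; a small sketch of what now always raises eval_x_has_train_samples (toy frames):

import pandas as pd

X = pd.DataFrame({"f1": [1, 2, 3], "f2": ["a", "b", "c"]})
eval_X = pd.DataFrame({"f1": [3, 4], "f2": ["c", "d"]})

overlap = pd.merge(X, eval_X, how="inner")
print(len(overlap))  # 1 -> the enricher would raise ValidationError("eval_x_has_train_samples")
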
@@ -3813,12 +3527,10 @@ if response.status_code == 200:
  if isinstance(eval_set, tuple):
  eval_set = [eval_set]
  for eval in eval_set:
- is_oot = eval[1].isna().all()
- if not is_oot:
- if self.baseline_score_column not in eval[0].columns:
- raise ValidationError(self.bundle.get("baseline_score_column_not_exists"))
- if eval[0][self.baseline_score_column].isna().any():
- raise ValidationError(self.bundle.get("baseline_score_column_has_na"))
+ if self.baseline_score_column not in eval[0].columns:
+ raise ValidationError(self.bundle.get("baseline_score_column_not_exists"))
+ if eval[0][self.baseline_score_column].isna().any():
+ raise ValidationError(self.bundle.get("baseline_score_column_has_na"))

  @staticmethod
  def _sample_X_and_y(X: pd.DataFrame, y: pd.Series, enriched_X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
@@ -3992,7 +3704,7 @@ if response.status_code == 200:
  return df

  def _add_current_date_as_key(
- self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], bundle: ResourceBundle, silent: bool = False
+ self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
  ) -> pd.DataFrame:
  if (
  set(search_keys.values()) == {SearchKey.PHONE}
@@ -4000,8 +3712,7 @@ if response.status_code == 200:
  or set(search_keys.values()) == {SearchKey.HEM}
  or set(search_keys.values()) == {SearchKey.COUNTRY, SearchKey.POSTAL_CODE}
  ):
- if not silent:
- self.__log_warning(bundle.get("current_date_added"))
+ self.__log_warning(bundle.get("current_date_added"))
  df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
  search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
  converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, generate_cyclical_features=False)
@@ -4140,7 +3851,7 @@ if response.status_code == 200:
  columns_to_sort = [date_column] if date_column is not None else []

  do_sorting = True
- if self.id_columns and self.cv is not None and self.cv.is_time_series():
+ if self.id_columns and self.cv.is_time_series():
  # Check duplicates by date and id_columns
  reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
  renamed_id_columns = [reversed_columns_renaming.get(c, c) for c in self.id_columns]
@@ -4336,11 +4047,7 @@ if response.status_code == 200:
  return [f.name for f in features_meta if f.type == "categorical" and f.name not in (self.id_columns or [])]

  def __prepare_feature_importances(
- self,
- trace_id: str,
- clients_features_df: pd.DataFrame,
- updated_shaps: Optional[Dict[str, float]] = None,
- silent=False,
+ self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
  ):
  if self._search_task is None:
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
@@ -4353,12 +4060,11 @@ if response.status_code == 200:
  features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)

  # To be sure that names with hash suffixes
- clients_features_df = clients_features_df.rename(columns=original_names_dict)
+ df = df.rename(columns=original_names_dict)

  self.feature_names_ = []
  self.external_source_feature_names = []
  self.zero_shap_client_features = []
- self.unstable_client_features = []
  self.feature_importances_ = []
  features_info = []
  features_info_without_links = []
@@ -4367,10 +4073,10 @@ if response.status_code == 200:
  original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}

  for feature_meta in features_meta:
- original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
- feature_meta.name = original_name
+ if feature_meta.name in original_names_dict.keys():
+ feature_meta.name = original_names_dict[feature_meta.name]

- is_client_feature = original_name in clients_features_df.columns
+ is_client_feature = original_names_dict.get(feature_meta.name, feature_meta.name) in df.columns

  # Show and update shap values for client features only if select_features is True
  if updated_shaps is not None and (not is_client_feature or self.fit_select_features):
@@ -4387,21 +4093,12 @@ if response.status_code == 200:

  for feature_meta in features_meta:
  original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
- is_client_feature = original_name in clients_features_df.columns
+ is_client_feature = original_name in df.columns

  if not is_client_feature:
  self.external_source_feature_names.append(original_name)

- if self.psi_values is not None:
- if original_name in self.psi_values:
- feature_meta.psi_value = self.psi_values[original_name]
- else:
- if is_client_feature and self.fit_select_features:
- self.unstable_client_features.append(original_name)
- continue
-
  # TODO make a decision about selected features based on special flag from mlb
-
  if original_shaps.get(feature_meta.name, 0.0) == 0.0:
  if is_client_feature and self.fit_select_features:
  self.zero_shap_client_features.append(original_name)
@@ -4425,7 +4122,7 @@ if response.status_code == 200:
  self.feature_names_.append(feature_meta.name)
  self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))

- df_for_sample = features_df if feature_meta.name in features_df.columns else clients_features_df
+ df_for_sample = features_df if feature_meta.name in features_df.columns else df
  feature_info = FeatureInfo.from_metadata(feature_meta, df_for_sample, is_client_feature)
  features_info.append(feature_info.to_row(self.bundle))
  features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
@@ -4433,8 +4130,6 @@ if response.status_code == 200:

  if len(features_info) > 0:
  self.features_info = pd.DataFrame(features_info)
- if self.features_info[self.bundle.get("features_info_psi")].isna().all():
- self.features_info.drop(columns=[self.bundle.get("features_info_psi")], inplace=True)
  self._features_info_without_links = pd.DataFrame(features_info_without_links)
  self._internal_features_info = pd.DataFrame(internal_features_info)
  if not silent: