upgini 1.2.113a5__py3-none-any.whl → 1.2.113a3974.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -112,7 +112,6 @@ except Exception:
  CustomFallbackProgressBar as ProgressBar,
  )
 
- from upgini.utils.psi import calculate_features_psi, calculate_sparsity_psi
  from upgini.utils.sample_utils import SampleColumns, SampleConfig, _num_samples, sample
  from upgini.utils.sort import sort_columns
  from upgini.utils.target_utils import calculate_psi, define_task
@@ -298,9 +297,7 @@ class FeaturesEnricher(TransformerMixin):
  self.feature_names_ = []
  self.external_source_feature_names = []
  self.zero_shap_client_features = []
- self.unstable_client_features = []
  self.feature_importances_ = []
- self.psi_values: Optional[Dict[str, float]] = None
  self.search_id = search_id
  self.disable_force_downsampling = disable_force_downsampling
  self.print_trace_id = print_trace_id
@@ -401,26 +398,13 @@ class FeaturesEnricher(TransformerMixin):
  @staticmethod
  def _check_eval_set(eval_set, X, bundle: ResourceBundle):
  checked_eval_set = []
- if eval_set is None:
- return checked_eval_set
- if isinstance(eval_set, tuple):
+ if eval_set is not None and isinstance(eval_set, tuple):
  eval_set = [eval_set]
- if not isinstance(eval_set, list):
+ if eval_set is not None and not isinstance(eval_set, list):
  raise ValidationError(bundle.get("unsupported_type_eval_set").format(type(eval_set)))
  for eval_pair in eval_set or []:
- # Handle OOT
- if isinstance(eval_pair, pd.DataFrame):
- empty_target = pd.Series([np.nan] * len(eval_pair), index=eval_pair.index)
- eval_pair = (eval_pair, empty_target)
- elif isinstance(eval_pair, tuple) and len(eval_pair) == 1:
- empty_target = pd.Series([np.nan] * len(eval_pair[0]), index=eval_pair[0].index)
- eval_pair = (eval_pair[0], empty_target)
-
  if not isinstance(eval_pair, tuple) or len(eval_pair) != 2:
  raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
- if eval_pair[1] is None:
- empty_target = pd.Series([np.nan] * len(eval_pair[0]), index=eval_pair[0].index)
- eval_pair = (eval_pair[0], empty_target)
  if not is_frames_equal(X, eval_pair[0], bundle):
  checked_eval_set.append(eval_pair)
  return checked_eval_set
@@ -442,7 +426,6 @@ class FeaturesEnricher(TransformerMixin):
  search_id_callback: Optional[Callable[[str], Any]] = None,
  select_features: bool = True,
  auto_fe_parameters: Optional[AutoFEParameters] = None,
- stability_threshold: float = 0.15,
  **kwargs,
  ):
  """Fit to data.
@@ -532,7 +515,6 @@ class FeaturesEnricher(TransformerMixin):
  estimator=estimator,
  scoring=scoring,
  importance_threshold=importance_threshold,
- stability_threshold=stability_threshold,
  max_features=max_features,
  remove_outliers_calc_metrics=remove_outliers_calc_metrics,
  auto_fe_parameters=auto_fe_parameters,
@@ -592,7 +574,6 @@ class FeaturesEnricher(TransformerMixin):
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
  select_features: bool = True,
  auto_fe_parameters: Optional[AutoFEParameters] = None,
- stability_threshold: float = 0.15,
  **kwargs,
  ) -> pd.DataFrame:
  """Fit to data, then transform it.
@@ -637,10 +618,6 @@ class FeaturesEnricher(TransformerMixin):
  If True, return only selected features both from input and data sources.
  Otherwise, return all features from input and only selected features from data sources.
 
- stability_threshold: float, optional (default=0.15)
- Stability threshold for selected features PSI calculation. If PSI is less than this threshold,
- then feature will be dropped.
-
  Returns
  -------
  X_new: pandas.DataFrame of shape (n_samples, n_features_new)
@@ -697,7 +674,6 @@ class FeaturesEnricher(TransformerMixin):
  scoring=scoring,
  estimator=estimator,
  importance_threshold=importance_threshold,
- stability_threshold=stability_threshold,
  max_features=max_features,
  remove_outliers_calc_metrics=remove_outliers_calc_metrics,
  auto_fe_parameters=auto_fe_parameters,
@@ -965,7 +941,7 @@ class FeaturesEnricher(TransformerMixin):
  raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
 
  validated_X, validated_y, validated_eval_set = self._validate_train_eval(
- effective_X, effective_y, effective_eval_set, silent=internal_call
+ effective_X, effective_y, effective_eval_set
  )
 
  if self.X is None:
@@ -1003,31 +979,29 @@ class FeaturesEnricher(TransformerMixin):
  return None
 
  cat_features_from_backend = self.__get_categorical_features()
- # Convert to original names
- cat_features_from_backend = [self.fit_columns_renaming.get(c, c) for c in cat_features_from_backend]
  client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
  estimator, validated_X, self.search_keys
  )
- # Exclude id columns from cat_features
  if self.id_columns and self.id_columns_encoder is not None:
  if cat_features_from_backend:
  cat_features_from_backend = [
  c
  for c in cat_features_from_backend
- if c not in self.id_columns_encoder.feature_names_in_
+ if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
  ]
  if client_cat_features:
  client_cat_features = [
  c
  for c in client_cat_features
- if c not in self.id_columns_encoder.feature_names_in_
+ if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
  ]
  for cat_feature in cat_features_from_backend:
- if cat_feature in self.search_keys:
- if self.search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
- search_keys_for_metrics.append(cat_feature)
+ original_cat_feature = self.fit_columns_renaming.get(cat_feature)
+ if original_cat_feature in self.search_keys:
+ if self.search_keys[original_cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
+ search_keys_for_metrics.append(original_cat_feature)
  else:
- self.logger.warning(self.bundle.get("cat_feature_search_key").format(cat_feature))
+ self.logger.warning(self.bundle.get("cat_feature_search_key").format(original_cat_feature))
  search_keys_for_metrics.extend([c for c in self.id_columns or [] if c not in search_keys_for_metrics])
  self.logger.info(f"Search keys for metrics: {search_keys_for_metrics}")
 
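Editor's note: much of this diff hinges on which direction `fit_columns_renaming` maps. As the hunk above uses it, it maps internal (hash-suffixed) names back to the user's original names; the `reversed_renaming` idiom in later hunks inverts it. A toy illustration with hypothetical column names:

```python
# Hypothetical mapping: internal sanitized name -> original client name.
fit_columns_renaming = {"country_9b2c": "country", "postal_code_f4a1": "postal_code"}

# internal -> original, the `.get(c, c)` pattern used above:
assert fit_columns_renaming.get("country_9b2c", "country_9b2c") == "country"
assert fit_columns_renaming.get("target", "target") == "target"  # pass-through

# original -> internal, the reversed_renaming idiom from later hunks:
reversed_renaming = {v: k for k, v in fit_columns_renaming.items()}
assert reversed_renaming["postal_code"] == "postal_code_f4a1"
```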
@@ -1059,9 +1033,23 @@ class FeaturesEnricher(TransformerMixin):
  groups,
  _cv,
  columns_renaming,
- _,
  ) = prepared_data
 
+ # rename cat_features
+ if client_cat_features:
+ for new_c, old_c in columns_renaming.items():
+ if old_c in client_cat_features:
+ client_cat_features.remove(old_c)
+ client_cat_features.append(new_c)
+ for cat_feature in client_cat_features:
+ if cat_feature not in fitting_X.columns:
+ self.logger.error(
+ f"Client cat_feature `{cat_feature}` not found in"
+ f" x columns: {fitting_X.columns.to_list()}"
+ )
+ else:
+ client_cat_features = []
+
  # rename baseline_score_column
  reversed_renaming = {v: k for k, v in columns_renaming.items()}
  baseline_score_column = self.baseline_score_column
@@ -1086,9 +1074,9 @@ class FeaturesEnricher(TransformerMixin):
  self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
 
  has_date = self._get_date_column(search_keys) is not None
+ has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
  model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
  cat_features = list(set(client_cat_features + cat_features_from_backend))
- has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
  baseline_cat_features = [f for f in cat_features if f in fitting_X.columns]
  enriched_cat_features = [f for f in cat_features if f in fitting_enriched_X.columns]
  if len(enriched_cat_features) < len(cat_features):
@@ -1208,6 +1196,8 @@ class FeaturesEnricher(TransformerMixin):
  # max_initial_eval_set_hit_rate = self._search_task.get_max_initial_eval_set_hit_rate_v2()
  if len(fitting_eval_set_dict) > 0:
  for idx in fitting_eval_set_dict.keys():
+ # eval_hit_rate = max_initial_eval_set_hit_rate[idx + 1]
+
  (
  eval_X_sorted,
  eval_y_sorted,
@@ -1215,10 +1205,6 @@ class FeaturesEnricher(TransformerMixin):
  enriched_eval_y_sorted,
  ) = fitting_eval_set_dict[idx]
 
- if eval_y_sorted.isna().all():
- # Skip OOT eval set
- continue
-
  if baseline_estimator is not None:
  self.logger.info(
  f"Calculate baseline {metric} on eval set {idx + 1} "
@@ -1261,14 +1247,17 @@ class FeaturesEnricher(TransformerMixin):
  "quality_metrics_eval_segment"
  ).format(idx + 1),
  self.bundle.get("quality_metrics_rows_header"): _num_samples(
+ # effective_eval_set[idx][0]
  # Use actually used for metrics dataset
  eval_X_sorted
  ),
+ # self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
  }
  if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
  eval_y_sorted
  ):
  eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
+ # np.mean(validated_eval_set[idx][1]), 4
  # Use actually used for metrics dataset
  np.mean(eval_y_sorted),
  4,
@@ -1290,7 +1279,7 @@ class FeaturesEnricher(TransformerMixin):
  metrics.append(eval_metrics)
 
  if updating_shaps is not None:
- decoded_X = self._decode_id_columns(fitting_X)
+ decoded_X = self._decode_id_columns(fitting_X, columns_renaming)
  self._update_shap_values(trace_id, decoded_X, updating_shaps, silent=not internal_call)
 
  metrics_df = pd.DataFrame(metrics)
@@ -1341,202 +1330,6 @@ class FeaturesEnricher(TransformerMixin):
  finally:
  self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
 
- def _select_features_by_psi(
- self,
- trace_id: str,
- X: Union[pd.DataFrame, pd.Series, np.ndarray],
- y: Union[pd.DataFrame, pd.Series, np.ndarray, List],
- eval_set: Optional[Union[List[tuple], tuple]],
- stability_threshold: float,
- cv: Union[BaseCrossValidator, CVType, str, None] = None,
- estimator=None,
- exclude_features_sources: Optional[List[str]] = None,
- importance_threshold: Optional[float] = None,
- max_features: Optional[int] = None,
- progress_bar: bool = True,
- progress_callback: Optional[Callable] = None,
- ):
- search_keys = self.search_keys.copy()
- validated_X, _, validated_eval_set = self._validate_train_eval(X, y, eval_set, silent=True)
- if isinstance(X, np.ndarray):
- search_keys = {str(k): v for k, v in search_keys.items()}
-
- date_column = self._get_date_column(search_keys)
- has_date = date_column is not None
- if not has_date:
- self.logger.info("No date column for OOT PSI calculation")
- return
- if not validated_eval_set:
- self.logger.info("No eval set for OOT PSI calculation")
- return
- if validated_X[date_column].nunique() <= 1:
- self.logger.warning("Constant date for OOT PSI calculation")
- return
- if self.cv is not None and self.cv.is_time_series():
- self.logger.warning("Time series CV is not supported for OOT PSI calculation")
- return
-
- cat_features_from_backend = self.__get_categorical_features()
- cat_features_from_backend = [self.fit_columns_renaming.get(c, c) for c in cat_features_from_backend]
- client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
- estimator, validated_X, search_keys
- )
- if self.id_columns and self.id_columns_encoder is not None:
- if cat_features_from_backend:
- cat_features_from_backend = [
- c
- for c in cat_features_from_backend
- if c not in self.id_columns_encoder.feature_names_in_
- ]
- if client_cat_features:
- client_cat_features = [
- c
- for c in client_cat_features
- if c not in self.id_columns_encoder.feature_names_in_
- ]
-
- prepared_data = self._prepare_data_for_metrics(
- trace_id=trace_id,
- X=X,
- y=y,
- eval_set=eval_set,
- exclude_features_sources=exclude_features_sources,
- importance_threshold=importance_threshold,
- max_features=max_features,
- remove_outliers_calc_metrics=False,
- cv_override=cv,
- search_keys_for_metrics=search_keys_for_metrics,
- progress_bar=progress_bar,
- progress_callback=progress_callback,
- client_cat_features=client_cat_features,
- )
- if prepared_data is None:
- return None
-
- (
- validated_X,
- fitting_X,
- y_sorted,
- fitting_enriched_X,
- _,
- fitting_eval_set_dict,
- _,
- _,
- _,
- columns_renaming,
- eval_set_dates,
- ) = prepared_data
-
- model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
- cat_features = list(set(client_cat_features + cat_features_from_backend))
-
- # Drop unstable features
- unstable_features = self._check_stability(
- validated_X,
- validated_eval_set,
- fitting_eval_set_dict,
- eval_set_dates,
- search_keys,
- stability_threshold,
- cat_features,
- model_task_type,
- )
- client_features_df = self.df_with_original_index.rename(columns=columns_renaming)
- # decoded_X = self._decode_id_columns(fitting_X, columns_renaming)
- self._update_report_psi(trace_id, client_features_df)
-
- if unstable_features:
- msg = f"Some features are unstable: {unstable_features} and will be dropped"
- self.logger.warning(msg)
- print(msg)
- fitting_X = fitting_X.drop(columns=unstable_features, errors="ignore")
- fitting_enriched_X = fitting_enriched_X.drop(columns=unstable_features, errors="ignore")
- msg = f"Threre are {len(fitting_enriched_X.columns)} stable selected features left"
- self.logger.info(msg)
- print(msg)
- for idx, (
- eval_X,
- eval_y,
- eval_enriched_X,
- eval_enriched_y,
- ) in fitting_eval_set_dict.items():
- eval_X = eval_X.drop(columns=unstable_features, errors="ignore")
- eval_enriched_X = eval_enriched_X.drop(columns=unstable_features, errors="ignore")
- fitting_eval_set_dict[idx] = (eval_X, eval_y, eval_enriched_X, eval_enriched_y)
-
- def _check_stability(
- self,
- X: pd.DataFrame,
- eval_set: List[Tuple[pd.DataFrame, pd.Series]],
- enriched_eval_set: Dict,
- eval_set_dates: Dict[int, pd.Series],
- search_keys: Dict[str, SearchKey],
- stability_threshold: float,
- cat_features: List[str],
- model_task_type: ModelTaskType,
- ) -> List[str]:
- # Find latest eval set or earliest if all eval sets are before train set
- date_column = self._get_date_column(search_keys)
-
- # Get minimum date from main dataset X
- main_min_date = X[date_column].min()
-
- # Find minimum date for each eval_set and compare with main dataset
- eval_dates = []
- for i, (eval_x, _) in enumerate(eval_set):
- if date_column in eval_x.columns:
- eval_min_date = eval_x[date_column].min()
- eval_max_date = eval_x[date_column].max()
- eval_dates.append((i, eval_min_date, eval_max_date))
-
- if not eval_dates:
- return []
-
- # Check if any eval_set has minimum date >= main dataset minimum date
- later_eval_sets = [(i, min_date, max_date) for i, min_date, max_date in eval_dates if min_date >= main_min_date]
-
- if later_eval_sets:
- # If there are eval_sets with date >= main date, choose the one with highest maximum date
- selected_eval_set_idx = max(later_eval_sets, key=lambda x: x[2])[0]
- else:
- # If all eval_sets have dates < main date, choose the one with lowest minimux date
- selected_eval_set_idx = max(eval_dates, key=lambda x: x[1])[0]
-
- checking_eval_set = enriched_eval_set[selected_eval_set_idx]
-
- checking_eval_set_df = (
- checking_eval_set[2]
- if checking_eval_set[1] is None or checking_eval_set[1].isna().all()
- else pd.concat([checking_eval_set[2], checking_eval_set[1].to_frame(TARGET)], axis=1)
- )
- checking_eval_set_df = checking_eval_set_df.copy()
-
- checking_eval_set_df[date_column] = eval_set_dates[selected_eval_set_idx]
-
- psi_values_sparse = calculate_sparsity_psi(
- checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
- )
-
- unstable_by_sparsity = [feature for feature, psi in psi_values_sparse.items() if psi > stability_threshold]
- if unstable_by_sparsity:
- self.logger.info(f"Unstable by sparsity features: {sorted(unstable_by_sparsity)}")
-
- psi_values = calculate_features_psi(
- checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
- )
-
- unstable_by_value = [feature for feature, psi in psi_values.items() if psi > stability_threshold]
- if unstable_by_value:
- self.logger.info(f"Unstable by value features: {sorted(unstable_by_value)}")
-
- self.psi_values = {
- feature: psi_value for feature, psi_value in psi_values.items() if psi_value <= stability_threshold
- }
-
- total_unstable_features = sorted(set(unstable_by_sparsity + unstable_by_value))
-
- return total_unstable_features
-
  def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float], silent: bool = False):
  renaming = self.fit_columns_renaming or {}
  self.logger.info(f"Updating SHAP values: {new_shaps}")
@@ -1592,56 +1385,6 @@ class FeaturesEnricher(TransformerMixin):
  except (ImportError, NameError):
  pass
 
- def _update_report_psi(self, trace_id: str, clients_features_df: pd.DataFrame):
- self.__prepare_feature_importances(trace_id, clients_features_df)
-
- if self.features_info_display_handle is not None:
- try:
- _ = get_ipython() # type: ignore
-
- display_html_dataframe(
- self.features_info,
- self._features_info_without_links,
- self.bundle.get("relevant_features_header"),
- display_handle=self.features_info_display_handle,
- )
- except (ImportError, NameError):
- pass
-
- if self.data_sources_display_handle is not None:
- try:
- _ = get_ipython() # type: ignore
-
- display_html_dataframe(
- self.relevant_data_sources,
- self._relevant_data_sources_wo_links,
- self.bundle.get("relevant_data_sources_header"),
- display_handle=self.data_sources_display_handle,
- )
- except (ImportError, NameError):
- pass
-
- if self.autofe_features_display_handle is not None:
- try:
- _ = get_ipython() # type: ignore
- autofe_descriptions_df = self.get_autofe_features_description()
- if autofe_descriptions_df is not None:
- display_html_dataframe(
- df=autofe_descriptions_df,
- internal_df=autofe_descriptions_df,
- header=self.bundle.get("autofe_descriptions_header"),
- display_handle=self.autofe_features_display_handle,
- )
- except (ImportError, NameError):
- pass
- if self.report_button_handle is not None:
- try:
- _ = get_ipython() # type: ignore
-
- self.__show_report_button(display_handle=self.report_button_handle)
- except (ImportError, NameError):
- pass
-
  def _check_train_and_eval_target_distribution(self, y, eval_set_dict):
  uneven_distribution = False
  for eval_set in eval_set_dict.values():
@@ -1745,7 +1488,7 @@ class FeaturesEnricher(TransformerMixin):
  def _get_and_validate_client_cat_features(
  self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
  ) -> Tuple[Optional[List[str]], List[str]]:
- cat_features = []
+ cat_features = None
  search_keys_for_metrics = []
  if (
  estimator is not None
@@ -1792,7 +1535,7 @@ class FeaturesEnricher(TransformerMixin):
  is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
  is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
  checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
- validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set, silent=True)
+ validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set)
 
  sampled_data = self._get_enriched_for_metrics(
  trace_id,
@@ -1806,7 +1549,7 @@ class FeaturesEnricher(TransformerMixin):
  progress_bar,
  progress_callback,
  )
- (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming) = dataclasses.astuple(
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = dataclasses.astuple(
  sampled_data
  )
 
@@ -1829,7 +1572,8 @@ class FeaturesEnricher(TransformerMixin):
  or c in set(self.feature_names_).union(self.id_columns or [])
  or (self.fit_columns_renaming or {}).get(c, c) in set(self.feature_names_).union(self.id_columns or [])
  )
- and c not in (
+ and c
+ not in (
  excluding_search_keys
  + list(self.fit_dropped_features)
  + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
@@ -1914,7 +1658,7 @@ class FeaturesEnricher(TransformerMixin):
  fitting_X, y_sorted, search_keys, self.model_task_type, sort_all_columns=True, logger=self.logger
  )
  fitting_X = fitting_X[fitting_x_columns]
- fitting_X, _ = self._encode_id_columns(fitting_X)
+ fitting_X, _ = self._encode_id_columns(fitting_X, self.fit_columns_renaming)
  self.logger.info(f"Final sorted list of fitting X columns: {fitting_x_columns}")
  fitting_enriched_x_columns = fitting_enriched_X.columns.to_list()
  fitting_enriched_x_columns = sort_columns(
@@ -1926,18 +1670,14 @@ class FeaturesEnricher(TransformerMixin):
  logger=self.logger,
  )
  fitting_enriched_X = fitting_enriched_X[fitting_enriched_x_columns]
- fitting_enriched_X, _ = self._encode_id_columns(fitting_enriched_X)
+ fitting_enriched_X, _ = self._encode_id_columns(fitting_enriched_X, self.fit_columns_renaming)
  self.logger.info(f"Final sorted list of fitting enriched X columns: {fitting_enriched_x_columns}")
- date_column = self._get_date_column(search_keys)
- eval_set_dates = {}
  for idx, eval_tuple in eval_set_sampled_dict.items():
  eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
  eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
  enriched_eval_X_sorted, enriched_eval_y_sorted = self._sort_by_system_record_id(
  enriched_eval_X, eval_y_sampled, self.cv
  )
- if date_column is not None:
- eval_set_dates[idx] = eval_X_sorted[date_column]
  fitting_eval_X = eval_X_sorted[fitting_x_columns].copy()
  fitting_enriched_eval_X = enriched_eval_X_sorted[fitting_enriched_x_columns].copy()
 
@@ -1958,8 +1698,8 @@ class FeaturesEnricher(TransformerMixin):
  .astype(np.float64)
  )
 
- fitting_eval_X, unknown_dict = self._encode_id_columns(fitting_eval_X)
- fitting_enriched_eval_X, _ = self._encode_id_columns(fitting_enriched_eval_X)
+ fitting_eval_X, unknown_dict = self._encode_id_columns(fitting_eval_X, self.fit_columns_renaming)
+ fitting_enriched_eval_X, _ = self._encode_id_columns(fitting_enriched_eval_X, self.fit_columns_renaming)
 
  if len(unknown_dict) > 0:
  print(self.bundle.get("unknown_id_column_value_in_eval_set").format(unknown_dict))
@@ -1982,7 +1722,6 @@ class FeaturesEnricher(TransformerMixin):
  groups,
  cv,
  columns_renaming,
- eval_set_dates,
  )
 
  @dataclass
@@ -2145,16 +1884,14 @@ class FeaturesEnricher(TransformerMixin):
  remove_outliers_calc_metrics: Optional[bool],
  ) -> _EnrichedDataForMetrics:
  eval_set_sampled_dict = {}
- search_keys = self.fit_search_keys.copy()
+ search_keys = self.fit_search_keys
 
  rows_to_drop = None
  has_date = self._get_date_column(search_keys) is not None
  self.model_task_type = self.model_task_type or define_task(
  self.df_with_original_index[TARGET], has_date, self.logger, silent=True
  )
- if remove_outliers_calc_metrics is None:
- remove_outliers_calc_metrics = True
- if self.model_task_type == ModelTaskType.REGRESSION and remove_outliers_calc_metrics:
+ if self.model_task_type == ModelTaskType.REGRESSION:
  target_outliers_df = self._search_task.get_target_outliers(trace_id)
  if target_outliers_df is not None and len(target_outliers_df) > 0:
  outliers = pd.merge(
@@ -2164,8 +1901,11 @@ class FeaturesEnricher(TransformerMixin):
  how="inner",
  )
  top_outliers = outliers.sort_values(by=TARGET, ascending=False)[TARGET].head(3)
- rows_to_drop = outliers
- not_msg = ""
+ if remove_outliers_calc_metrics is None or remove_outliers_calc_metrics is True:
+ rows_to_drop = outliers
+ not_msg = ""
+ else:
+ not_msg = "not "
  msg = self.bundle.get("target_outliers_warning").format(len(target_outliers_df), top_outliers, not_msg)
  print(msg)
  self.logger.warning(msg)
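Editor's note: this hunk moves the resolution of `remove_outliers_calc_metrics` inside the regression branch; the old code defaulted `None` to `True` before checking the flag, while the new code always looks up outliers for regression and only decides inside whether rows are dropped or merely reported. A toy restatement of the resulting behavior (not upgini API):

```python
def outlier_handling(remove_outliers_calc_metrics) -> tuple:
    """Toy restatement of the branch above for REGRESSION tasks."""
    if remove_outliers_calc_metrics is None or remove_outliers_calc_metrics is True:
        return "drop rows", ""   # rows_to_drop = outliers; warning says "removed"
    return "keep rows", "not "   # warning reads "not removed"

assert outlier_handling(None) == ("drop rows", "")
assert outlier_handling(True) == ("drop rows", "")
assert outlier_handling(False) == ("keep rows", "not ")
```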
@@ -2223,13 +1963,12 @@ class FeaturesEnricher(TransformerMixin):
  enriched_eval_X = enriched_eval_sets[idx + 1][enriched_X_columns].copy()
  eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
 
- # reversed_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
- X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
- enriched_X.rename(columns=self.fit_columns_renaming, inplace=True)
+ reversed_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
+ X_sampled.rename(columns=reversed_renaming, inplace=True)
+ enriched_X.rename(columns=reversed_renaming, inplace=True)
  for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
- eval_X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
- enriched_eval_X.rename(columns=self.fit_columns_renaming, inplace=True)
- search_keys = {self.fit_columns_renaming.get(k, k): v for k, v in search_keys.items()}
+ eval_X_sampled.rename(columns=reversed_renaming, inplace=True)
+ enriched_eval_X.rename(columns=reversed_renaming, inplace=True)
 
  datasets_hash = hash_input(self.X, self.y, self.eval_set)
  return self.__cache_and_return_results(
@@ -2287,7 +2026,13 @@ class FeaturesEnricher(TransformerMixin):
  enriched_df, x_columns, enriched_X.columns.tolist(), len(eval_set) if has_eval_set else 0
  )
 
- search_keys = {columns_renaming.get(k, k): v for k, v in search_keys.items()}
+ # Add hash-suffixes because output of transform has original names
+ reversed_renaming = {v: k for k, v in columns_renaming.items()}
+ X_sampled.rename(columns=reversed_renaming, inplace=True)
+ enriched_X.rename(columns=reversed_renaming, inplace=True)
+ for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
+ eval_X_sampled.rename(columns=reversed_renaming, inplace=True)
+ enriched_eval_X.rename(columns=reversed_renaming, inplace=True)
 
  # Cache and return results
  datasets_hash = hash_input(validated_X, validated_y, eval_set)
@@ -2367,7 +2112,7 @@ class FeaturesEnricher(TransformerMixin):
 
  def __extract_eval_data(
  self, enriched_df: pd.DataFrame, x_columns: List[str], enriched_X_columns: List[str], eval_set_len: int
- ) -> Tuple[Dict[int, Tuple], Dict[int, pd.Series]]:
+ ) -> Dict[int, Tuple]:
  eval_set_sampled_dict = {}
 
  for idx in range(eval_set_len):
@@ -2413,12 +2158,12 @@ class FeaturesEnricher(TransformerMixin):
  columns_renaming: Dict[str, str],
  ):
  # X_sampled - with hash-suffixes
- # reversed_renaming = {v: k for k, v in columns_renaming.items()}
- # search_keys = {
- # reversed_renaming.get(k, k): v
- # for k, v in search_keys.items()
- # if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
- # }
+ reversed_renaming = {v: k for k, v in columns_renaming.items()}
+ search_keys = {
+ reversed_renaming.get(k, k): v
+ for k, v in search_keys.items()
+ if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
+ }
  return FeaturesEnricher._EnrichedDataForMetrics(
  X_sampled=X_sampled,
  y_sampled=y_sampled,
@@ -2568,7 +2313,7 @@ if response.status_code == 200:
  self.logger.info("Start transform")
 
  validated_X, validated_y, validated_eval_set = self._validate_train_eval(
- X, y, eval_set=None, is_transform=True, silent=True
+ X, y, eval_set=None, is_transform=True
  )
  df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
 
@@ -2670,7 +2415,7 @@ if response.status_code == 200:
  else:
  self.logger.info("Input dataset hasn't date column")
  if self.__should_add_date_column():
- df = self._add_current_date_as_key(df, search_keys, self.bundle, silent=True)
+ df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
 
  email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
  if email_columns and self.generate_search_key_features:
@@ -2919,8 +2664,7 @@ if response.status_code == 200:
  selecting_columns = [
  c
  for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
- if (c not in self.zero_shap_client_features and c not in self.unstable_client_features)
- or c in (self.id_columns or [])
+ if c not in self.zero_shap_client_features or c in (self.id_columns or [])
  ]
  selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
  if add_fit_system_record_id:
@@ -3054,7 +2798,6 @@ if response.status_code == 200:
  scoring: Union[Callable, str, None],
  estimator: Optional[Any],
  importance_threshold: Optional[float],
- stability_threshold: float,
  max_features: Optional[int],
  remove_outliers_calc_metrics: Optional[bool],
  auto_fe_parameters: AutoFEParameters,
@@ -3069,7 +2812,6 @@ if response.status_code == 200:
  self.fit_columns_renaming = None
  self.fit_dropped_features = set()
  self.fit_generated_features = []
- self.psi_values = None
 
  validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, eval_set)
 
@@ -3166,7 +2908,7 @@ if response.status_code == 200:
  self.logger.info("Input dataset hasn't date column")
  # TODO remove when this logic will be implemented on the back
  if self.__should_add_date_column():
- df = self._add_current_date_as_key(df, self.fit_search_keys, self.bundle)
+ df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
 
  email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
  if email_columns and self.generate_search_key_features:
@@ -3181,13 +2923,10 @@ if response.status_code == 200:
  except Exception:
  self.logger.exception("Failed to check dates distribution validity")
 
- self.__adjust_cv(df)
-
  if (
  is_numeric_dtype(df[self.TARGET_NAME])
  and self.model_task_type in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]
  and has_date
- and (self.cv is None or not self.cv.is_time_series())
  ):
  self._validate_PSI(df.sort_values(by=maybe_date_column))
 
@@ -3219,8 +2958,8 @@ if response.status_code == 200:
 
  self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
 
- # Group columns should have normalized names
- self.__adjust_cv(df, force=True)
+ self.__adjust_cv(df)
+
  if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
  id_columns = self.__get_renamed_id_columns()
  if id_columns:
@@ -3425,21 +3164,6 @@ if response.status_code == 200:
  display_id=f"autofe_descriptions_{uuid.uuid4()}",
  )
 
- self._select_features_by_psi(
- trace_id=trace_id,
- X=X,
- y=y,
- eval_set=eval_set,
- stability_threshold=stability_threshold,
- cv=self.cv,
- estimator=estimator,
- exclude_features_sources=exclude_features_sources,
- importance_threshold=importance_threshold,
- max_features=max_features,
- progress_bar=progress_bar,
- progress_callback=progress_callback,
- )
-
  if self._has_paid_features(exclude_features_sources):
  if calculate_metrics is not None and calculate_metrics:
  msg = self.bundle.get("metrics_with_paid_features")
@@ -3525,21 +3249,19 @@ if response.status_code == 200:
  reverse_renaming = {v: k for k, v in renaming.items()}
  return None if self.id_columns is None else [reverse_renaming.get(c) or c for c in self.id_columns]
 
- def __adjust_cv(self, df: pd.DataFrame, force: bool = False):
- if self.cv is not None and not force:
- return
-
+ def __adjust_cv(self, df: pd.DataFrame):
  date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
  # Check Multivariate time series
  if (
- date_column
+ self.cv is None
+ and date_column
  and self.model_task_type == ModelTaskType.REGRESSION
  and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys())) == 0
  and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
  ):
  msg = self.bundle.get("multivariate_timeseries_detected")
  self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
- elif self.model_task_type != ModelTaskType.REGRESSION:
+ elif self.cv is None and self.model_task_type != ModelTaskType.REGRESSION:
  msg = self.bundle.get("group_k_fold_in_classification")
  self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
  group_columns = self._get_group_columns(df, self.fit_search_keys)
@@ -3567,42 +3289,48 @@ if response.status_code == 200:
  y: Optional[pd.Series] = None,
  eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]] = None,
  is_transform: bool = False,
- silent: bool = False,
  ) -> Tuple[pd.DataFrame, pd.Series, Optional[List[Tuple[pd.DataFrame, pd.Series]]]]:
  validated_X = self._validate_X(X, is_transform)
  validated_y = self._validate_y(validated_X, y, enforce_y=not is_transform)
- validated_eval_set = self._validate_eval_set(validated_X, eval_set, silent=silent)
+ validated_eval_set = self._validate_eval_set(validated_X, eval_set)
  return validated_X, validated_y, validated_eval_set
 
  def _encode_id_columns(
  self,
  X: pd.DataFrame,
+ columns_renaming: Optional[Dict[str, str]] = None,
  ) -> Tuple[pd.DataFrame, Dict[str, List[Any]]]:
+ columns_renaming = columns_renaming or {}
  unknown_dict = {}
 
  if self.id_columns and self.id_columns_encoder is not None:
- encoding_id_columns = [c for c in self.id_columns if c in X.columns]
- if len(encoding_id_columns) > 0:
- self.logger.info(f"Convert id columns to int: {encoding_id_columns}")
- encoded = self.id_columns_encoder.transform(X[encoding_id_columns])
- for i, c in enumerate(encoding_id_columns):
- unknown_values = X[encoded[:, i] == -1][c].unique().tolist()
- if len(unknown_values) > 0:
- unknown_dict[c] = unknown_values
- X[encoding_id_columns] = encoded
- X = X.loc[(X[encoding_id_columns] != -1).all(axis=1)]
-
- if len(unknown_dict) > 0:
- self.logger.warning(f"Unknown values in id columns: {unknown_dict}")
+ inverse_columns_renaming = {v: k for k, v in columns_renaming.items()}
+ renamed_id_columns = [
+ inverse_columns_renaming.get(col, col) for col in self.id_columns_encoder.feature_names_in_
+ ]
+ self.logger.info(f"Convert id columns to int: {renamed_id_columns}")
+ encoded = self.id_columns_encoder.transform(X[renamed_id_columns].rename(columns=columns_renaming))
+ for i, c in enumerate(renamed_id_columns):
+ unknown_values = X[encoded[:, i] == -1][c].unique().tolist()
+ if len(unknown_values) > 0:
+ unknown_dict[c] = unknown_values
+ X[renamed_id_columns] = encoded
+ X = X.loc[(X[renamed_id_columns] != -1).all(axis=1)]
+
+ if len(unknown_dict) > 0:
+ self.logger.warning(f"Unknown values in id columns: {unknown_dict}")
 
  return X, unknown_dict
 
- def _decode_id_columns(self, X: pd.DataFrame):
+ def _decode_id_columns(self, X: pd.DataFrame, columns_renaming: Dict[str, str]):
+ columns_renaming = columns_renaming or {}
  if self.id_columns and self.id_columns_encoder is not None:
- decoding_id_columns = [c for c in self.id_columns if c in X.columns]
- if len(decoding_id_columns) > 0:
- decoded = self.id_columns_encoder.inverse_transform(X[self.id_columns])
- X[self.id_columns] = decoded
+ inverse_columns_renaming = {v: k for k, v in columns_renaming.items()}
+ renamed_id_columns = [
+ inverse_columns_renaming.get(col, col) for col in self.id_columns_encoder.feature_names_in_
+ ]
+ decoded = self.id_columns_encoder.inverse_transform(X[renamed_id_columns].rename(columns=columns_renaming))
+ X[renamed_id_columns] = decoded
 
  return X
 
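Editor's note: the reworked `_encode_id_columns` / `_decode_id_columns` pair maps hash-suffixed column names back to the original names the fitted encoder knows before calling `transform`. A minimal sketch of the same round-trip using sklearn's `OrdinalEncoder` and a hypothetical renaming dict (upgini's actual `id_columns_encoder` is constructed elsewhere in the class):

```python
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# Hypothetical mapping: internal (hash-suffixed) name -> original name.
columns_renaming = {"store_id_a1b2": "store_id"}
inverse_renaming = {v: k for k, v in columns_renaming.items()}

# Encoder fitted on the original column name, as feature_names_in_ implies.
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
encoder.fit(pd.DataFrame({"store_id": ["a", "b", "c"]}))

X = pd.DataFrame({"store_id_a1b2": ["b", "z", "a"]})  # "z" was never seen

# Select the internal columns matching what the encoder was fitted on,
# rename them back to original names just for transform().
renamed = [inverse_renaming.get(c, c) for c in encoder.feature_names_in_]
encoded = encoder.transform(X[renamed].rename(columns=columns_renaming))
X[renamed] = encoded
X = X.loc[(X[renamed] != -1).all(axis=1)]  # drop rows with unknown ids
print(X)  # the "z" row is removed; known ids are now integer codes
```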
@@ -3696,30 +3424,10 @@ if response.status_code == 200:
 
  return validated_y
 
- def _validate_eval_set(
- self, X: pd.DataFrame, eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]], silent: bool = False
- ):
+ def _validate_eval_set(self, X: pd.DataFrame, eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]]):
  if eval_set is None:
  return None
- validated_eval_set = []
- has_date = self._get_date_column(self.search_keys) is not None
- for idx, eval_pair in enumerate(eval_set):
- validated_pair = self._validate_eval_set_pair(X, eval_pair)
- if validated_pair[1].isna().all():
- if not has_date:
- msg = self.bundle.get("oot_without_date_not_supported").format(idx + 1)
- elif self.columns_for_online_api:
- msg = self.bundle.get("oot_with_online_sources_not_supported").format(idx + 1)
- else:
- msg = None
- if msg:
- if not silent:
- print(msg)
- self.logger.warning(msg)
- continue
- validated_eval_set.append(validated_pair)
-
- return validated_eval_set
+ return [self._validate_eval_set_pair(X, eval_pair) for eval_pair in eval_set]
 
  def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
  if len(eval_pair) != 2:
@@ -3794,18 +3502,16 @@ if response.status_code == 200:
  raise ValidationError(self.bundle.get("unsupported_y_type_eval_set").format(type(eval_y)))
 
  eval_y_nunique = validated_eval_y.nunique()
- is_oot = validated_eval_y.isna().all()
- if not is_oot and eval_y_nunique < 2:
+ if eval_y_nunique < 2:
  raise ValidationError(self.bundle.get("y_is_constant_eval_set"))
 
- if not is_oot and self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
+ if self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
  raise ValidationError(self.bundle.get("binary_target_eval_unique_count_not_2").format(eval_y_nunique))
 
- if not is_oot:
- # Check for duplicates between train and eval sets by comparing all values
- train_eval_intersection = pd.merge(X, validated_eval_X, how="inner")
- if len(train_eval_intersection) > 0:
- raise ValidationError(self.bundle.get("eval_x_has_train_samples"))
+ # Check for duplicates between train and eval sets by comparing all values
+ train_eval_intersection = pd.merge(X, validated_eval_X, how="inner")
+ if len(train_eval_intersection) > 0:
+ raise ValidationError(self.bundle.get("eval_x_has_train_samples"))
 
  return validated_eval_X, validated_eval_y
 
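Editor's note: the duplicate check above (now unconditional, since the OOT `is_oot` escape hatch is gone) is a row-level inner join across all shared columns. A small demonstration:

```python
import pandas as pd

train_X = pd.DataFrame({"f1": [1, 2, 3], "f2": ["a", "b", "c"]})
eval_X = pd.DataFrame({"f1": [3, 4], "f2": ["c", "d"]})

# With no `on=` argument, pd.merge joins on every shared column, so each
# result row is an eval row that exactly duplicates a train row.
overlap = pd.merge(train_X, eval_X, how="inner")
print(len(overlap))  # 1 -> the validator raises eval_x_has_train_samples
```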
@@ -3821,12 +3527,10 @@ if response.status_code == 200:
  if isinstance(eval_set, tuple):
  eval_set = [eval_set]
  for eval in eval_set:
- is_oot = eval[1].isna().all()
- if not is_oot:
- if self.baseline_score_column not in eval[0].columns:
- raise ValidationError(self.bundle.get("baseline_score_column_not_exists"))
- if eval[0][self.baseline_score_column].isna().any():
- raise ValidationError(self.bundle.get("baseline_score_column_has_na"))
+ if self.baseline_score_column not in eval[0].columns:
+ raise ValidationError(self.bundle.get("baseline_score_column_not_exists"))
+ if eval[0][self.baseline_score_column].isna().any():
+ raise ValidationError(self.bundle.get("baseline_score_column_has_na"))
 
  @staticmethod
  def _sample_X_and_y(X: pd.DataFrame, y: pd.Series, enriched_X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
@@ -4000,7 +3704,7 @@ if response.status_code == 200:
  return df
 
  def _add_current_date_as_key(
- self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], bundle: ResourceBundle, silent: bool = False
+ self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
  ) -> pd.DataFrame:
  if (
  set(search_keys.values()) == {SearchKey.PHONE}
@@ -4008,8 +3712,7 @@ if response.status_code == 200:
  or set(search_keys.values()) == {SearchKey.HEM}
  or set(search_keys.values()) == {SearchKey.COUNTRY, SearchKey.POSTAL_CODE}
  ):
- if not silent:
- self.__log_warning(bundle.get("current_date_added"))
+ self.__log_warning(bundle.get("current_date_added"))
  df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
  search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
  converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, generate_cyclical_features=False)
@@ -4148,7 +3851,7 @@ if response.status_code == 200:
  columns_to_sort = [date_column] if date_column is not None else []
 
  do_sorting = True
- if self.id_columns and self.cv is not None and self.cv.is_time_series():
+ if self.id_columns and self.cv.is_time_series():
  # Check duplicates by date and id_columns
  reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
  renamed_id_columns = [reversed_columns_renaming.get(c, c) for c in self.id_columns]
@@ -4344,11 +4047,7 @@ if response.status_code == 200:
  return [f.name for f in features_meta if f.type == "categorical" and f.name not in (self.id_columns or [])]
 
  def __prepare_feature_importances(
- self,
- trace_id: str,
- clients_features_df: pd.DataFrame,
- updated_shaps: Optional[Dict[str, float]] = None,
- silent=False,
+ self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
  ):
  if self._search_task is None:
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
@@ -4361,12 +4060,11 @@ if response.status_code == 200:
  features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
 
  # To be sure that names with hash suffixes
- clients_features_df = clients_features_df.rename(columns=original_names_dict)
+ df = df.rename(columns=original_names_dict)
 
  self.feature_names_ = []
  self.external_source_feature_names = []
  self.zero_shap_client_features = []
- self.unstable_client_features = []
  self.feature_importances_ = []
  features_info = []
  features_info_without_links = []
@@ -4375,10 +4073,10 @@ if response.status_code == 200:
  original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}
 
  for feature_meta in features_meta:
- original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
- feature_meta.name = original_name
+ if feature_meta.name in original_names_dict.keys():
+ feature_meta.name = original_names_dict[feature_meta.name]
 
- is_client_feature = original_name in clients_features_df.columns
+ is_client_feature = original_names_dict.get(feature_meta.name, feature_meta.name) in df.columns
 
  # Show and update shap values for client features only if select_features is True
  if updated_shaps is not None and (not is_client_feature or self.fit_select_features):
@@ -4395,21 +4093,12 @@ if response.status_code == 200:
 
  for feature_meta in features_meta:
  original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
- is_client_feature = original_name in clients_features_df.columns
+ is_client_feature = original_name in df.columns
 
  if not is_client_feature:
  self.external_source_feature_names.append(original_name)
 
- if self.psi_values is not None:
- if original_name in self.psi_values:
- feature_meta.psi_value = self.psi_values[original_name]
- else:
- if is_client_feature and self.fit_select_features:
- self.unstable_client_features.append(original_name)
- continue
-
  # TODO make a decision about selected features based on special flag from mlb
-
  if original_shaps.get(feature_meta.name, 0.0) == 0.0:
  if is_client_feature and self.fit_select_features:
  self.zero_shap_client_features.append(original_name)
@@ -4433,7 +4122,7 @@ if response.status_code == 200:
  self.feature_names_.append(feature_meta.name)
  self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
 
- df_for_sample = features_df if feature_meta.name in features_df.columns else clients_features_df
+ df_for_sample = features_df if feature_meta.name in features_df.columns else df
  feature_info = FeatureInfo.from_metadata(feature_meta, df_for_sample, is_client_feature)
  features_info.append(feature_info.to_row(self.bundle))
  features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
@@ -4441,8 +4130,6 @@ if response.status_code == 200:
 
  if len(features_info) > 0:
  self.features_info = pd.DataFrame(features_info)
- if self.features_info[self.bundle.get("features_info_psi")].isna().all():
- self.features_info.drop(columns=[self.bundle.get("features_info_psi")], inplace=True)
  self._features_info_without_links = pd.DataFrame(features_info_without_links)
  self._internal_features_info = pd.DataFrame(internal_features_info)
  if not silent: