upgini 1.2.113a3974.dev2__py3-none-any.whl → 1.2.114a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -112,6 +112,7 @@ except Exception:
     CustomFallbackProgressBar as ProgressBar,
 )
 
+from upgini.utils.psi import calculate_features_psi, calculate_sparsity_psi
 from upgini.utils.sample_utils import SampleColumns, SampleConfig, _num_samples, sample
 from upgini.utils.sort import sort_columns
 from upgini.utils.target_utils import calculate_psi, define_task
@@ -297,7 +298,9 @@ class FeaturesEnricher(TransformerMixin):
         self.feature_names_ = []
         self.external_source_feature_names = []
         self.zero_shap_client_features = []
+        self.unstable_client_features = []
         self.feature_importances_ = []
+        self.psi_values: Optional[Dict[str, float]] = None
         self.search_id = search_id
         self.disable_force_downsampling = disable_force_downsampling
         self.print_trace_id = print_trace_id
@@ -398,13 +401,26 @@ class FeaturesEnricher(TransformerMixin):
     @staticmethod
     def _check_eval_set(eval_set, X, bundle: ResourceBundle):
         checked_eval_set = []
-        if eval_set is not None and isinstance(eval_set, tuple):
+        if eval_set is None:
+            return checked_eval_set
+        if isinstance(eval_set, tuple):
             eval_set = [eval_set]
-        if eval_set is not None and not isinstance(eval_set, list):
+        if not isinstance(eval_set, list):
             raise ValidationError(bundle.get("unsupported_type_eval_set").format(type(eval_set)))
         for eval_pair in eval_set or []:
+            # Handle OOT
+            if isinstance(eval_pair, pd.DataFrame):
+                empty_target = pd.Series([np.nan] * len(eval_pair), index=eval_pair.index)
+                eval_pair = (eval_pair, empty_target)
+            elif isinstance(eval_pair, tuple) and len(eval_pair) == 1:
+                empty_target = pd.Series([np.nan] * len(eval_pair[0]), index=eval_pair[0].index)
+                eval_pair = (eval_pair[0], empty_target)
+
             if not isinstance(eval_pair, tuple) or len(eval_pair) != 2:
                 raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
+            if eval_pair[1] is None:
+                empty_target = pd.Series([np.nan] * len(eval_pair[0]), index=eval_pair[0].index)
+                eval_pair = (eval_pair[0], empty_target)
             if not is_frames_equal(X, eval_pair[0], bundle):
                 checked_eval_set.append(eval_pair)
         return checked_eval_set
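Note on the hunk above: `_check_eval_set` now normalizes several out-of-time (OOT) eval-set shapes into a uniform `(X, all-NaN target)` pair. A minimal sketch of the accepted forms (the frames below are made up for illustration):

```python
import numpy as np
import pandas as pd

eval_X = pd.DataFrame({"date": ["2024-01-01", "2024-01-02"], "f1": [1, 2]})
eval_y = pd.Series([0, 1])
oot_X = pd.DataFrame({"date": ["2024-02-01", "2024-02-02"], "f1": [3, 4]})

# Each OOT form below ends up as (oot_X, pd.Series of NaN) after _check_eval_set,
# so downstream code can always rely on 2-tuples:
eval_set = [
    (eval_X, eval_y),  # regular labelled eval set, unchanged
    oot_X,             # bare DataFrame: a NaN target is attached
    (oot_X,),          # 1-tuple: a NaN target is attached
    (oot_X, None),     # explicit None target: replaced with a NaN series
]
```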
@@ -426,6 +442,7 @@ class FeaturesEnricher(TransformerMixin):
         search_id_callback: Optional[Callable[[str], Any]] = None,
         select_features: bool = True,
         auto_fe_parameters: Optional[AutoFEParameters] = None,
+        stability_threshold: float = 0.15,
         **kwargs,
     ):
         """Fit to data.
@@ -515,6 +532,7 @@ class FeaturesEnricher(TransformerMixin):
             estimator=estimator,
             scoring=scoring,
             importance_threshold=importance_threshold,
+            stability_threshold=stability_threshold,
             max_features=max_features,
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
             auto_fe_parameters=auto_fe_parameters,
@@ -574,6 +592,7 @@ class FeaturesEnricher(TransformerMixin):
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
         select_features: bool = True,
         auto_fe_parameters: Optional[AutoFEParameters] = None,
+        stability_threshold: float = 0.15,
         **kwargs,
     ) -> pd.DataFrame:
         """Fit to data, then transform it.
@@ -618,6 +637,10 @@ class FeaturesEnricher(TransformerMixin):
             If True, return only selected features both from input and data sources.
             Otherwise, return all features from input and only selected features from data sources.
 
+        stability_threshold: float, optional (default=0.15)
+            PSI stability threshold for the selected features. A feature whose PSI on the
+            out-of-time eval set exceeds this threshold is considered unstable and is dropped.
+
         Returns
         -------
         X_new: pandas.DataFrame of shape (n_samples, n_features_new)
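Taken together with the OOT handling above, the new parameter is used like this. A usage sketch only: the data and search-key names are illustrative, and `fit` still performs a real search against the Upgini service, so this is not meant to run offline as-is:

```python
import pandas as pd
from upgini import FeaturesEnricher, SearchKey

train_X = pd.DataFrame({"order_date": ["2024-01-01", "2024-01-02"], "f1": [1, 2]})
train_y = pd.Series([0, 1])
oot_X = pd.DataFrame({"order_date": ["2024-03-01", "2024-03-02"], "f1": [3, 4]})

enricher = FeaturesEnricher(search_keys={"order_date": SearchKey.DATE})
enricher.fit(
    train_X,
    train_y,
    eval_set=[oot_X],          # a bare DataFrame is treated as an OOT segment
    stability_threshold=0.15,  # features whose PSI exceeds this are dropped as unstable
)
```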
@@ -674,6 +697,7 @@ class FeaturesEnricher(TransformerMixin):
             scoring=scoring,
             estimator=estimator,
             importance_threshold=importance_threshold,
+            stability_threshold=stability_threshold,
             max_features=max_features,
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
             auto_fe_parameters=auto_fe_parameters,
@@ -941,7 +965,7 @@ class FeaturesEnricher(TransformerMixin):
             raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
 
         validated_X, validated_y, validated_eval_set = self._validate_train_eval(
-            effective_X, effective_y, effective_eval_set
+            effective_X, effective_y, effective_eval_set, silent=internal_call
         )
 
         if self.X is None:
@@ -979,29 +1003,27 @@ class FeaturesEnricher(TransformerMixin):
             return None
 
         cat_features_from_backend = self.__get_categorical_features()
+        # Convert to original names
+        cat_features_from_backend = [self.fit_columns_renaming.get(c, c) for c in cat_features_from_backend]
         client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
             estimator, validated_X, self.search_keys
         )
+        # Exclude id columns from cat_features
         if self.id_columns and self.id_columns_encoder is not None:
             if cat_features_from_backend:
                 cat_features_from_backend = [
-                    c
-                    for c in cat_features_from_backend
-                    if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
+                    c for c in cat_features_from_backend if c not in self.id_columns_encoder.feature_names_in_
                 ]
             if client_cat_features:
                 client_cat_features = [
-                    c
-                    for c in client_cat_features
-                    if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
+                    c for c in client_cat_features if c not in self.id_columns_encoder.feature_names_in_
                 ]
         for cat_feature in cat_features_from_backend:
-            original_cat_feature = self.fit_columns_renaming.get(cat_feature)
-            if original_cat_feature in self.search_keys:
-                if self.search_keys[original_cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
-                    search_keys_for_metrics.append(original_cat_feature)
+            if cat_feature in self.search_keys:
+                if self.search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
+                    search_keys_for_metrics.append(cat_feature)
                 else:
-                    self.logger.warning(self.bundle.get("cat_feature_search_key").format(original_cat_feature))
+                    self.logger.warning(self.bundle.get("cat_feature_search_key").format(cat_feature))
         search_keys_for_metrics.extend([c for c in self.id_columns or [] if c not in search_keys_for_metrics])
         self.logger.info(f"Search keys for metrics: {search_keys_for_metrics}")
 
@@ -1033,23 +1055,9 @@ class FeaturesEnricher(TransformerMixin):
             groups,
             _cv,
             columns_renaming,
+            _,
         ) = prepared_data
 
-        # rename cat_features
-        if client_cat_features:
-            for new_c, old_c in columns_renaming.items():
-                if old_c in client_cat_features:
-                    client_cat_features.remove(old_c)
-                    client_cat_features.append(new_c)
-            for cat_feature in client_cat_features:
-                if cat_feature not in fitting_X.columns:
-                    self.logger.error(
-                        f"Client cat_feature `{cat_feature}` not found in"
-                        f" x columns: {fitting_X.columns.to_list()}"
-                    )
-        else:
-            client_cat_features = []
-
         # rename baseline_score_column
         reversed_renaming = {v: k for k, v in columns_renaming.items()}
         baseline_score_column = self.baseline_score_column
@@ -1074,9 +1082,9 @@ class FeaturesEnricher(TransformerMixin):
         self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
 
         has_date = self._get_date_column(search_keys) is not None
-        has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
         model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
         cat_features = list(set(client_cat_features + cat_features_from_backend))
+        has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
         baseline_cat_features = [f for f in cat_features if f in fitting_X.columns]
         enriched_cat_features = [f for f in cat_features if f in fitting_enriched_X.columns]
         if len(enriched_cat_features) < len(cat_features):
@@ -1196,8 +1204,6 @@ class FeaturesEnricher(TransformerMixin):
         # max_initial_eval_set_hit_rate = self._search_task.get_max_initial_eval_set_hit_rate_v2()
         if len(fitting_eval_set_dict) > 0:
             for idx in fitting_eval_set_dict.keys():
-                # eval_hit_rate = max_initial_eval_set_hit_rate[idx + 1]
-
                 (
                     eval_X_sorted,
                     eval_y_sorted,
@@ -1205,6 +1211,10 @@ class FeaturesEnricher(TransformerMixin):
                     enriched_eval_X_sorted,
                     enriched_eval_y_sorted,
                 ) = fitting_eval_set_dict[idx]
 
+                if eval_y_sorted.isna().all():
+                    # Skip OOT eval set
+                    continue
+
                 if baseline_estimator is not None:
                     self.logger.info(
                         f"Calculate baseline {metric} on eval set {idx + 1} "
@@ -1247,17 +1257,14 @@ class FeaturesEnricher(TransformerMixin):
                         "quality_metrics_eval_segment"
                     ).format(idx + 1),
                     self.bundle.get("quality_metrics_rows_header"): _num_samples(
-                        # effective_eval_set[idx][0]
                         # Use the dataset actually used for metrics
                         eval_X_sorted
                     ),
-                    # self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
                 }
                 if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
                     eval_y_sorted
                 ):
                     eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
-                        # np.mean(validated_eval_set[idx][1]), 4
                         # Use the dataset actually used for metrics
                         np.mean(eval_y_sorted),
                         4,
@@ -1279,7 +1286,7 @@ class FeaturesEnricher(TransformerMixin):
                 metrics.append(eval_metrics)
 
         if updating_shaps is not None:
-            decoded_X = self._decode_id_columns(fitting_X, columns_renaming)
+            decoded_X = self._decode_id_columns(fitting_X)
             self._update_shap_values(trace_id, decoded_X, updating_shaps, silent=not internal_call)
 
         metrics_df = pd.DataFrame(metrics)
@@ -1330,6 +1337,198 @@ class FeaturesEnricher(TransformerMixin):
         finally:
             self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
 
+    def _select_features_by_psi(
+        self,
+        trace_id: str,
+        X: Union[pd.DataFrame, pd.Series, np.ndarray],
+        y: Union[pd.DataFrame, pd.Series, np.ndarray, List],
+        eval_set: Optional[Union[List[tuple], tuple]],
+        stability_threshold: float,
+        cv: Union[BaseCrossValidator, CVType, str, None] = None,
+        estimator=None,
+        exclude_features_sources: Optional[List[str]] = None,
+        importance_threshold: Optional[float] = None,
+        max_features: Optional[int] = None,
+        progress_bar: bool = True,
+        progress_callback: Optional[Callable] = None,
+    ):
+        search_keys = self.search_keys.copy()
+        validated_X, _, validated_eval_set = self._validate_train_eval(X, y, eval_set, silent=True)
+        if isinstance(X, np.ndarray):
+            search_keys = {str(k): v for k, v in search_keys.items()}
+
+        date_column = self._get_date_column(search_keys)
+        has_date = date_column is not None
+        if not has_date:
+            self.logger.info("No date column for OOT PSI calculation")
+            return
+        if not validated_eval_set:
+            self.logger.info("No eval set for OOT PSI calculation")
+            return
+        if validated_X[date_column].nunique() <= 1:
+            self.logger.warning("Constant date for OOT PSI calculation")
+            return
+        if self.cv is not None and self.cv.is_time_series():
+            self.logger.warning("Time series CV is not supported for OOT PSI calculation")
+            return
+
+        cat_features_from_backend = self.__get_categorical_features()
+        cat_features_from_backend = [self.fit_columns_renaming.get(c, c) for c in cat_features_from_backend]
+        client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
+            estimator, validated_X, search_keys
+        )
+        if self.id_columns and self.id_columns_encoder is not None:
+            if cat_features_from_backend:
+                cat_features_from_backend = [
+                    c for c in cat_features_from_backend if c not in self.id_columns_encoder.feature_names_in_
+                ]
+            if client_cat_features:
+                client_cat_features = [
+                    c for c in client_cat_features if c not in self.id_columns_encoder.feature_names_in_
+                ]
+
+        prepared_data = self._prepare_data_for_metrics(
+            trace_id=trace_id,
+            X=X,
+            y=y,
+            eval_set=eval_set,
+            exclude_features_sources=exclude_features_sources,
+            importance_threshold=importance_threshold,
+            max_features=max_features,
+            remove_outliers_calc_metrics=False,
+            cv_override=cv,
+            search_keys_for_metrics=search_keys_for_metrics,
+            progress_bar=progress_bar,
+            progress_callback=progress_callback,
+            client_cat_features=client_cat_features,
+        )
+        if prepared_data is None:
+            return None
+
+        (
+            validated_X,
+            fitting_X,
+            y_sorted,
+            fitting_enriched_X,
+            _,
+            fitting_eval_set_dict,
+            _,
+            _,
+            _,
+            columns_renaming,
+            eval_set_dates,
+        ) = prepared_data
+
+        model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
+        cat_features = list(set(client_cat_features + cat_features_from_backend))
+
+        # Drop unstable features
+        unstable_features = self._check_stability(
+            validated_X,
+            validated_eval_set,
+            fitting_eval_set_dict,
+            eval_set_dates,
+            search_keys,
+            stability_threshold,
+            cat_features,
+            model_task_type,
+        )
+        client_features_df = self.df_with_original_index.rename(columns=columns_renaming)
+        # decoded_X = self._decode_id_columns(fitting_X, columns_renaming)
+        self._update_report_psi(trace_id, client_features_df)
+
+        if unstable_features:
+            msg = f"Some features are unstable and will be dropped: {unstable_features}"
+            self.logger.warning(msg)
+            print(msg)
+            fitting_X = fitting_X.drop(columns=unstable_features, errors="ignore")
+            fitting_enriched_X = fitting_enriched_X.drop(columns=unstable_features, errors="ignore")
+            msg = f"There are {len(fitting_enriched_X.columns)} stable selected features left"
+            self.logger.info(msg)
+            print(msg)
+            for idx, (
+                eval_X,
+                eval_y,
+                eval_enriched_X,
+                eval_enriched_y,
+            ) in fitting_eval_set_dict.items():
+                eval_X = eval_X.drop(columns=unstable_features, errors="ignore")
+                eval_enriched_X = eval_enriched_X.drop(columns=unstable_features, errors="ignore")
+                fitting_eval_set_dict[idx] = (eval_X, eval_y, eval_enriched_X, eval_enriched_y)
+
+    def _check_stability(
+        self,
+        X: pd.DataFrame,
+        eval_set: List[Tuple[pd.DataFrame, pd.Series]],
+        enriched_eval_set: Dict,
+        eval_set_dates: Dict[int, pd.Series],
+        search_keys: Dict[str, SearchKey],
+        stability_threshold: float,
+        cat_features: List[str],
+        model_task_type: ModelTaskType,
+    ) -> List[str]:
+        # Pick the eval set to check PSI on: the latest one, or the one closest
+        # to the train period if all eval sets precede it
+        date_column = self._get_date_column(search_keys)
+
+        # Get minimum date from main dataset X
+        main_min_date = X[date_column].min()
+
+        # Find minimum date for each eval_set and compare with main dataset
+        eval_dates = []
+        for i, (eval_x, _) in enumerate(eval_set):
+            if date_column in eval_x.columns:
+                eval_min_date = eval_x[date_column].min()
+                eval_max_date = eval_x[date_column].max()
+                eval_dates.append((i, eval_min_date, eval_max_date))
+
+        if not eval_dates:
+            return []
+
+        # Check if any eval_set has minimum date >= main dataset minimum date
+        later_eval_sets = [(i, min_date, max_date) for i, min_date, max_date in eval_dates if min_date >= main_min_date]
+
+        if later_eval_sets:
+            # If there are eval_sets with date >= main date, choose the one with the highest maximum date
+            selected_eval_set_idx = max(later_eval_sets, key=lambda x: x[2])[0]
+        else:
+            # If all eval_sets have dates < main date, choose the one with the highest minimum date
+            selected_eval_set_idx = max(eval_dates, key=lambda x: x[1])[0]
+
+        checking_eval_set = enriched_eval_set[selected_eval_set_idx]
+
+        checking_eval_set_df = (
+            checking_eval_set[2]
+            if checking_eval_set[1] is None or checking_eval_set[1].isna().all()
+            else pd.concat([checking_eval_set[2], checking_eval_set[1].to_frame(TARGET)], axis=1)
+        )
+        checking_eval_set_df = checking_eval_set_df.copy()
+
+        checking_eval_set_df[date_column] = eval_set_dates[selected_eval_set_idx]
+
+        psi_values_sparse = calculate_sparsity_psi(
+            checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
+        )
+
+        unstable_by_sparsity = [feature for feature, psi in psi_values_sparse.items() if psi > stability_threshold]
+        if unstable_by_sparsity:
+            self.logger.info(f"Unstable by sparsity features: {sorted(unstable_by_sparsity)}")
+
+        psi_values = calculate_features_psi(
+            checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
+        )
+
+        unstable_by_value = [feature for feature, psi in psi_values.items() if psi > stability_threshold]
+        if unstable_by_value:
+            self.logger.info(f"Unstable by value features: {sorted(unstable_by_value)}")
+
+        self.psi_values = {
+            feature: psi_value for feature, psi_value in psi_values.items() if psi_value <= stability_threshold
+        }
+
+        total_unstable_features = sorted(set(unstable_by_sparsity + unstable_by_value))
+
+        return total_unstable_features
+
     def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float], silent: bool = False):
         renaming = self.fit_columns_renaming or {}
         self.logger.info(f"Updating SHAP values: {new_shaps}")
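For reference, `calculate_features_psi` and `calculate_sparsity_psi` come from the new `upgini.utils.psi` module, whose internals are not part of this diff. The standard Population Stability Index they presumably follow is sketched below; this is an illustration of the metric, not upgini's actual implementation:

```python
import numpy as np
import pandas as pd

def psi(expected: pd.Series, actual: pd.Series, bins: int = 10, eps: float = 1e-6) -> float:
    """PSI between a reference sample and a comparison sample, binned by reference quantiles."""
    expected = expected.dropna()
    actual = actual.dropna()
    if len(expected) == 0 or len(actual) == 0:
        return 0.0
    edges = np.unique(np.quantile(expected, np.linspace(0, 1, bins + 1)))
    if len(edges) < 2:  # constant feature: nothing to compare
        return 0.0
    e_frac = np.histogram(expected, bins=edges)[0] / len(expected)
    a_frac = np.histogram(actual, bins=edges)[0] / len(actual)
    e_frac = np.clip(e_frac, eps, None)
    a_frac = np.clip(a_frac, eps, None)
    return float(np.sum((a_frac - e_frac) * np.log(a_frac / e_frac)))
```

In `_check_stability` above, a feature is flagged unstable when either its value PSI or its sparsity PSI exceeds `stability_threshold`; only the stable features' PSI values are kept in `self.psi_values`.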
@@ -1385,6 +1584,56 @@ class FeaturesEnricher(TransformerMixin):
         except (ImportError, NameError):
             pass
 
+    def _update_report_psi(self, trace_id: str, clients_features_df: pd.DataFrame):
+        self.__prepare_feature_importances(trace_id, clients_features_df)
+
+        if self.features_info_display_handle is not None:
+            try:
+                _ = get_ipython()  # type: ignore
+
+                display_html_dataframe(
+                    self.features_info,
+                    self._features_info_without_links,
+                    self.bundle.get("relevant_features_header"),
+                    display_handle=self.features_info_display_handle,
+                )
+            except (ImportError, NameError):
+                pass
+
+        if self.data_sources_display_handle is not None:
+            try:
+                _ = get_ipython()  # type: ignore
+
+                display_html_dataframe(
+                    self.relevant_data_sources,
+                    self._relevant_data_sources_wo_links,
+                    self.bundle.get("relevant_data_sources_header"),
+                    display_handle=self.data_sources_display_handle,
+                )
+            except (ImportError, NameError):
+                pass
+
+        if self.autofe_features_display_handle is not None:
+            try:
+                _ = get_ipython()  # type: ignore
+                autofe_descriptions_df = self.get_autofe_features_description()
+                if autofe_descriptions_df is not None:
+                    display_html_dataframe(
+                        df=autofe_descriptions_df,
+                        internal_df=autofe_descriptions_df,
+                        header=self.bundle.get("autofe_descriptions_header"),
+                        display_handle=self.autofe_features_display_handle,
+                    )
+            except (ImportError, NameError):
+                pass
+        if self.report_button_handle is not None:
+            try:
+                _ = get_ipython()  # type: ignore
+
+                self.__show_report_button(display_handle=self.report_button_handle)
+            except (ImportError, NameError):
+                pass
+
     def _check_train_and_eval_target_distribution(self, y, eval_set_dict):
         uneven_distribution = False
         for eval_set in eval_set_dict.values():
@@ -1488,7 +1737,7 @@ class FeaturesEnricher(TransformerMixin):
     def _get_and_validate_client_cat_features(
         self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
     ) -> Tuple[Optional[List[str]], List[str]]:
-        cat_features = None
+        cat_features = []
        search_keys_for_metrics = []
         if (
             estimator is not None
@@ -1535,7 +1784,7 @@ class FeaturesEnricher(TransformerMixin):
         is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
         is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
         checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
-        validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set)
+        validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set, silent=True)
 
         sampled_data = self._get_enriched_for_metrics(
             trace_id,
@@ -1549,7 +1798,7 @@ class FeaturesEnricher(TransformerMixin):
             progress_bar,
             progress_callback,
         )
-        X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = dataclasses.astuple(
+        (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming) = dataclasses.astuple(
             sampled_data
         )
 
@@ -1658,7 +1907,7 @@ class FeaturesEnricher(TransformerMixin):
             fitting_X, y_sorted, search_keys, self.model_task_type, sort_all_columns=True, logger=self.logger
         )
         fitting_X = fitting_X[fitting_x_columns]
-        fitting_X, _ = self._encode_id_columns(fitting_X, self.fit_columns_renaming)
+        fitting_X, _ = self._encode_id_columns(fitting_X)
         self.logger.info(f"Final sorted list of fitting X columns: {fitting_x_columns}")
         fitting_enriched_x_columns = fitting_enriched_X.columns.to_list()
         fitting_enriched_x_columns = sort_columns(
@@ -1670,14 +1919,18 @@ class FeaturesEnricher(TransformerMixin):
             logger=self.logger,
         )
         fitting_enriched_X = fitting_enriched_X[fitting_enriched_x_columns]
-        fitting_enriched_X, _ = self._encode_id_columns(fitting_enriched_X, self.fit_columns_renaming)
+        fitting_enriched_X, _ = self._encode_id_columns(fitting_enriched_X)
         self.logger.info(f"Final sorted list of fitting enriched X columns: {fitting_enriched_x_columns}")
+        date_column = self._get_date_column(search_keys)
+        eval_set_dates = {}
         for idx, eval_tuple in eval_set_sampled_dict.items():
             eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
             eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
             enriched_eval_X_sorted, enriched_eval_y_sorted = self._sort_by_system_record_id(
                 enriched_eval_X, eval_y_sampled, self.cv
             )
+            if date_column is not None:
+                eval_set_dates[idx] = eval_X_sorted[date_column]
             fitting_eval_X = eval_X_sorted[fitting_x_columns].copy()
             fitting_enriched_eval_X = enriched_eval_X_sorted[fitting_enriched_x_columns].copy()
 
@@ -1698,8 +1951,8 @@ class FeaturesEnricher(TransformerMixin):
                 .astype(np.float64)
             )
 
-            fitting_eval_X, unknown_dict = self._encode_id_columns(fitting_eval_X, self.fit_columns_renaming)
-            fitting_enriched_eval_X, _ = self._encode_id_columns(fitting_enriched_eval_X, self.fit_columns_renaming)
+            fitting_eval_X, unknown_dict = self._encode_id_columns(fitting_eval_X)
+            fitting_enriched_eval_X, _ = self._encode_id_columns(fitting_enriched_eval_X)
 
             if len(unknown_dict) > 0:
                 print(self.bundle.get("unknown_id_column_value_in_eval_set").format(unknown_dict))
@@ -1722,6 +1975,7 @@ class FeaturesEnricher(TransformerMixin):
             groups,
             cv,
             columns_renaming,
+            eval_set_dates,
         )
 
     @dataclass
@@ -1884,14 +2138,16 @@ class FeaturesEnricher(TransformerMixin):
         remove_outliers_calc_metrics: Optional[bool],
     ) -> _EnrichedDataForMetrics:
         eval_set_sampled_dict = {}
-        search_keys = self.fit_search_keys
+        search_keys = self.fit_search_keys.copy()
 
         rows_to_drop = None
         has_date = self._get_date_column(search_keys) is not None
         self.model_task_type = self.model_task_type or define_task(
             self.df_with_original_index[TARGET], has_date, self.logger, silent=True
         )
-        if self.model_task_type == ModelTaskType.REGRESSION:
+        if remove_outliers_calc_metrics is None:
+            remove_outliers_calc_metrics = True
+        if self.model_task_type == ModelTaskType.REGRESSION and remove_outliers_calc_metrics:
             target_outliers_df = self._search_task.get_target_outliers(trace_id)
             if target_outliers_df is not None and len(target_outliers_df) > 0:
                 outliers = pd.merge(
@@ -1901,11 +2157,8 @@ class FeaturesEnricher(TransformerMixin):
                     how="inner",
                 )
                 top_outliers = outliers.sort_values(by=TARGET, ascending=False)[TARGET].head(3)
-                if remove_outliers_calc_metrics is None or remove_outliers_calc_metrics is True:
-                    rows_to_drop = outliers
-                    not_msg = ""
-                else:
-                    not_msg = "not "
+                rows_to_drop = outliers
+                not_msg = ""
                 msg = self.bundle.get("target_outliers_warning").format(len(target_outliers_df), top_outliers, not_msg)
                 print(msg)
                 self.logger.warning(msg)
@@ -1963,12 +2216,13 @@ class FeaturesEnricher(TransformerMixin):
                 enriched_eval_X = enriched_eval_sets[idx + 1][enriched_X_columns].copy()
                 eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
 
-        reversed_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
-        X_sampled.rename(columns=reversed_renaming, inplace=True)
-        enriched_X.rename(columns=reversed_renaming, inplace=True)
+        # reversed_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
+        X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
+        enriched_X.rename(columns=self.fit_columns_renaming, inplace=True)
         for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
-            eval_X_sampled.rename(columns=reversed_renaming, inplace=True)
-            enriched_eval_X.rename(columns=reversed_renaming, inplace=True)
+            eval_X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
+            enriched_eval_X.rename(columns=self.fit_columns_renaming, inplace=True)
+        search_keys = {self.fit_columns_renaming.get(k, k): v for k, v in search_keys.items()}
 
         datasets_hash = hash_input(self.X, self.y, self.eval_set)
         return self.__cache_and_return_results(
@@ -2026,13 +2280,7 @@ class FeaturesEnricher(TransformerMixin):
             enriched_df, x_columns, enriched_X.columns.tolist(), len(eval_set) if has_eval_set else 0
         )
 
-        # Add hash-suffixes because output of transform has original names
-        reversed_renaming = {v: k for k, v in columns_renaming.items()}
-        X_sampled.rename(columns=reversed_renaming, inplace=True)
-        enriched_X.rename(columns=reversed_renaming, inplace=True)
-        for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
-            eval_X_sampled.rename(columns=reversed_renaming, inplace=True)
-            enriched_eval_X.rename(columns=reversed_renaming, inplace=True)
+        search_keys = {columns_renaming.get(k, k): v for k, v in search_keys.items()}
 
         # Cache and return results
         datasets_hash = hash_input(validated_X, validated_y, eval_set)
@@ -2112,7 +2360,7 @@ class FeaturesEnricher(TransformerMixin):
 
     def __extract_eval_data(
         self, enriched_df: pd.DataFrame, x_columns: List[str], enriched_X_columns: List[str], eval_set_len: int
-    ) -> Dict[int, Tuple]:
+    ) -> Tuple[Dict[int, Tuple], Dict[int, pd.Series]]:
         eval_set_sampled_dict = {}
 
         for idx in range(eval_set_len):
@@ -2158,12 +2406,12 @@ class FeaturesEnricher(TransformerMixin):
         columns_renaming: Dict[str, str],
     ):
         # X_sampled - with hash-suffixes
-        reversed_renaming = {v: k for k, v in columns_renaming.items()}
-        search_keys = {
-            reversed_renaming.get(k, k): v
-            for k, v in search_keys.items()
-            if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
-        }
+        # reversed_renaming = {v: k for k, v in columns_renaming.items()}
+        # search_keys = {
+        #     reversed_renaming.get(k, k): v
+        #     for k, v in search_keys.items()
+        #     if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
+        # }
         return FeaturesEnricher._EnrichedDataForMetrics(
             X_sampled=X_sampled,
             y_sampled=y_sampled,
@@ -2313,7 +2561,7 @@ if response.status_code == 200:
         self.logger.info("Start transform")
 
         validated_X, validated_y, validated_eval_set = self._validate_train_eval(
-            X, y, eval_set=None, is_transform=True
+            X, y, eval_set=None, is_transform=True, silent=True
         )
         df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
 
@@ -2415,7 +2663,7 @@ if response.status_code == 200:
         else:
             self.logger.info("Input dataset hasn't date column")
         if self.__should_add_date_column():
-            df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
+            df = self._add_current_date_as_key(df, search_keys, self.bundle, silent=True)
 
         email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
         if email_columns and self.generate_search_key_features:
@@ -2664,7 +2912,8 @@ if response.status_code == 200:
         selecting_columns = [
             c
             for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
-            if c not in self.zero_shap_client_features or c in (self.id_columns or [])
+            if (c not in self.zero_shap_client_features and c not in self.unstable_client_features)
+            or c in (self.id_columns or [])
         ]
         selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
         if add_fit_system_record_id:
@@ -2798,6 +3047,7 @@ if response.status_code == 200:
         scoring: Union[Callable, str, None],
         estimator: Optional[Any],
         importance_threshold: Optional[float],
+        stability_threshold: float,
         max_features: Optional[int],
         remove_outliers_calc_metrics: Optional[bool],
         auto_fe_parameters: AutoFEParameters,
@@ -2812,6 +3062,7 @@ if response.status_code == 200:
         self.fit_columns_renaming = None
         self.fit_dropped_features = set()
         self.fit_generated_features = []
+        self.psi_values = None
 
         validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, eval_set)
 
@@ -2908,7 +3159,7 @@ if response.status_code == 200:
             self.logger.info("Input dataset hasn't date column")
             # TODO remove when this logic will be implemented on the back
             if self.__should_add_date_column():
-                df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
+                df = self._add_current_date_as_key(df, self.fit_search_keys, self.bundle)
 
         email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
         if email_columns and self.generate_search_key_features:
@@ -2923,10 +3174,13 @@ if response.status_code == 200:
         except Exception:
             self.logger.exception("Failed to check dates distribution validity")
 
+        self.__adjust_cv(df)
+
         if (
             is_numeric_dtype(df[self.TARGET_NAME])
             and self.model_task_type in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]
             and has_date
+            and (self.cv is None or not self.cv.is_time_series())
         ):
             self._validate_PSI(df.sort_values(by=maybe_date_column))
 
@@ -2958,7 +3212,15 @@ if response.status_code == 200:
 
         self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
 
-        self.__adjust_cv(df)
+        # Group columns should have normalized names
+        if self.runtime_parameters.properties.get("cv_params.group_columns") is not None:
+            original_to_hash = {v: k for k, v in self.fit_columns_renaming.items()}
+            self.runtime_parameters.properties["cv_params.group_columns"] = ",".join(
+                [
+                    original_to_hash.get(c, c)
+                    for c in self.runtime_parameters.properties["cv_params.group_columns"].split(",")
+                ]
+            )
 
         if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
             id_columns = self.__get_renamed_id_columns()
@@ -3164,6 +3426,21 @@ if response.status_code == 200:
             display_id=f"autofe_descriptions_{uuid.uuid4()}",
         )
 
+        self._select_features_by_psi(
+            trace_id=trace_id,
+            X=X,
+            y=y,
+            eval_set=eval_set,
+            stability_threshold=stability_threshold,
+            cv=self.cv,
+            estimator=estimator,
+            exclude_features_sources=exclude_features_sources,
+            importance_threshold=importance_threshold,
+            max_features=max_features,
+            progress_bar=progress_bar,
+            progress_callback=progress_callback,
+        )
+
         if self._has_paid_features(exclude_features_sources):
             if calculate_metrics is not None and calculate_metrics:
                 msg = self.bundle.get("metrics_with_paid_features")
@@ -3250,20 +3527,23 @@ if response.status_code == 200:
         return None if self.id_columns is None else [reverse_renaming.get(c) or c for c in self.id_columns]
 
     def __adjust_cv(self, df: pd.DataFrame):
-        date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
-        # Check Multivariate time series
-        if (
-            self.cv is None
-            and date_column
-            and self.model_task_type == ModelTaskType.REGRESSION
-            and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys())) == 0
-            and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
-        ):
-            msg = self.bundle.get("multivariate_timeseries_detected")
-            self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
-        elif self.cv is None and self.model_task_type != ModelTaskType.REGRESSION:
-            msg = self.bundle.get("group_k_fold_in_classification")
-            self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
+        if self.cv is None:
+            date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
+            # Check Multivariate time series
+            if (
+                date_column
+                and self.model_task_type == ModelTaskType.REGRESSION
+                and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys()))
+                == 0
+                and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
+            ):
+                msg = self.bundle.get("multivariate_timeseries_detected")
+                self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
+            elif self.model_task_type != ModelTaskType.REGRESSION:
+                msg = self.bundle.get("group_k_fold_in_classification")
+                self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
+
+        if self.cv == CVType.group_k_fold:
             group_columns = self._get_group_columns(df, self.fit_search_keys)
             self.runtime_parameters.properties["cv_params.group_columns"] = ",".join(group_columns)
             self.runtime_parameters.properties["cv_params.shuffle_kfold"] = "True"
@@ -3289,48 +3569,42 @@ if response.status_code == 200:
         y: Optional[pd.Series] = None,
         eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]] = None,
         is_transform: bool = False,
+        silent: bool = False,
     ) -> Tuple[pd.DataFrame, pd.Series, Optional[List[Tuple[pd.DataFrame, pd.Series]]]]:
         validated_X = self._validate_X(X, is_transform)
         validated_y = self._validate_y(validated_X, y, enforce_y=not is_transform)
-        validated_eval_set = self._validate_eval_set(validated_X, eval_set)
+        validated_eval_set = self._validate_eval_set(validated_X, eval_set, silent=silent)
         return validated_X, validated_y, validated_eval_set
 
     def _encode_id_columns(
         self,
         X: pd.DataFrame,
-        columns_renaming: Optional[Dict[str, str]] = None,
     ) -> Tuple[pd.DataFrame, Dict[str, List[Any]]]:
-        columns_renaming = columns_renaming or {}
         unknown_dict = {}
 
         if self.id_columns and self.id_columns_encoder is not None:
-            inverse_columns_renaming = {v: k for k, v in columns_renaming.items()}
-            renamed_id_columns = [
-                inverse_columns_renaming.get(col, col) for col in self.id_columns_encoder.feature_names_in_
-            ]
-            self.logger.info(f"Convert id columns to int: {renamed_id_columns}")
-            encoded = self.id_columns_encoder.transform(X[renamed_id_columns].rename(columns=columns_renaming))
-            for i, c in enumerate(renamed_id_columns):
-                unknown_values = X[encoded[:, i] == -1][c].unique().tolist()
-                if len(unknown_values) > 0:
-                    unknown_dict[c] = unknown_values
-            X[renamed_id_columns] = encoded
-            X = X.loc[(X[renamed_id_columns] != -1).all(axis=1)]
-
-        if len(unknown_dict) > 0:
-            self.logger.warning(f"Unknown values in id columns: {unknown_dict}")
+            encoding_id_columns = [c for c in self.id_columns if c in X.columns]
+            if len(encoding_id_columns) > 0:
+                self.logger.info(f"Convert id columns to int: {encoding_id_columns}")
+                encoded = self.id_columns_encoder.transform(X[encoding_id_columns])
+                for i, c in enumerate(encoding_id_columns):
+                    unknown_values = X[encoded[:, i] == -1][c].unique().tolist()
+                    if len(unknown_values) > 0:
+                        unknown_dict[c] = unknown_values
+                X[encoding_id_columns] = encoded
+                X = X.loc[(X[encoding_id_columns] != -1).all(axis=1)]
+
+                if len(unknown_dict) > 0:
+                    self.logger.warning(f"Unknown values in id columns: {unknown_dict}")
 
         return X, unknown_dict
 
-    def _decode_id_columns(self, X: pd.DataFrame, columns_renaming: Dict[str, str]):
-        columns_renaming = columns_renaming or {}
+    def _decode_id_columns(self, X: pd.DataFrame):
         if self.id_columns and self.id_columns_encoder is not None:
-            inverse_columns_renaming = {v: k for k, v in columns_renaming.items()}
-            renamed_id_columns = [
-                inverse_columns_renaming.get(col, col) for col in self.id_columns_encoder.feature_names_in_
-            ]
-            decoded = self.id_columns_encoder.inverse_transform(X[renamed_id_columns].rename(columns=columns_renaming))
-            X[renamed_id_columns] = decoded
+            decoding_id_columns = [c for c in self.id_columns if c in X.columns]
+            if len(decoding_id_columns) > 0:
+                decoded = self.id_columns_encoder.inverse_transform(X[self.id_columns])
+                X[self.id_columns] = decoded
 
         return X
 
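The `encoded[:, i] == -1` checks above imply an ordinal encoder that maps unseen categories to `-1`. A standalone sketch of an encoder configured that way (an assumption: `id_columns_encoder` is constructed elsewhere in the class and may be set up differently):

```python
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

train_ids = pd.DataFrame({"store_id": ["a", "b", "c"]})
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
encoder.fit(train_ids)

# Unseen ids encode to -1, which _encode_id_columns collects into unknown_dict
# and then filters out of X:
print(encoder.transform(pd.DataFrame({"store_id": ["b", "zzz"]})))  # [[ 1.] [-1.]]
```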
@@ -3424,10 +3698,30 @@ if response.status_code == 200:
 
         return validated_y
 
-    def _validate_eval_set(self, X: pd.DataFrame, eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]]):
+    def _validate_eval_set(
+        self, X: pd.DataFrame, eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]], silent: bool = False
+    ):
         if eval_set is None:
             return None
-        return [self._validate_eval_set_pair(X, eval_pair) for eval_pair in eval_set]
+        validated_eval_set = []
+        has_date = self._get_date_column(self.search_keys) is not None
+        for idx, eval_pair in enumerate(eval_set):
+            validated_pair = self._validate_eval_set_pair(X, eval_pair)
+            if validated_pair[1].isna().all():
+                if not has_date:
+                    msg = self.bundle.get("oot_without_date_not_supported").format(idx + 1)
+                elif self.columns_for_online_api:
+                    msg = self.bundle.get("oot_with_online_sources_not_supported").format(idx + 1)
+                else:
+                    msg = None
+                if msg:
+                    if not silent:
+                        print(msg)
+                    self.logger.warning(msg)
+                    continue
+            validated_eval_set.append(validated_pair)
+
+        return validated_eval_set
 
     def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
         if len(eval_pair) != 2:
3502
3796
  raise ValidationError(self.bundle.get("unsupported_y_type_eval_set").format(type(eval_y)))
3503
3797
 
3504
3798
  eval_y_nunique = validated_eval_y.nunique()
3505
- if eval_y_nunique < 2:
3799
+ is_oot = validated_eval_y.isna().all()
3800
+ if not is_oot and eval_y_nunique < 2:
3506
3801
  raise ValidationError(self.bundle.get("y_is_constant_eval_set"))
3507
3802
 
3508
- if self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
3803
+ if not is_oot and self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
3509
3804
  raise ValidationError(self.bundle.get("binary_target_eval_unique_count_not_2").format(eval_y_nunique))
3510
3805
 
3511
- # Check for duplicates between train and eval sets by comparing all values
3512
- train_eval_intersection = pd.merge(X, validated_eval_X, how="inner")
3513
- if len(train_eval_intersection) > 0:
3514
- raise ValidationError(self.bundle.get("eval_x_has_train_samples"))
3806
+ if not is_oot:
3807
+ # Check for duplicates between train and eval sets by comparing all values
3808
+ train_eval_intersection = pd.merge(X, validated_eval_X, how="inner")
3809
+ if len(train_eval_intersection) > 0:
3810
+ raise ValidationError(self.bundle.get("eval_x_has_train_samples"))
3515
3811
 
3516
3812
  return validated_eval_X, validated_eval_y
3517
3813
 
@@ -3527,10 +3823,12 @@ if response.status_code == 200:
3527
3823
  if isinstance(eval_set, tuple):
3528
3824
  eval_set = [eval_set]
3529
3825
  for eval in eval_set:
3530
- if self.baseline_score_column not in eval[0].columns:
3531
- raise ValidationError(self.bundle.get("baseline_score_column_not_exists"))
3532
- if eval[0][self.baseline_score_column].isna().any():
3533
- raise ValidationError(self.bundle.get("baseline_score_column_has_na"))
3826
+ is_oot = eval[1].isna().all()
3827
+ if not is_oot:
3828
+ if self.baseline_score_column not in eval[0].columns:
3829
+ raise ValidationError(self.bundle.get("baseline_score_column_not_exists"))
3830
+ if eval[0][self.baseline_score_column].isna().any():
3831
+ raise ValidationError(self.bundle.get("baseline_score_column_has_na"))
3534
3832
 
3535
3833
  @staticmethod
3536
3834
  def _sample_X_and_y(X: pd.DataFrame, y: pd.Series, enriched_X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
@@ -3704,7 +4002,7 @@ if response.status_code == 200:
3704
4002
  return df
3705
4003
 
3706
4004
  def _add_current_date_as_key(
3707
- self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
4005
+ self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], bundle: ResourceBundle, silent: bool = False
3708
4006
  ) -> pd.DataFrame:
3709
4007
  if (
3710
4008
  set(search_keys.values()) == {SearchKey.PHONE}
@@ -3712,7 +4010,8 @@ if response.status_code == 200:
3712
4010
  or set(search_keys.values()) == {SearchKey.HEM}
3713
4011
  or set(search_keys.values()) == {SearchKey.COUNTRY, SearchKey.POSTAL_CODE}
3714
4012
  ):
3715
- self.__log_warning(bundle.get("current_date_added"))
4013
+ if not silent:
4014
+ self.__log_warning(bundle.get("current_date_added"))
3716
4015
  df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
3717
4016
  search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
3718
4017
  converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, generate_cyclical_features=False)
@@ -3851,7 +4150,7 @@ if response.status_code == 200:
3851
4150
  columns_to_sort = [date_column] if date_column is not None else []
3852
4151
 
3853
4152
  do_sorting = True
3854
- if self.id_columns and self.cv.is_time_series():
4153
+ if self.id_columns and self.cv is not None and self.cv.is_time_series():
3855
4154
  # Check duplicates by date and id_columns
3856
4155
  reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
3857
4156
  renamed_id_columns = [reversed_columns_renaming.get(c, c) for c in self.id_columns]
@@ -3978,6 +4277,17 @@ if response.status_code == 200:
 
         # TODO drop system_record_id before merge
         # Merge with result features
+        # Align dtypes for join key to avoid int/float merge warnings
+        if ENTITY_SYSTEM_RECORD_ID in input_df.columns and ENTITY_SYSTEM_RECORD_ID in result_features.columns:
+            input_is_float = pd.api.types.is_float_dtype(input_df[ENTITY_SYSTEM_RECORD_ID])
+            result_is_float = pd.api.types.is_float_dtype(result_features[ENTITY_SYSTEM_RECORD_ID])
+            if input_is_float or result_is_float:
+                input_df[ENTITY_SYSTEM_RECORD_ID] = pd.to_numeric(
+                    input_df[ENTITY_SYSTEM_RECORD_ID], errors="coerce"
+                ).astype("float64")
+                result_features[ENTITY_SYSTEM_RECORD_ID] = pd.to_numeric(
+                    result_features[ENTITY_SYSTEM_RECORD_ID], errors="coerce"
+                ).astype("float64")
         result_features = pd.merge(
             input_df,
             result_features,
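A tiny standalone illustration of the key-dtype mismatch this new block guards against (the frames are made up; the column name merely mirrors `ENTITY_SYSTEM_RECORD_ID`):

```python
import pandas as pd

left = pd.DataFrame({"entity_system_record_id": [1, 2, 3]})                         # int64 key
right = pd.DataFrame({"entity_system_record_id": [1.0, 2.0], "feature": [10, 20]})  # float64 key

# Joining an int64 key to a float64 key can warn and upcast implicitly; casting
# both sides to float64 first, as the diff does, makes the merge deterministic:
left["entity_system_record_id"] = left["entity_system_record_id"].astype("float64")
merged = pd.merge(left, right, on="entity_system_record_id", how="left")
print(merged)
```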
@@ -4047,7 +4357,11 @@ if response.status_code == 200:
         return [f.name for f in features_meta if f.type == "categorical" and f.name not in (self.id_columns or [])]
 
     def __prepare_feature_importances(
-        self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
+        self,
+        trace_id: str,
+        clients_features_df: pd.DataFrame,
+        updated_shaps: Optional[Dict[str, float]] = None,
+        silent=False,
     ):
         if self._search_task is None:
             raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
@@ -4060,11 +4374,12 @@ if response.status_code == 200:
         features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
 
         # To be sure that names have hash suffixes
-        df = df.rename(columns=original_names_dict)
+        clients_features_df = clients_features_df.rename(columns=original_names_dict)
 
         self.feature_names_ = []
         self.external_source_feature_names = []
         self.zero_shap_client_features = []
+        self.unstable_client_features = []
         self.feature_importances_ = []
         features_info = []
         features_info_without_links = []
@@ -4073,10 +4388,10 @@ if response.status_code == 200:
         original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}
 
         for feature_meta in features_meta:
-            if feature_meta.name in original_names_dict.keys():
-                feature_meta.name = original_names_dict[feature_meta.name]
+            original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
+            feature_meta.name = original_name
 
-            is_client_feature = original_names_dict.get(feature_meta.name, feature_meta.name) in df.columns
+            is_client_feature = original_name in clients_features_df.columns
 
             # Show and update shap values for client features only if select_features is True
             if updated_shaps is not None and (not is_client_feature or self.fit_select_features):
@@ -4093,12 +4408,21 @@ if response.status_code == 200:
 
         for feature_meta in features_meta:
             original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
-            is_client_feature = original_name in df.columns
+            is_client_feature = original_name in clients_features_df.columns
 
             if not is_client_feature:
                 self.external_source_feature_names.append(original_name)
 
+            if self.psi_values is not None:
+                if original_name in self.psi_values:
+                    feature_meta.psi_value = self.psi_values[original_name]
+                else:
+                    if is_client_feature and self.fit_select_features:
+                        self.unstable_client_features.append(original_name)
+                    continue
+
             # TODO make a decision about selected features based on special flag from mlb
+
             if original_shaps.get(feature_meta.name, 0.0) == 0.0:
                 if is_client_feature and self.fit_select_features:
                     self.zero_shap_client_features.append(original_name)
@@ -4122,7 +4446,7 @@ if response.status_code == 200:
             self.feature_names_.append(feature_meta.name)
             self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
 
-            df_for_sample = features_df if feature_meta.name in features_df.columns else df
+            df_for_sample = features_df if feature_meta.name in features_df.columns else clients_features_df
             feature_info = FeatureInfo.from_metadata(feature_meta, df_for_sample, is_client_feature)
             features_info.append(feature_info.to_row(self.bundle))
             features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
@@ -4130,6 +4454,8 @@ if response.status_code == 200:
 
         if len(features_info) > 0:
             self.features_info = pd.DataFrame(features_info)
+            if self.features_info[self.bundle.get("features_info_psi")].isna().all():
+                self.features_info.drop(columns=[self.bundle.get("features_info_psi")], inplace=True)
             self._features_info_without_links = pd.DataFrame(features_info_without_links)
             self._internal_features_info = pd.DataFrame(internal_features_info)
             if not silent: