upgini 1.2.113a1__py3-none-any.whl → 1.2.113a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -112,6 +112,7 @@ except Exception:
         CustomFallbackProgressBar as ProgressBar,
     )
 
+from upgini.utils.psi import calculate_features_psi
 from upgini.utils.sample_utils import SampleColumns, SampleConfig, _num_samples, sample
 from upgini.utils.sort import sort_columns
 from upgini.utils.target_utils import calculate_psi, define_task
@@ -297,7 +298,9 @@ class FeaturesEnricher(TransformerMixin):
        self.feature_names_ = []
        self.external_source_feature_names = []
        self.zero_shap_client_features = []
+       self.unstable_client_features = []
        self.feature_importances_ = []
+       self.psi_values: Optional[Dict[str, float]] = None
        self.search_id = search_id
        self.disable_force_downsampling = disable_force_downsampling
        self.print_trace_id = print_trace_id
@@ -398,13 +401,26 @@ class FeaturesEnricher(TransformerMixin):
     @staticmethod
     def _check_eval_set(eval_set, X, bundle: ResourceBundle):
         checked_eval_set = []
-        if eval_set is not None and isinstance(eval_set, tuple):
+        if eval_set is None:
+            return checked_eval_set
+        if isinstance(eval_set, tuple):
             eval_set = [eval_set]
-        if eval_set is not None and not isinstance(eval_set, list):
+        if not isinstance(eval_set, list):
             raise ValidationError(bundle.get("unsupported_type_eval_set").format(type(eval_set)))
         for eval_pair in eval_set or []:
+            # Handle OOT
+            if isinstance(eval_pair, pd.DataFrame):
+                empty_target = pd.Series([np.nan] * len(eval_pair), index=eval_pair.index)
+                eval_pair = (eval_pair, empty_target)
+            elif isinstance(eval_pair, tuple) and len(eval_pair) == 1:
+                empty_target = pd.Series([np.nan] * len(eval_pair[0]), index=eval_pair[0].index)
+                eval_pair = (eval_pair[0], empty_target)
+
             if not isinstance(eval_pair, tuple) or len(eval_pair) != 2:
                 raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
+            if eval_pair[1] is None:
+                empty_target = pd.Series([np.nan] * len(eval_pair[0]), index=eval_pair[0].index)
+                eval_pair = (eval_pair[0], empty_target)
             if not is_frames_equal(X, eval_pair[0], bundle):
                 checked_eval_set.append(eval_pair)
         return checked_eval_set
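With this change an eval_set entry can describe an unlabeled out-of-time (OOT) segment. Based on the normalization above, the following forms should all be coerced to an (X, y) pair whose target is an all-NaN Series (illustrative sketch; `oot_df`, `eval_X`, and `eval_y` are hypothetical frames with the same columns as the training X):

    # Illustrative only: equivalent ways to pass an unlabeled OOT segment.
    eval_set = [
        (eval_X, eval_y),  # regular labeled eval pair, kept as-is
        oot_df,            # bare DataFrame  -> (oot_df, all-NaN target)
        (oot_df,),         # 1-element tuple -> (oot_df, all-NaN target)
        (oot_df, None),    # explicit None   -> (oot_df, all-NaN target)
    ]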
@@ -415,7 +431,6 @@ class FeaturesEnricher(TransformerMixin):
         y: Union[pd.Series, np.ndarray, List],
         eval_set: Optional[Union[List[tuple], tuple]] = None,
         *args,
-        oot: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
         exclude_features_sources: Optional[List[str]] = None,
         calculate_metrics: Optional[bool] = None,
         estimator: Optional[Any] = None,
@@ -427,6 +442,7 @@ class FeaturesEnricher(TransformerMixin):
         search_id_callback: Optional[Callable[[str], Any]] = None,
         select_features: bool = True,
         auto_fe_parameters: Optional[AutoFEParameters] = None,
+        stability_threshold: float = 0.15,
         **kwargs,
     ):
         """Fit to data.
@@ -444,9 +460,6 @@ class FeaturesEnricher(TransformerMixin):
         eval_set: List[tuple], optional (default=None)
             List of pairs (X, y) for validation.
 
-        oot: pandas.DataFrame of shape (n_samples, n_features)
-            Out of time data.
-
         importance_threshold: float, optional (default=None)
             Minimum SHAP value to select a feature. Default value is 0.0.
 
@@ -512,14 +525,14 @@ class FeaturesEnricher(TransformerMixin):
             X,
             y,
             self.eval_set,
-            oot=oot,
-            progress_bar=progress_bar,
+            progress_bar,
             start_time=start_time,
             exclude_features_sources=exclude_features_sources,
             calculate_metrics=calculate_metrics,
             estimator=estimator,
             scoring=scoring,
             importance_threshold=importance_threshold,
+            stability_threshold=stability_threshold,
             max_features=max_features,
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
             auto_fe_parameters=auto_fe_parameters,
@@ -568,7 +581,6 @@ class FeaturesEnricher(TransformerMixin):
         y: Union[pd.DataFrame, pd.Series, np.ndarray, List],
         eval_set: Optional[Union[List[tuple], tuple]] = None,
         *args,
-        oot: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
         exclude_features_sources: Optional[List[str]] = None,
         keep_input: bool = True,
         importance_threshold: Optional[float] = None,
@@ -580,6 +592,7 @@ class FeaturesEnricher(TransformerMixin):
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
         select_features: bool = True,
         auto_fe_parameters: Optional[AutoFEParameters] = None,
+        stability_threshold: float = 0.15,
         **kwargs,
     ) -> pd.DataFrame:
         """Fit to data, then transform it.
@@ -624,6 +637,10 @@ class FeaturesEnricher(TransformerMixin):
             If True, return only selected features both from input and data sources.
             Otherwise, return all features from input and only selected features from data sources.
 
+        stability_threshold: float, optional (default=0.15)
+            Stability threshold for the PSI calculated on selected features. If a feature's PSI
+            exceeds this threshold, the feature will be dropped.
+
         Returns
         -------
         X_new: pandas.DataFrame of shape (n_samples, n_features_new)
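For context, a usage sketch of the new parameter (illustrative, not from the package docs; `train_df`, `oot_df`, and the date key are hypothetical):

    from upgini import FeaturesEnricher, SearchKey

    enricher = FeaturesEnricher(search_keys={"order_date": SearchKey.DATE})
    enriched_train = enricher.fit_transform(
        train_df.drop(columns=["target"]),
        train_df["target"],
        eval_set=[oot_df],         # unlabeled out-of-time segment
        stability_threshold=0.15,  # features with PSI above this are dropped
    )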
@@ -673,14 +690,14 @@ class FeaturesEnricher(TransformerMixin):
             X,
             y,
             self.eval_set,
-            oot=oot,
-            progress_bar=progress_bar,
+            progress_bar,
             start_time=start_time,
             exclude_features_sources=exclude_features_sources,
             calculate_metrics=calculate_metrics,
             scoring=scoring,
             estimator=estimator,
             importance_threshold=importance_threshold,
+            stability_threshold=stability_threshold,
             max_features=max_features,
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
             auto_fe_parameters=auto_fe_parameters,
@@ -947,8 +964,8 @@ class FeaturesEnricher(TransformerMixin):
         ):
             raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
 
-        validated_X, validated_y, validated_eval_set, _ = self._validate_train_eval(
-            effective_X, effective_y, effective_eval_set
+        validated_X, validated_y, validated_eval_set = self._validate_train_eval(
+            effective_X, effective_y, effective_eval_set, silent=internal_call
         )
 
         if self.X is None:
@@ -1040,6 +1057,7 @@ class FeaturesEnricher(TransformerMixin):
             groups,
             _cv,
             columns_renaming,
+            eval_set_dates,
         ) = prepared_data
 
         # rename cat_features
@@ -1081,9 +1099,9 @@ class FeaturesEnricher(TransformerMixin):
         self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
 
         has_date = self._get_date_column(search_keys) is not None
-        has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
         model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
         cat_features = list(set(client_cat_features + cat_features_from_backend))
+        has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
         baseline_cat_features = [f for f in cat_features if f in fitting_X.columns]
         enriched_cat_features = [f for f in cat_features if f in fitting_enriched_X.columns]
         if len(enriched_cat_features) < len(cat_features):
@@ -1203,8 +1221,6 @@ class FeaturesEnricher(TransformerMixin):
         # max_initial_eval_set_hit_rate = self._search_task.get_max_initial_eval_set_hit_rate_v2()
         if len(fitting_eval_set_dict) > 0:
             for idx in fitting_eval_set_dict.keys():
-                # eval_hit_rate = max_initial_eval_set_hit_rate[idx + 1]
-
                 (
                     eval_X_sorted,
                     eval_y_sorted,
@@ -1212,6 +1228,10 @@ class FeaturesEnricher(TransformerMixin):
                     enriched_eval_X_sorted,
                     enriched_eval_y_sorted,
                 ) = fitting_eval_set_dict[idx]
+                if eval_y_sorted.isna().all():
+                    # Skip OOT eval set
+                    continue
+
                 if baseline_estimator is not None:
                     self.logger.info(
                         f"Calculate baseline {metric} on eval set {idx + 1} "
@@ -1254,17 +1274,14 @@ class FeaturesEnricher(TransformerMixin):
                             "quality_metrics_eval_segment"
                         ).format(idx + 1),
                         self.bundle.get("quality_metrics_rows_header"): _num_samples(
-                            # effective_eval_set[idx][0]
                             # Use actually used for metrics dataset
                             eval_X_sorted
                         ),
-                        # self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
                     }
                     if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
                         eval_y_sorted
                     ):
                         eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
-                            # np.mean(validated_eval_set[idx][1]), 4
                             # Use actually used for metrics dataset
                             np.mean(eval_y_sorted),
                             4,
@@ -1337,6 +1354,199 @@ class FeaturesEnricher(TransformerMixin):
         finally:
             self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
 
+    def _select_features_by_psi(
+        self,
+        trace_id: str,
+        X: Union[pd.DataFrame, pd.Series, np.ndarray],
+        y: Union[pd.DataFrame, pd.Series, np.ndarray, List],
+        eval_set: Optional[Union[List[tuple], tuple]],
+        stability_threshold: float,
+        cv: Union[BaseCrossValidator, CVType, str, None] = None,
+        estimator=None,
+        exclude_features_sources: Optional[List[str]] = None,
+        importance_threshold: Optional[float] = None,
+        max_features: Optional[int] = None,
+        progress_bar: bool = True,
+        progress_callback: Optional[Callable] = None,
+    ):
+        search_keys = self.search_keys.copy()
+        validated_X, _, validated_eval_set = self._validate_train_eval(X, y, eval_set, silent=True)
+        if isinstance(X, np.ndarray):
+            search_keys = {str(k): v for k, v in search_keys.items()}
+
+        has_date = self._get_date_column(search_keys) is not None
+        if not has_date or not validated_eval_set:
+            self.logger.info("No date column or eval set for OOT psi calculation")
+            return
+
+        cat_features_from_backend = self.__get_categorical_features()
+        client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
+            estimator, validated_X, search_keys
+        )
+        if self.id_columns and self.id_columns_encoder is not None:
+            if cat_features_from_backend:
+                cat_features_from_backend = [
+                    c
+                    for c in cat_features_from_backend
+                    if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
+                ]
+            if client_cat_features:
+                client_cat_features = [
+                    c
+                    for c in client_cat_features
+                    if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
+                ]
+
+        prepared_data = self._prepare_data_for_metrics(
+            trace_id=trace_id,
+            X=X,
+            y=y,
+            eval_set=eval_set,
+            exclude_features_sources=exclude_features_sources,
+            importance_threshold=importance_threshold,
+            max_features=max_features,
+            remove_outliers_calc_metrics=False,
+            cv_override=cv,
+            search_keys_for_metrics=search_keys_for_metrics,
+            progress_bar=progress_bar,
+            progress_callback=progress_callback,
+            client_cat_features=client_cat_features,
+        )
+        if prepared_data is None:
+            return None
+
+        (
+            validated_X,
+            fitting_X,
+            y_sorted,
+            fitting_enriched_X,
+            _,
+            fitting_eval_set_dict,
+            _,
+            _,
+            _,
+            columns_renaming,
+            eval_set_dates,
+        ) = prepared_data
+
+        # rename cat_features
+        if client_cat_features:
+            for new_c, old_c in columns_renaming.items():
+                if old_c in client_cat_features:
+                    client_cat_features.remove(old_c)
+                    client_cat_features.append(new_c)
+            for cat_feature in client_cat_features:
+                if cat_feature not in fitting_X.columns:
+                    self.logger.error(
+                        f"Client cat_feature `{cat_feature}` not found in" f" x columns: {fitting_X.columns.to_list()}"
+                    )
+        else:
+            client_cat_features = []
+
+        model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
+        cat_features = list(set(client_cat_features + cat_features_from_backend))
+
+        # Drop unstable features
+        unstable_features = self._check_stability(
+            validated_X,
+            validated_eval_set,
+            fitting_eval_set_dict,
+            eval_set_dates,
+            search_keys,
+            stability_threshold,
+            cat_features,
+            model_task_type,
+        )
+        client_features_df = self.df_with_original_index.rename(columns=columns_renaming)
+        # decoded_X = self._decode_id_columns(fitting_X, columns_renaming)
+        self._update_report_psi(trace_id, client_features_df)
+
+        if unstable_features:
+            msg = f"Some features are unstable: {unstable_features} and will be dropped"
+            self.logger.warning(msg)
+            print(msg)
+            fitting_X = fitting_X.drop(columns=unstable_features, errors="ignore")
+            fitting_enriched_X = fitting_enriched_X.drop(columns=unstable_features, errors="ignore")
+            msg = f"There are {len(fitting_enriched_X.columns)} stable selected features left"
+            self.logger.info(msg)
+            print(msg)
+            for idx, (
+                eval_X,
+                eval_y,
+                eval_enriched_X,
+                eval_enriched_y,
+            ) in fitting_eval_set_dict.items():
+                eval_X = eval_X.drop(columns=unstable_features, errors="ignore")
+                eval_enriched_X = eval_enriched_X.drop(columns=unstable_features, errors="ignore")
+                fitting_eval_set_dict[idx] = (eval_X, eval_y, eval_enriched_X, eval_enriched_y)
+
+    def _check_stability(
+        self,
+        X: pd.DataFrame,
+        eval_set: List[Tuple[pd.DataFrame, pd.Series]],
+        enriched_eval_set: Dict,
+        eval_set_dates: Dict[int, pd.Series],
+        search_keys: Dict[str, SearchKey],
+        stability_threshold: float,
+        cat_features: List[str],
+        model_task_type: ModelTaskType,
+    ) -> List[str]:
+        # Find latest eval set or earliest if all eval sets are before train set
+        date_column = self._get_date_column(search_keys)
+
+        if (
+            date_column is None
+            or not eval_set
+            or not eval_set_dates
+            or (self.cv is not None and self.cv.is_time_series())
+        ):
+            return []
+
+        # Get minimum date from main dataset X
+        main_min_date = X[date_column].min()
+
+        # Find minimum date for each eval_set and compare with main dataset
+        eval_dates = []
+        for i, (eval_x, _) in enumerate(eval_set):
+            if date_column in eval_x.columns:
+                eval_min_date = eval_x[date_column].min()
+                eval_max_date = eval_x[date_column].max()
+                eval_dates.append((i, eval_min_date, eval_max_date))
+
+        if not eval_dates:
+            return []
+
+        # Check if any eval_set has minimum date >= main dataset minimum date
+        later_eval_sets = [(i, min_date, max_date) for i, min_date, max_date in eval_dates if min_date >= main_min_date]
+
+        if later_eval_sets:
+            # If there are eval_sets with date >= main date, choose the one with highest maximum date
+            selected_eval_set_idx = max(later_eval_sets, key=lambda x: x[2])[0]
+        else:
+            # If all eval_sets have dates < main date, choose the one with the latest minimum date
+            selected_eval_set_idx = max(eval_dates, key=lambda x: x[1])[0]
+
+        checking_eval_set = enriched_eval_set[selected_eval_set_idx]
+
+        checking_eval_set_df = (
+            checking_eval_set[2]
+            if checking_eval_set[1] is None or checking_eval_set[1].isna().all()
+            else pd.concat([checking_eval_set[2], checking_eval_set[1].to_frame(TARGET)], axis=1)
+        )
+        checking_eval_set_df = checking_eval_set_df.copy()
+
+        checking_eval_set_df[date_column] = eval_set_dates[selected_eval_set_idx]
+
+        psi_values = calculate_features_psi(
+            checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
+        )
+
+        self.psi_values = {
+            feature: psi_value for feature, psi_value in psi_values.items() if psi_value <= stability_threshold
+        }
+
+        return [feature for feature, psi in psi_values.items() if psi > stability_threshold]
+
     def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float], silent: bool = False):
         renaming = self.fit_columns_renaming or {}
         self.logger.info(f"Updating SHAP values: {new_shaps}")
@@ -1392,6 +1602,56 @@ class FeaturesEnricher(TransformerMixin):
         except (ImportError, NameError):
             pass
 
+    def _update_report_psi(self, trace_id: str, clients_features_df: pd.DataFrame):
+        self.__prepare_feature_importances(trace_id, clients_features_df)
+
+        if self.features_info_display_handle is not None:
+            try:
+                _ = get_ipython()  # type: ignore
+
+                display_html_dataframe(
+                    self.features_info,
+                    self._features_info_without_links,
+                    self.bundle.get("relevant_features_header"),
+                    display_handle=self.features_info_display_handle,
+                )
+            except (ImportError, NameError):
+                pass
+
+        if self.data_sources_display_handle is not None:
+            try:
+                _ = get_ipython()  # type: ignore
+
+                display_html_dataframe(
+                    self.relevant_data_sources,
+                    self._relevant_data_sources_wo_links,
+                    self.bundle.get("relevant_data_sources_header"),
+                    display_handle=self.data_sources_display_handle,
+                )
+            except (ImportError, NameError):
+                pass
+
+        if self.autofe_features_display_handle is not None:
+            try:
+                _ = get_ipython()  # type: ignore
+                autofe_descriptions_df = self.get_autofe_features_description()
+                if autofe_descriptions_df is not None:
+                    display_html_dataframe(
+                        df=autofe_descriptions_df,
+                        internal_df=autofe_descriptions_df,
+                        header=self.bundle.get("autofe_descriptions_header"),
+                        display_handle=self.autofe_features_display_handle,
+                    )
+            except (ImportError, NameError):
+                pass
+        if self.report_button_handle is not None:
+            try:
+                _ = get_ipython()  # type: ignore
+
+                self.__show_report_button(display_handle=self.report_button_handle)
+            except (ImportError, NameError):
+                pass
+
     def _check_train_and_eval_target_distribution(self, y, eval_set_dict):
         uneven_distribution = False
         for eval_set in eval_set_dict.values():
@@ -1542,7 +1802,7 @@
         is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
         is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
         checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
-        validated_X, validated_y, validated_eval_set, _ = self._validate_train_eval(X, y, checked_eval_set)
+        validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set, silent=True)
 
         sampled_data = self._get_enriched_for_metrics(
             trace_id,
@@ -1556,7 +1816,7 @@
             progress_bar,
             progress_callback,
         )
-        X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = dataclasses.astuple(
+        (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming) = dataclasses.astuple(
            sampled_data
        )
 
@@ -1579,8 +1839,7 @@
                 or c in set(self.feature_names_).union(self.id_columns or [])
                 or (self.fit_columns_renaming or {}).get(c, c) in set(self.feature_names_).union(self.id_columns or [])
             )
-            and c
-            not in (
+            and c not in (
                 excluding_search_keys
                 + list(self.fit_dropped_features)
                 + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
@@ -1679,12 +1938,16 @@
         fitting_enriched_X = fitting_enriched_X[fitting_enriched_x_columns]
         fitting_enriched_X, _ = self._encode_id_columns(fitting_enriched_X, self.fit_columns_renaming)
         self.logger.info(f"Final sorted list of fitting enriched X columns: {fitting_enriched_x_columns}")
+        date_column = self._get_date_column(search_keys)
+        eval_set_dates = {}
         for idx, eval_tuple in eval_set_sampled_dict.items():
             eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
             eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
             enriched_eval_X_sorted, enriched_eval_y_sorted = self._sort_by_system_record_id(
                 enriched_eval_X, eval_y_sampled, self.cv
             )
+            if date_column is not None:
+                eval_set_dates[idx] = eval_X_sorted[date_column]
             fitting_eval_X = eval_X_sorted[fitting_x_columns].copy()
             fitting_enriched_eval_X = enriched_eval_X_sorted[fitting_enriched_x_columns].copy()
 
@@ -1729,6 +1992,7 @@
             groups,
             cv,
             columns_renaming,
+            eval_set_dates,
         )
 
     @dataclass
@@ -1891,14 +2155,16 @@
         remove_outliers_calc_metrics: Optional[bool],
     ) -> _EnrichedDataForMetrics:
         eval_set_sampled_dict = {}
-        search_keys = self.fit_search_keys
+        search_keys = self.fit_search_keys.copy()
 
         rows_to_drop = None
         has_date = self._get_date_column(search_keys) is not None
         self.model_task_type = self.model_task_type or define_task(
             self.df_with_original_index[TARGET], has_date, self.logger, silent=True
         )
-        if self.model_task_type == ModelTaskType.REGRESSION:
+        if remove_outliers_calc_metrics is None:
+            remove_outliers_calc_metrics = True
+        if self.model_task_type == ModelTaskType.REGRESSION and remove_outliers_calc_metrics:
             target_outliers_df = self._search_task.get_target_outliers(trace_id)
             if target_outliers_df is not None and len(target_outliers_df) > 0:
                 outliers = pd.merge(
@@ -1908,11 +2174,8 @@
                     how="inner",
                 )
                 top_outliers = outliers.sort_values(by=TARGET, ascending=False)[TARGET].head(3)
-                if remove_outliers_calc_metrics is None or remove_outliers_calc_metrics is True:
-                    rows_to_drop = outliers
-                    not_msg = ""
-                else:
-                    not_msg = "not "
+                rows_to_drop = outliers
+                not_msg = ""
                 msg = self.bundle.get("target_outliers_warning").format(len(target_outliers_df), top_outliers, not_msg)
                 print(msg)
                 self.logger.warning(msg)
@@ -1938,8 +2201,11 @@
         )
 
         # Handle eval sets extraction based on EVAL_SET_INDEX
-        if EVAL_SET_INDEX in enriched_Xy.columns and eval_set is not None:
-            for eval_set_index in range(1, len(eval_set) + 1):
+        if EVAL_SET_INDEX in enriched_Xy.columns:
+            eval_set_indices = list(enriched_Xy[EVAL_SET_INDEX].unique())
+            if 0 in eval_set_indices:
+                eval_set_indices.remove(0)
+            for eval_set_index in eval_set_indices:
                 enriched_eval_sets[eval_set_index] = enriched_Xy.loc[
                     enriched_Xy[EVAL_SET_INDEX] == eval_set_index
                 ].copy()
@@ -1967,12 +2233,13 @@
             enriched_eval_X = enriched_eval_sets[idx + 1][enriched_X_columns].copy()
             eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
 
-        reversed_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
-        X_sampled.rename(columns=reversed_renaming, inplace=True)
-        enriched_X.rename(columns=reversed_renaming, inplace=True)
+        # reversed_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
+        X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
+        enriched_X.rename(columns=self.fit_columns_renaming, inplace=True)
         for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
-            eval_X_sampled.rename(columns=reversed_renaming, inplace=True)
-            enriched_eval_X.rename(columns=reversed_renaming, inplace=True)
+            eval_X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
+            enriched_eval_X.rename(columns=self.fit_columns_renaming, inplace=True)
+        search_keys = {self.fit_columns_renaming.get(k, k): v for k, v in search_keys.items()}
 
         datasets_hash = hash_input(self.X, self.y, self.eval_set)
         return self.__cache_and_return_results(
@@ -2051,11 +2318,7 @@
         )
 
     def __combine_train_and_eval_sets(
-        self,
-        X: pd.DataFrame,
-        y: Optional[pd.Series] = None,
-        eval_set: Optional[List[tuple]] = None,
-        oot: Optional[pd.DataFrame] = None,
+        self, X: pd.DataFrame, y: Optional[pd.Series] = None, eval_set: Optional[List[tuple]] = None
     ) -> pd.DataFrame:
         df = X.copy()
         if y is not None:
@@ -2071,11 +2334,6 @@
             eval_df_with_index[TARGET] = eval_y
             eval_df_with_index[EVAL_SET_INDEX] = idx + 1
             df = pd.concat([df, eval_df_with_index])
-
-        if oot is not None:
-            oot_df_with_index = oot.copy()
-            oot_df_with_index[EVAL_SET_INDEX] = -1
-            df = pd.concat([df, oot_df_with_index])
 
         return df
 
@@ -2125,15 +2383,15 @@
 
     def __extract_eval_data(
         self, enriched_df: pd.DataFrame, x_columns: List[str], enriched_X_columns: List[str], eval_set_len: int
-    ) -> Dict[int, Tuple]:
+    ) -> Tuple[Dict[int, Tuple], Dict[int, pd.Series]]:
         eval_set_sampled_dict = {}
 
-        for idx in range(1, eval_set_len + 1):
-            enriched_eval_xy = enriched_df.query(f"{EVAL_SET_INDEX} == {idx}")
+        for idx in range(eval_set_len):
+            enriched_eval_xy = enriched_df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
             eval_x_sampled = enriched_eval_xy[x_columns].copy()
             eval_y_sampled = enriched_eval_xy[TARGET].copy()
             enriched_eval_x = enriched_eval_xy[enriched_X_columns].copy()
-            eval_set_sampled_dict[idx - 1] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
+            eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
 
         return eval_set_sampled_dict
 
@@ -2171,12 +2429,12 @@
         columns_renaming: Dict[str, str],
     ):
         # X_sampled - with hash-suffixes
-        reversed_renaming = {v: k for k, v in columns_renaming.items()}
-        search_keys = {
-            reversed_renaming.get(k, k): v
-            for k, v in search_keys.items()
-            if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
-        }
+        # reversed_renaming = {v: k for k, v in columns_renaming.items()}
+        # search_keys = {
+        #     reversed_renaming.get(k, k): v
+        #     for k, v in search_keys.items()
+        #     if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
+        # }
         return FeaturesEnricher._EnrichedDataForMetrics(
             X_sampled=X_sampled,
             y_sampled=y_sampled,
@@ -2325,10 +2583,10 @@ if response.status_code == 200:
         with MDC(trace_id=trace_id, search_id=search_id):
             self.logger.info("Start transform")
 
-            validated_X, validated_y, _, _ = self._validate_train_eval(
-                X, y, is_transform=True
+            validated_X, validated_y, validated_eval_set = self._validate_train_eval(
+                X, y, eval_set=None, is_transform=True, silent=True
             )
-            df = self.__combine_train_and_eval_sets(validated_X, validated_y)
+            df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
 
             validated_Xy = df.copy()
 
@@ -2428,7 +2686,7 @@ if response.status_code == 200:
             else:
                 self.logger.info("Input dataset hasn't date column")
             if self.__should_add_date_column():
-                df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
+                df = self._add_current_date_as_key(df, search_keys, self.bundle, silent=True)
 
             email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
             if email_columns and self.generate_search_key_features:
@@ -2677,7 +2935,8 @@ if response.status_code == 200:
             selecting_columns = [
                 c
                 for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
-                if c not in self.zero_shap_client_features or c in (self.id_columns or [])
+                if (c not in self.zero_shap_client_features and c not in self.unstable_client_features)
+                or c in (self.id_columns or [])
             ]
             selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
             if add_fit_system_record_id:
@@ -2803,15 +3062,15 @@ if response.status_code == 200:
         X: Union[pd.DataFrame, pd.Series, np.ndarray],
         y: Union[pd.DataFrame, pd.Series, np.ndarray, List, None],
         eval_set: Optional[List[tuple]],
-        *,
-        oot: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
         progress_bar: Optional[ProgressBar],
         start_time: int,
+        *,
         exclude_features_sources: Optional[List[str]] = None,
         calculate_metrics: Optional[bool],
         scoring: Union[Callable, str, None],
         estimator: Optional[Any],
         importance_threshold: Optional[float],
+        stability_threshold: float,
         max_features: Optional[int],
         remove_outliers_calc_metrics: Optional[bool],
         auto_fe_parameters: AutoFEParameters,
@@ -2826,8 +3085,9 @@ if response.status_code == 200:
         self.fit_columns_renaming = None
         self.fit_dropped_features = set()
         self.fit_generated_features = []
+        self.psi_values = None
 
-        validated_X, validated_y, validated_eval_set, validated_oot = self._validate_train_eval(X, y, eval_set, oot)
+        validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, eval_set)
 
         is_demo_dataset = hash_input(validated_X, validated_y, validated_eval_set) in DEMO_DATASET_HASHES
         if is_demo_dataset:
@@ -2868,7 +3128,6 @@ if response.status_code == 200:
             validated_X,
             validated_y,
             validated_eval_set,
-            validated_oot,
             exclude_features_sources=exclude_features_sources,
             calculate_metrics=calculate_metrics,
             scoring=scoring,
@@ -2876,7 +3135,7 @@ if response.status_code == 200:
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
         )
 
-        df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set, validated_oot)
+        df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
         self.id_columns_encoder = OrdinalEncoder().fit(df[self.id_columns or []])
 
         self.fit_search_keys = self.search_keys.copy()
@@ -2923,7 +3182,7 @@ if response.status_code == 200:
             self.logger.info("Input dataset hasn't date column")
         # TODO remove when this logic will be implemented on the back
         if self.__should_add_date_column():
-            df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
+            df = self._add_current_date_as_key(df, self.fit_search_keys, self.bundle)
 
         email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
         if email_columns and self.generate_search_key_features:
@@ -2938,10 +3197,13 @@ if response.status_code == 200:
         except Exception:
             self.logger.exception("Failed to check dates distribution validity")
 
+        self.__adjust_cv(df)
+
         if (
             is_numeric_dtype(df[self.TARGET_NAME])
             and self.model_task_type in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]
             and has_date
+            and not self.cv.is_time_series()
         ):
             self._validate_PSI(df.sort_values(by=maybe_date_column))
 
@@ -2973,8 +3235,9 @@ if response.status_code == 200:
 
         self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
 
+        # Group columns should have normalized names
+        self.cv = None
         self.__adjust_cv(df)
-
         if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
             id_columns = self.__get_renamed_id_columns()
             if id_columns:
@@ -3179,6 +3442,21 @@ if response.status_code == 200:
             display_id=f"autofe_descriptions_{uuid.uuid4()}",
         )
 
+        self._select_features_by_psi(
+            trace_id=trace_id,
+            X=X,
+            y=y,
+            eval_set=eval_set,
+            stability_threshold=stability_threshold,
+            cv=self.cv,
+            estimator=estimator,
+            exclude_features_sources=exclude_features_sources,
+            importance_threshold=importance_threshold,
+            max_features=max_features,
+            progress_bar=progress_bar,
+            progress_callback=progress_callback,
+        )
+
         if self._has_paid_features(exclude_features_sources):
             if calculate_metrics is not None and calculate_metrics:
                 msg = self.bundle.get("metrics_with_paid_features")
@@ -3303,14 +3581,13 @@ if response.status_code == 200:
         X: pd.DataFrame,
         y: Optional[pd.Series] = None,
         eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]] = None,
-        oot: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
         is_transform: bool = False,
+        silent: bool = False,
     ) -> Tuple[pd.DataFrame, pd.Series, Optional[List[Tuple[pd.DataFrame, pd.Series]]]]:
         validated_X = self._validate_X(X, is_transform)
         validated_y = self._validate_y(validated_X, y, enforce_y=not is_transform)
-        validated_eval_set = self._validate_eval_set(validated_X, eval_set)
-        validated_oot = self._validate_oot(validated_X, oot)
-        return validated_X, validated_y, validated_eval_set, validated_oot
+        validated_eval_set = self._validate_eval_set(validated_X, eval_set, silent=silent)
+        return validated_X, validated_y, validated_eval_set
 
     def _encode_id_columns(
         self,
@@ -3441,53 +3718,30 @@ if response.status_code == 200:
 
         return validated_y
 
-    def _validate_eval_set(self, X: pd.DataFrame, eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]]):
+    def _validate_eval_set(
+        self, X: pd.DataFrame, eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]], silent: bool = False
+    ):
         if eval_set is None:
             return None
-        return [self._validate_eval_set_pair(X, eval_pair) for eval_pair in eval_set]
-
-    def _validate_oot(self, X: pd.DataFrame, oot: Optional[pd.DataFrame]):
-        if oot is None:
-            return None
-
-        if _num_samples(oot) == 0:
-            raise ValidationError(self.bundle.get("oot_is_empty"))
-        if isinstance(oot, pd.DataFrame):
-            if isinstance(oot.columns, pd.MultiIndex) or isinstance(oot.index, pd.MultiIndex):
-                raise ValidationError(self.bundle.get("oot_multiindex_unsupported"))
-            validated_oot = oot.copy()
-        elif isinstance(oot, pd.Series):
-            validated_oot = oot.to_frame()
-        elif isinstance(oot, (list, np.ndarray)):
-            validated_oot = pd.DataFrame(oot)
-            renaming = {c: str(c) for c in validated_oot.columns}
-            validated_oot = validated_oot.rename(columns=renaming)
-        else:
-            raise ValidationError(self.bundle.get("unsupported_type_oot").format(type(oot)))
-
-        if not validated_oot.index.is_unique:
-            raise ValidationError(self.bundle.get("non_unique_index_oot"))
-
-        if self.exclude_columns is not None:
-            validated_oot = validated_oot.drop(columns=self.exclude_columns, errors="ignore")
-
-        if self.baseline_score_column:
-            validated_oot[self.baseline_score_column] = validated_oot[self.baseline_score_column].astype(
-                "float64", errors="ignore"
-            )
-
-        if validated_oot.columns.to_list() != X.columns.to_list():
-            if set(validated_oot.columns.to_list()) == set(X.columns.to_list()):
-                validated_oot = validated_oot[X.columns.to_list()]
-            else:
-                raise ValidationError(self.bundle.get("oot_and_x_diff_shape"))
-
-        # Check for duplicates between train and eval sets by comparing all values
-        train_eval_intersection = pd.merge(X, validated_oot, how="inner")
-        if len(train_eval_intersection) > 0:
-            raise ValidationError(self.bundle.get("oot_has_train_samples"))
+        validated_eval_set = []
+        has_date = self._get_date_column(self.search_keys) is not None
+        for idx, eval_pair in enumerate(eval_set):
+            validated_pair = self._validate_eval_set_pair(X, eval_pair)
+            if validated_pair[1].isna().all():
+                if not has_date:
+                    msg = self.bundle.get("oot_without_date_not_supported").format(idx + 1)
+                elif self.columns_for_online_api:
+                    msg = self.bundle.get("oot_with_online_sources_not_supported").format(idx + 1)
+                else:
+                    msg = None
+                if msg:
+                    if not silent:
+                        print(msg)
+                    self.logger.warning(msg)
+                    continue
+            validated_eval_set.append(validated_pair)
 
-        return validated_oot
+        return validated_eval_set
 
     def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
         if len(eval_pair) != 2:
@@ -3562,16 +3816,18 @@ if response.status_code == 200:
             raise ValidationError(self.bundle.get("unsupported_y_type_eval_set").format(type(eval_y)))
 
         eval_y_nunique = validated_eval_y.nunique()
-        if eval_y_nunique < 2:
+        is_oot = validated_eval_y.isna().all()
+        if not is_oot and eval_y_nunique < 2:
             raise ValidationError(self.bundle.get("y_is_constant_eval_set"))
 
-        if self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
+        if not is_oot and self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
             raise ValidationError(self.bundle.get("binary_target_eval_unique_count_not_2").format(eval_y_nunique))
 
-        # Check for duplicates between train and eval sets by comparing all values
-        train_eval_intersection = pd.merge(X, validated_eval_X, how="inner")
-        if len(train_eval_intersection) > 0:
-            raise ValidationError(self.bundle.get("eval_x_has_train_samples"))
+        if not is_oot:
+            # Check for duplicates between train and eval sets by comparing all values
+            train_eval_intersection = pd.merge(X, validated_eval_X, how="inner")
+            if len(train_eval_intersection) > 0:
+                raise ValidationError(self.bundle.get("eval_x_has_train_samples"))
 
         return validated_eval_X, validated_eval_y
 
@@ -3587,10 +3843,12 @@ if response.status_code == 200:
         if isinstance(eval_set, tuple):
             eval_set = [eval_set]
         for eval in eval_set:
-            if self.baseline_score_column not in eval[0].columns:
-                raise ValidationError(self.bundle.get("baseline_score_column_not_exists"))
-            if eval[0][self.baseline_score_column].isna().any():
-                raise ValidationError(self.bundle.get("baseline_score_column_has_na"))
+            is_oot = eval[1].isna().all()
+            if not is_oot:
+                if self.baseline_score_column not in eval[0].columns:
+                    raise ValidationError(self.bundle.get("baseline_score_column_not_exists"))
+                if eval[0][self.baseline_score_column].isna().any():
+                    raise ValidationError(self.bundle.get("baseline_score_column_has_na"))
 
     @staticmethod
     def _sample_X_and_y(X: pd.DataFrame, y: pd.Series, enriched_X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
@@ -3660,7 +3918,6 @@ if response.status_code == 200:
         X: pd.DataFrame,
         y: Union[pd.Series, np.ndarray, list, None] = None,
         eval_set: Optional[List[tuple]] = None,
-        oot: Optional[pd.DataFrame] = None,
         exclude_features_sources: Optional[List[str]] = None,
         calculate_metrics: Optional[bool] = None,
         cv: Optional[Any] = None,
@@ -3729,8 +3986,6 @@ if response.status_code == 200:
                 self.logger.info(
                     f"First 10 rows of the eval_y_{idx} with shape {_num_samples(eval_y)}:\n{sample(eval_y)}"
                 )
-            if oot is not None:
-                self.logger.info(f"First 10 rows of the oot with shape {oot.shape}:\n{sample(oot)}")
 
         do_without_pandas_limits(print_datasets_sample)
 
@@ -3767,7 +4022,7 @@ if response.status_code == 200:
         return df
 
     def _add_current_date_as_key(
-        self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
+        self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], bundle: ResourceBundle, silent: bool = False
     ) -> pd.DataFrame:
         if (
             set(search_keys.values()) == {SearchKey.PHONE}
@@ -3775,7 +4030,8 @@ if response.status_code == 200:
             or set(search_keys.values()) == {SearchKey.HEM}
             or set(search_keys.values()) == {SearchKey.COUNTRY, SearchKey.POSTAL_CODE}
         ):
-            self.__log_warning(bundle.get("current_date_added"))
+            if not silent:
+                self.__log_warning(bundle.get("current_date_added"))
             df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
             search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
             converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, generate_cyclical_features=False)
@@ -4110,7 +4366,11 @@ if response.status_code == 200:
         return [f.name for f in features_meta if f.type == "categorical" and f.name not in (self.id_columns or [])]
 
     def __prepare_feature_importances(
-        self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
+        self,
+        trace_id: str,
+        clients_features_df: pd.DataFrame,
+        updated_shaps: Optional[Dict[str, float]] = None,
+        silent=False,
     ):
         if self._search_task is None:
             raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
@@ -4123,11 +4383,12 @@ if response.status_code == 200:
         features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
 
         # To be sure that names with hash suffixes
-        df = df.rename(columns=original_names_dict)
+        clients_features_df = clients_features_df.rename(columns=original_names_dict)
 
         self.feature_names_ = []
         self.external_source_feature_names = []
         self.zero_shap_client_features = []
+        self.unstable_client_features = []
         self.feature_importances_ = []
         features_info = []
         features_info_without_links = []
@@ -4136,10 +4397,10 @@ if response.status_code == 200:
         original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}
 
         for feature_meta in features_meta:
-            if feature_meta.name in original_names_dict.keys():
-                feature_meta.name = original_names_dict[feature_meta.name]
+            original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
+            feature_meta.name = original_name
 
-            is_client_feature = original_names_dict.get(feature_meta.name, feature_meta.name) in df.columns
+            is_client_feature = original_name in clients_features_df.columns
 
             # Show and update shap values for client features only if select_features is True
             if updated_shaps is not None and (not is_client_feature or self.fit_select_features):
@@ -4156,12 +4417,21 @@ if response.status_code == 200:
 
         for feature_meta in features_meta:
             original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
-            is_client_feature = original_name in df.columns
+            is_client_feature = original_name in clients_features_df.columns
 
             if not is_client_feature:
                 self.external_source_feature_names.append(original_name)
 
+            if self.psi_values is not None:
+                if original_name in self.psi_values:
+                    feature_meta.psi_value = self.psi_values[original_name]
+                else:
+                    if is_client_feature and self.fit_select_features:
+                        self.unstable_client_features.append(original_name)
+                    continue
+
             # TODO make a decision about selected features based on special flag from mlb
+
             if original_shaps.get(feature_meta.name, 0.0) == 0.0:
                 if is_client_feature and self.fit_select_features:
                     self.zero_shap_client_features.append(original_name)
@@ -4185,7 +4455,7 @@ if response.status_code == 200:
             self.feature_names_.append(feature_meta.name)
             self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
 
-            df_for_sample = features_df if feature_meta.name in features_df.columns else df
+            df_for_sample = features_df if feature_meta.name in features_df.columns else clients_features_df
             feature_info = FeatureInfo.from_metadata(feature_meta, df_for_sample, is_client_feature)
             features_info.append(feature_info.to_row(self.bundle))
             features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
@@ -4193,6 +4463,8 @@ if response.status_code == 200:
 
         if len(features_info) > 0:
             self.features_info = pd.DataFrame(features_info)
+            if self.features_info[self.bundle.get("features_info_psi")].isna().all():
+                self.features_info.drop(columns=[self.bundle.get("features_info_psi")], inplace=True)
             self._features_info_without_links = pd.DataFrame(features_info_without_links)
             self._internal_features_info = pd.DataFrame(internal_features_info)
             if not silent:
@@ -4640,7 +4912,7 @@ if response.status_code == 200:
             print(msg)
 
     def _validate_PSI(self, df: pd.DataFrame):
-        if EVAL_SET_INDEX in df.columns and (df[EVAL_SET_INDEX] == 1).any():
+        if EVAL_SET_INDEX in df.columns:
             train = df.query(f"{EVAL_SET_INDEX} == 0")
             eval1 = df.query(f"{EVAL_SET_INDEX} == 1")
         else: