upgini 1.2.112__py3-none-any.whl → 1.2.113a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -112,6 +112,7 @@ except Exception:
112
112
  CustomFallbackProgressBar as ProgressBar,
113
113
  )
114
114
 
115
+ from upgini.utils.psi import calculate_features_psi
115
116
  from upgini.utils.sample_utils import SampleColumns, SampleConfig, _num_samples, sample
116
117
  from upgini.utils.sort import sort_columns
117
118
  from upgini.utils.target_utils import calculate_psi, define_task
@@ -297,7 +298,9 @@ class FeaturesEnricher(TransformerMixin):
297
298
  self.feature_names_ = []
298
299
  self.external_source_feature_names = []
299
300
  self.zero_shap_client_features = []
301
+ self.unstable_client_features = []
300
302
  self.feature_importances_ = []
303
+ self.psi_values: Optional[Dict[str, float]] = None
301
304
  self.search_id = search_id
302
305
  self.disable_force_downsampling = disable_force_downsampling
303
306
  self.print_trace_id = print_trace_id
@@ -398,13 +401,26 @@ class FeaturesEnricher(TransformerMixin):
398
401
  @staticmethod
399
402
  def _check_eval_set(eval_set, X, bundle: ResourceBundle):
400
403
  checked_eval_set = []
401
- if eval_set is not None and isinstance(eval_set, tuple):
404
+ if eval_set is None:
405
+ return checked_eval_set
406
+ if isinstance(eval_set, tuple):
402
407
  eval_set = [eval_set]
403
- if eval_set is not None and not isinstance(eval_set, list):
408
+ if not isinstance(eval_set, list):
404
409
  raise ValidationError(bundle.get("unsupported_type_eval_set").format(type(eval_set)))
405
410
  for eval_pair in eval_set or []:
411
+ # Handle OOT
412
+ if isinstance(eval_pair, pd.DataFrame):
413
+ empty_target = pd.Series([np.nan] * len(eval_pair), index=eval_pair.index)
414
+ eval_pair = (eval_pair, empty_target)
415
+ elif isinstance(eval_pair, tuple) and len(eval_pair) == 1:
416
+ empty_target = pd.Series([np.nan] * len(eval_pair[0]), index=eval_pair[0].index)
417
+ eval_pair = (eval_pair[0], empty_target)
418
+
406
419
  if not isinstance(eval_pair, tuple) or len(eval_pair) != 2:
407
420
  raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
421
+ if eval_pair[1] is None:
422
+ empty_target = pd.Series([np.nan] * len(eval_pair[0]), index=eval_pair[0].index)
423
+ eval_pair = (eval_pair[0], empty_target)
408
424
  if not is_frames_equal(X, eval_pair[0], bundle):
409
425
  checked_eval_set.append(eval_pair)
410
426
  return checked_eval_set
@@ -426,6 +442,7 @@ class FeaturesEnricher(TransformerMixin):
426
442
  search_id_callback: Optional[Callable[[str], Any]] = None,
427
443
  select_features: bool = True,
428
444
  auto_fe_parameters: Optional[AutoFEParameters] = None,
445
+ stability_threshold: float = 0.15,
429
446
  **kwargs,
430
447
  ):
431
448
  """Fit to data.
@@ -515,6 +532,7 @@ class FeaturesEnricher(TransformerMixin):
515
532
  estimator=estimator,
516
533
  scoring=scoring,
517
534
  importance_threshold=importance_threshold,
535
+ stability_threshold=stability_threshold,
518
536
  max_features=max_features,
519
537
  remove_outliers_calc_metrics=remove_outliers_calc_metrics,
520
538
  auto_fe_parameters=auto_fe_parameters,
@@ -574,6 +592,7 @@ class FeaturesEnricher(TransformerMixin):
574
592
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
575
593
  select_features: bool = True,
576
594
  auto_fe_parameters: Optional[AutoFEParameters] = None,
595
+ stability_threshold: float = 0.15,
577
596
  **kwargs,
578
597
  ) -> pd.DataFrame:
579
598
  """Fit to data, then transform it.
@@ -618,6 +637,10 @@ class FeaturesEnricher(TransformerMixin):
618
637
  If True, return only selected features both from input and data sources.
619
638
  Otherwise, return all features from input and only selected features from data sources.
620
639
 
640
+ stability_threshold: float, optional (default=0.15)
641
+ Stability threshold for selected features PSI calculation. If PSI is greater than this threshold,
642
+ then the feature will be dropped.
643
+
621
644
  Returns
622
645
  -------
623
646
  X_new: pandas.DataFrame of shape (n_samples, n_features_new)
@@ -674,6 +697,7 @@ class FeaturesEnricher(TransformerMixin):
674
697
  scoring=scoring,
675
698
  estimator=estimator,
676
699
  importance_threshold=importance_threshold,
700
+ stability_threshold=stability_threshold,
677
701
  max_features=max_features,
678
702
  remove_outliers_calc_metrics=remove_outliers_calc_metrics,
679
703
  auto_fe_parameters=auto_fe_parameters,
@@ -941,7 +965,7 @@ class FeaturesEnricher(TransformerMixin):
941
965
  raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
942
966
 
943
967
  validated_X, validated_y, validated_eval_set = self._validate_train_eval(
944
- effective_X, effective_y, effective_eval_set
968
+ effective_X, effective_y, effective_eval_set, silent=internal_call
945
969
  )
946
970
 
947
971
  if self.X is None:
@@ -1033,6 +1057,7 @@ class FeaturesEnricher(TransformerMixin):
1033
1057
  groups,
1034
1058
  _cv,
1035
1059
  columns_renaming,
1060
+ eval_set_dates,
1036
1061
  ) = prepared_data
1037
1062
 
1038
1063
  # rename cat_features
@@ -1074,9 +1099,9 @@ class FeaturesEnricher(TransformerMixin):
1074
1099
  self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
1075
1100
 
1076
1101
  has_date = self._get_date_column(search_keys) is not None
1077
- has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
1078
1102
  model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
1079
1103
  cat_features = list(set(client_cat_features + cat_features_from_backend))
1104
+ has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
1080
1105
  baseline_cat_features = [f for f in cat_features if f in fitting_X.columns]
1081
1106
  enriched_cat_features = [f for f in cat_features if f in fitting_enriched_X.columns]
1082
1107
  if len(enriched_cat_features) < len(cat_features):
@@ -1196,8 +1221,6 @@ class FeaturesEnricher(TransformerMixin):
1196
1221
  # max_initial_eval_set_hit_rate = self._search_task.get_max_initial_eval_set_hit_rate_v2()
1197
1222
  if len(fitting_eval_set_dict) > 0:
1198
1223
  for idx in fitting_eval_set_dict.keys():
1199
- # eval_hit_rate = max_initial_eval_set_hit_rate[idx + 1]
1200
-
1201
1224
  (
1202
1225
  eval_X_sorted,
1203
1226
  eval_y_sorted,
@@ -1205,6 +1228,10 @@ class FeaturesEnricher(TransformerMixin):
1205
1228
  enriched_eval_y_sorted,
1206
1229
  ) = fitting_eval_set_dict[idx]
1207
1230
 
1231
+ if eval_y_sorted.isna().all():
1232
+ # Skip OOT eval set
1233
+ continue
1234
+
1208
1235
  if baseline_estimator is not None:
1209
1236
  self.logger.info(
1210
1237
  f"Calculate baseline {metric} on eval set {idx + 1} "
@@ -1247,17 +1274,14 @@ class FeaturesEnricher(TransformerMixin):
1247
1274
  "quality_metrics_eval_segment"
1248
1275
  ).format(idx + 1),
1249
1276
  self.bundle.get("quality_metrics_rows_header"): _num_samples(
1250
- # effective_eval_set[idx][0]
1251
1277
  # Use the dataset actually used for metrics
1252
1278
  eval_X_sorted
1253
1279
  ),
1254
- # self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
1255
1280
  }
1256
1281
  if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
1257
1282
  eval_y_sorted
1258
1283
  ):
1259
1284
  eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
1260
- # np.mean(validated_eval_set[idx][1]), 4
1261
1285
  # Use the dataset actually used for metrics
1262
1286
  np.mean(eval_y_sorted),
1263
1287
  4,
@@ -1330,6 +1354,199 @@ class FeaturesEnricher(TransformerMixin):
1330
1354
  finally:
1331
1355
  self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
1332
1356
 
1357
+ def _select_features_by_psi(
1358
+ self,
1359
+ trace_id: str,
1360
+ X: Union[pd.DataFrame, pd.Series, np.ndarray],
1361
+ y: Union[pd.DataFrame, pd.Series, np.ndarray, List],
1362
+ eval_set: Optional[Union[List[tuple], tuple]],
1363
+ stability_threshold: float,
1364
+ cv: Union[BaseCrossValidator, CVType, str, None] = None,
1365
+ estimator=None,
1366
+ exclude_features_sources: Optional[List[str]] = None,
1367
+ importance_threshold: Optional[float] = None,
1368
+ max_features: Optional[int] = None,
1369
+ progress_bar: bool = True,
1370
+ progress_callback: Optional[Callable] = None,
1371
+ ):
1372
+ search_keys = self.search_keys.copy()
1373
+ validated_X, _, validated_eval_set = self._validate_train_eval(X, y, eval_set, silent=True)
1374
+ if isinstance(X, np.ndarray):
1375
+ search_keys = {str(k): v for k, v in search_keys.items()}
1376
+
1377
+ has_date = self._get_date_column(search_keys) is not None
1378
+ if not has_date or not validated_eval_set:
1379
+ self.logger.info("No date column or eval set for OOT psi calculation")
1380
+ return
1381
+
1382
+ cat_features_from_backend = self.__get_categorical_features()
1383
+ client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
1384
+ estimator, validated_X, search_keys
1385
+ )
1386
+ if self.id_columns and self.id_columns_encoder is not None:
1387
+ if cat_features_from_backend:
1388
+ cat_features_from_backend = [
1389
+ c
1390
+ for c in cat_features_from_backend
1391
+ if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
1392
+ ]
1393
+ if client_cat_features:
1394
+ client_cat_features = [
1395
+ c
1396
+ for c in client_cat_features
1397
+ if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
1398
+ ]
1399
+
1400
+ prepared_data = self._prepare_data_for_metrics(
1401
+ trace_id=trace_id,
1402
+ X=X,
1403
+ y=y,
1404
+ eval_set=eval_set,
1405
+ exclude_features_sources=exclude_features_sources,
1406
+ importance_threshold=importance_threshold,
1407
+ max_features=max_features,
1408
+ remove_outliers_calc_metrics=False,
1409
+ cv_override=cv,
1410
+ search_keys_for_metrics=search_keys_for_metrics,
1411
+ progress_bar=progress_bar,
1412
+ progress_callback=progress_callback,
1413
+ client_cat_features=client_cat_features,
1414
+ )
1415
+ if prepared_data is None:
1416
+ return None
1417
+
1418
+ (
1419
+ validated_X,
1420
+ fitting_X,
1421
+ y_sorted,
1422
+ fitting_enriched_X,
1423
+ _,
1424
+ fitting_eval_set_dict,
1425
+ _,
1426
+ _,
1427
+ _,
1428
+ columns_renaming,
1429
+ eval_set_dates,
1430
+ ) = prepared_data
1431
+
1432
+ # rename cat_features
1433
+ if client_cat_features:
1434
+ for new_c, old_c in columns_renaming.items():
1435
+ if old_c in client_cat_features:
1436
+ client_cat_features.remove(old_c)
1437
+ client_cat_features.append(new_c)
1438
+ for cat_feature in client_cat_features:
1439
+ if cat_feature not in fitting_X.columns:
1440
+ self.logger.error(
1441
+ f"Client cat_feature `{cat_feature}` not found in" f" x columns: {fitting_X.columns.to_list()}"
1442
+ )
1443
+ else:
1444
+ client_cat_features = []
1445
+
1446
+ model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
1447
+ cat_features = list(set(client_cat_features + cat_features_from_backend))
1448
+
1449
+ # Drop unstable features
1450
+ unstable_features = self._check_stability(
1451
+ validated_X,
1452
+ validated_eval_set,
1453
+ fitting_eval_set_dict,
1454
+ eval_set_dates,
1455
+ search_keys,
1456
+ stability_threshold,
1457
+ cat_features,
1458
+ model_task_type,
1459
+ )
1460
+ client_features_df = self.df_with_original_index.rename(columns=columns_renaming)
1461
+ # decoded_X = self._decode_id_columns(fitting_X, columns_renaming)
1462
+ self._update_report_psi(trace_id, client_features_df)
1463
+
1464
+ if unstable_features:
1465
+ msg = f"Some features are unstable: {unstable_features} and will be dropped"
1466
+ self.logger.warning(msg)
1467
+ print(msg)
1468
+ fitting_X = fitting_X.drop(columns=unstable_features, errors="ignore")
1469
+ fitting_enriched_X = fitting_enriched_X.drop(columns=unstable_features, errors="ignore")
1470
+ msg = f"Threre are {len(fitting_enriched_X.columns)} stable selected features left"
1471
+ self.logger.info(msg)
1472
+ print(msg)
1473
+ for idx, (
1474
+ eval_X,
1475
+ eval_y,
1476
+ eval_enriched_X,
1477
+ eval_enriched_y,
1478
+ ) in fitting_eval_set_dict.items():
1479
+ eval_X = eval_X.drop(columns=unstable_features, errors="ignore")
1480
+ eval_enriched_X = eval_enriched_X.drop(columns=unstable_features, errors="ignore")
1481
+ fitting_eval_set_dict[idx] = (eval_X, eval_y, eval_enriched_X, eval_enriched_y)
1482
+
1483
+ def _check_stability(
1484
+ self,
1485
+ X: pd.DataFrame,
1486
+ eval_set: List[Tuple[pd.DataFrame, pd.Series]],
1487
+ enriched_eval_set: Dict,
1488
+ eval_set_dates: Dict[int, pd.Series],
1489
+ search_keys: Dict[str, SearchKey],
1490
+ stability_threshold: float,
1491
+ cat_features: List[str],
1492
+ model_task_type: ModelTaskType,
1493
+ ) -> List[str]:
1494
+ # Find latest eval set or earliest if all eval sets are before train set
1495
+ date_column = self._get_date_column(search_keys)
1496
+
1497
+ if (
1498
+ date_column is None
1499
+ or not eval_set
1500
+ or not eval_set_dates
1501
+ or (self.cv is not None and self.cv.is_time_series())
1502
+ ):
1503
+ return []
1504
+
1505
+ # Get minimum date from main dataset X
1506
+ main_min_date = X[date_column].min()
1507
+
1508
+ # Find minimum date for each eval_set and compare with main dataset
1509
+ eval_dates = []
1510
+ for i, (eval_x, _) in enumerate(eval_set):
1511
+ if date_column in eval_x.columns:
1512
+ eval_min_date = eval_x[date_column].min()
1513
+ eval_max_date = eval_x[date_column].max()
1514
+ eval_dates.append((i, eval_min_date, eval_max_date))
1515
+
1516
+ if not eval_dates:
1517
+ return []
1518
+
1519
+ # Check if any eval_set has minimum date >= main dataset minimum date
1520
+ later_eval_sets = [(i, min_date, max_date) for i, min_date, max_date in eval_dates if min_date >= main_min_date]
1521
+
1522
+ if later_eval_sets:
1523
+ # If there are eval_sets with date >= main date, choose the one with highest maximum date
1524
+ selected_eval_set_idx = max(later_eval_sets, key=lambda x: x[2])[0]
1525
+ else:
1526
+ # If all eval_sets have dates < main date, choose the one with lowest minimum date (NOTE(review): code below uses max(), not min() - confirm intent)
1527
+ selected_eval_set_idx = max(eval_dates, key=lambda x: x[1])[0]
1528
+
1529
+ checking_eval_set = enriched_eval_set[selected_eval_set_idx]
1530
+
1531
+ checking_eval_set_df = (
1532
+ checking_eval_set[2]
1533
+ if checking_eval_set[1] is None or checking_eval_set[1].isna().all()
1534
+ else pd.concat([checking_eval_set[2], checking_eval_set[1].to_frame(TARGET)], axis=1)
1535
+ )
1536
+ checking_eval_set_df = checking_eval_set_df.copy()
1537
+
1538
+ checking_eval_set_df[date_column] = eval_set_dates[selected_eval_set_idx]
1539
+
1540
+ psi_values = calculate_features_psi(
1541
+ checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
1542
+ )
1543
+
1544
+ self.psi_values = {
1545
+ feature: psi_value for feature, psi_value in psi_values.items() if psi_value <= stability_threshold
1546
+ }
1547
+
1548
+ return [feature for feature, psi in psi_values.items() if psi > stability_threshold]
1549
+
1333
1550
  def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float], silent: bool = False):
1334
1551
  renaming = self.fit_columns_renaming or {}
1335
1552
  self.logger.info(f"Updating SHAP values: {new_shaps}")
@@ -1385,6 +1602,56 @@ class FeaturesEnricher(TransformerMixin):
1385
1602
  except (ImportError, NameError):
1386
1603
  pass
1387
1604
 
1605
+ def _update_report_psi(self, trace_id: str, clients_features_df: pd.DataFrame):
1606
+ self.__prepare_feature_importances(trace_id, clients_features_df)
1607
+
1608
+ if self.features_info_display_handle is not None:
1609
+ try:
1610
+ _ = get_ipython() # type: ignore
1611
+
1612
+ display_html_dataframe(
1613
+ self.features_info,
1614
+ self._features_info_without_links,
1615
+ self.bundle.get("relevant_features_header"),
1616
+ display_handle=self.features_info_display_handle,
1617
+ )
1618
+ except (ImportError, NameError):
1619
+ pass
1620
+
1621
+ if self.data_sources_display_handle is not None:
1622
+ try:
1623
+ _ = get_ipython() # type: ignore
1624
+
1625
+ display_html_dataframe(
1626
+ self.relevant_data_sources,
1627
+ self._relevant_data_sources_wo_links,
1628
+ self.bundle.get("relevant_data_sources_header"),
1629
+ display_handle=self.data_sources_display_handle,
1630
+ )
1631
+ except (ImportError, NameError):
1632
+ pass
1633
+
1634
+ if self.autofe_features_display_handle is not None:
1635
+ try:
1636
+ _ = get_ipython() # type: ignore
1637
+ autofe_descriptions_df = self.get_autofe_features_description()
1638
+ if autofe_descriptions_df is not None:
1639
+ display_html_dataframe(
1640
+ df=autofe_descriptions_df,
1641
+ internal_df=autofe_descriptions_df,
1642
+ header=self.bundle.get("autofe_descriptions_header"),
1643
+ display_handle=self.autofe_features_display_handle,
1644
+ )
1645
+ except (ImportError, NameError):
1646
+ pass
1647
+ if self.report_button_handle is not None:
1648
+ try:
1649
+ _ = get_ipython() # type: ignore
1650
+
1651
+ self.__show_report_button(display_handle=self.report_button_handle)
1652
+ except (ImportError, NameError):
1653
+ pass
1654
+
1388
1655
  def _check_train_and_eval_target_distribution(self, y, eval_set_dict):
1389
1656
  uneven_distribution = False
1390
1657
  for eval_set in eval_set_dict.values():
@@ -1535,7 +1802,7 @@ class FeaturesEnricher(TransformerMixin):
1535
1802
  is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
1536
1803
  is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
1537
1804
  checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
1538
- validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set)
1805
+ validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set, silent=True)
1539
1806
 
1540
1807
  sampled_data = self._get_enriched_for_metrics(
1541
1808
  trace_id,
@@ -1549,7 +1816,7 @@ class FeaturesEnricher(TransformerMixin):
1549
1816
  progress_bar,
1550
1817
  progress_callback,
1551
1818
  )
1552
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = dataclasses.astuple(
1819
+ (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming) = dataclasses.astuple(
1553
1820
  sampled_data
1554
1821
  )
1555
1822
 
@@ -1572,8 +1839,7 @@ class FeaturesEnricher(TransformerMixin):
1572
1839
  or c in set(self.feature_names_).union(self.id_columns or [])
1573
1840
  or (self.fit_columns_renaming or {}).get(c, c) in set(self.feature_names_).union(self.id_columns or [])
1574
1841
  )
1575
- and c
1576
- not in (
1842
+ and c not in (
1577
1843
  excluding_search_keys
1578
1844
  + list(self.fit_dropped_features)
1579
1845
  + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
@@ -1672,12 +1938,16 @@ class FeaturesEnricher(TransformerMixin):
1672
1938
  fitting_enriched_X = fitting_enriched_X[fitting_enriched_x_columns]
1673
1939
  fitting_enriched_X, _ = self._encode_id_columns(fitting_enriched_X, self.fit_columns_renaming)
1674
1940
  self.logger.info(f"Final sorted list of fitting enriched X columns: {fitting_enriched_x_columns}")
1941
+ date_column = self._get_date_column(search_keys)
1942
+ eval_set_dates = {}
1675
1943
  for idx, eval_tuple in eval_set_sampled_dict.items():
1676
1944
  eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
1677
1945
  eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
1678
1946
  enriched_eval_X_sorted, enriched_eval_y_sorted = self._sort_by_system_record_id(
1679
1947
  enriched_eval_X, eval_y_sampled, self.cv
1680
1948
  )
1949
+ if date_column is not None:
1950
+ eval_set_dates[idx] = eval_X_sorted[date_column]
1681
1951
  fitting_eval_X = eval_X_sorted[fitting_x_columns].copy()
1682
1952
  fitting_enriched_eval_X = enriched_eval_X_sorted[fitting_enriched_x_columns].copy()
1683
1953
 
@@ -1722,6 +1992,7 @@ class FeaturesEnricher(TransformerMixin):
1722
1992
  groups,
1723
1993
  cv,
1724
1994
  columns_renaming,
1995
+ eval_set_dates,
1725
1996
  )
1726
1997
 
1727
1998
  @dataclass
@@ -1884,14 +2155,16 @@ class FeaturesEnricher(TransformerMixin):
1884
2155
  remove_outliers_calc_metrics: Optional[bool],
1885
2156
  ) -> _EnrichedDataForMetrics:
1886
2157
  eval_set_sampled_dict = {}
1887
- search_keys = self.fit_search_keys
2158
+ search_keys = self.fit_search_keys.copy()
1888
2159
 
1889
2160
  rows_to_drop = None
1890
2161
  has_date = self._get_date_column(search_keys) is not None
1891
2162
  self.model_task_type = self.model_task_type or define_task(
1892
2163
  self.df_with_original_index[TARGET], has_date, self.logger, silent=True
1893
2164
  )
1894
- if self.model_task_type == ModelTaskType.REGRESSION:
2165
+ if remove_outliers_calc_metrics is None:
2166
+ remove_outliers_calc_metrics = True
2167
+ if self.model_task_type == ModelTaskType.REGRESSION and remove_outliers_calc_metrics:
1895
2168
  target_outliers_df = self._search_task.get_target_outliers(trace_id)
1896
2169
  if target_outliers_df is not None and len(target_outliers_df) > 0:
1897
2170
  outliers = pd.merge(
@@ -1901,11 +2174,8 @@ class FeaturesEnricher(TransformerMixin):
1901
2174
  how="inner",
1902
2175
  )
1903
2176
  top_outliers = outliers.sort_values(by=TARGET, ascending=False)[TARGET].head(3)
1904
- if remove_outliers_calc_metrics is None or remove_outliers_calc_metrics is True:
1905
- rows_to_drop = outliers
1906
- not_msg = ""
1907
- else:
1908
- not_msg = "not "
2177
+ rows_to_drop = outliers
2178
+ not_msg = ""
1909
2179
  msg = self.bundle.get("target_outliers_warning").format(len(target_outliers_df), top_outliers, not_msg)
1910
2180
  print(msg)
1911
2181
  self.logger.warning(msg)
@@ -1963,12 +2233,13 @@ class FeaturesEnricher(TransformerMixin):
1963
2233
  enriched_eval_X = enriched_eval_sets[idx + 1][enriched_X_columns].copy()
1964
2234
  eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
1965
2235
 
1966
- reversed_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
1967
- X_sampled.rename(columns=reversed_renaming, inplace=True)
1968
- enriched_X.rename(columns=reversed_renaming, inplace=True)
2236
+ # reversed_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
2237
+ X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
2238
+ enriched_X.rename(columns=self.fit_columns_renaming, inplace=True)
1969
2239
  for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
1970
- eval_X_sampled.rename(columns=reversed_renaming, inplace=True)
1971
- enriched_eval_X.rename(columns=reversed_renaming, inplace=True)
2240
+ eval_X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
2241
+ enriched_eval_X.rename(columns=self.fit_columns_renaming, inplace=True)
2242
+ search_keys = {self.fit_columns_renaming.get(k, k): v for k, v in search_keys.items()}
1972
2243
 
1973
2244
  datasets_hash = hash_input(self.X, self.y, self.eval_set)
1974
2245
  return self.__cache_and_return_results(
@@ -2112,7 +2383,7 @@ class FeaturesEnricher(TransformerMixin):
2112
2383
 
2113
2384
  def __extract_eval_data(
2114
2385
  self, enriched_df: pd.DataFrame, x_columns: List[str], enriched_X_columns: List[str], eval_set_len: int
2115
- ) -> Dict[int, Tuple]:
2386
+ ) -> Tuple[Dict[int, Tuple], Dict[int, pd.Series]]:
2116
2387
  eval_set_sampled_dict = {}
2117
2388
 
2118
2389
  for idx in range(eval_set_len):
@@ -2158,12 +2429,12 @@ class FeaturesEnricher(TransformerMixin):
2158
2429
  columns_renaming: Dict[str, str],
2159
2430
  ):
2160
2431
  # X_sampled - with hash-suffixes
2161
- reversed_renaming = {v: k for k, v in columns_renaming.items()}
2162
- search_keys = {
2163
- reversed_renaming.get(k, k): v
2164
- for k, v in search_keys.items()
2165
- if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
2166
- }
2432
+ # reversed_renaming = {v: k for k, v in columns_renaming.items()}
2433
+ # search_keys = {
2434
+ # reversed_renaming.get(k, k): v
2435
+ # for k, v in search_keys.items()
2436
+ # if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
2437
+ # }
2167
2438
  return FeaturesEnricher._EnrichedDataForMetrics(
2168
2439
  X_sampled=X_sampled,
2169
2440
  y_sampled=y_sampled,
@@ -2313,7 +2584,7 @@ if response.status_code == 200:
2313
2584
  self.logger.info("Start transform")
2314
2585
 
2315
2586
  validated_X, validated_y, validated_eval_set = self._validate_train_eval(
2316
- X, y, eval_set=None, is_transform=True
2587
+ X, y, eval_set=None, is_transform=True, silent=True
2317
2588
  )
2318
2589
  df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
2319
2590
 
@@ -2415,7 +2686,7 @@ if response.status_code == 200:
2415
2686
  else:
2416
2687
  self.logger.info("Input dataset hasn't date column")
2417
2688
  if self.__should_add_date_column():
2418
- df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
2689
+ df = self._add_current_date_as_key(df, search_keys, self.bundle, silent=True)
2419
2690
 
2420
2691
  email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
2421
2692
  if email_columns and self.generate_search_key_features:
@@ -2664,7 +2935,8 @@ if response.status_code == 200:
2664
2935
  selecting_columns = [
2665
2936
  c
2666
2937
  for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
2667
- if c not in self.zero_shap_client_features or c in (self.id_columns or [])
2938
+ if (c not in self.zero_shap_client_features and c not in self.unstable_client_features)
2939
+ or c in (self.id_columns or [])
2668
2940
  ]
2669
2941
  selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
2670
2942
  if add_fit_system_record_id:
@@ -2798,6 +3070,7 @@ if response.status_code == 200:
2798
3070
  scoring: Union[Callable, str, None],
2799
3071
  estimator: Optional[Any],
2800
3072
  importance_threshold: Optional[float],
3073
+ stability_threshold: float,
2801
3074
  max_features: Optional[int],
2802
3075
  remove_outliers_calc_metrics: Optional[bool],
2803
3076
  auto_fe_parameters: AutoFEParameters,
@@ -2812,6 +3085,7 @@ if response.status_code == 200:
2812
3085
  self.fit_columns_renaming = None
2813
3086
  self.fit_dropped_features = set()
2814
3087
  self.fit_generated_features = []
3088
+ self.psi_values = None
2815
3089
 
2816
3090
  validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, eval_set)
2817
3091
 
@@ -2908,7 +3182,7 @@ if response.status_code == 200:
2908
3182
  self.logger.info("Input dataset hasn't date column")
2909
3183
  # TODO remove when this logic will be implemented on the back
2910
3184
  if self.__should_add_date_column():
2911
- df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
3185
+ df = self._add_current_date_as_key(df, self.fit_search_keys, self.bundle)
2912
3186
 
2913
3187
  email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
2914
3188
  if email_columns and self.generate_search_key_features:
@@ -2923,10 +3197,13 @@ if response.status_code == 200:
2923
3197
  except Exception:
2924
3198
  self.logger.exception("Failed to check dates distribution validity")
2925
3199
 
3200
+ self.__adjust_cv(df)
3201
+
2926
3202
  if (
2927
3203
  is_numeric_dtype(df[self.TARGET_NAME])
2928
3204
  and self.model_task_type in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]
2929
3205
  and has_date
3206
+ and not self.cv.is_time_series()
2930
3207
  ):
2931
3208
  self._validate_PSI(df.sort_values(by=maybe_date_column))
2932
3209
 
@@ -2958,8 +3235,9 @@ if response.status_code == 200:
2958
3235
 
2959
3236
  self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
2960
3237
 
3238
+ # Group columns should have normalized names
3239
+ self.cv = None
2961
3240
  self.__adjust_cv(df)
2962
-
2963
3241
  if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
2964
3242
  id_columns = self.__get_renamed_id_columns()
2965
3243
  if id_columns:
@@ -3164,6 +3442,21 @@ if response.status_code == 200:
3164
3442
  display_id=f"autofe_descriptions_{uuid.uuid4()}",
3165
3443
  )
3166
3444
 
3445
+ self._select_features_by_psi(
3446
+ trace_id=trace_id,
3447
+ X=X,
3448
+ y=y,
3449
+ eval_set=eval_set,
3450
+ stability_threshold=stability_threshold,
3451
+ cv=self.cv,
3452
+ estimator=estimator,
3453
+ exclude_features_sources=exclude_features_sources,
3454
+ importance_threshold=importance_threshold,
3455
+ max_features=max_features,
3456
+ progress_bar=progress_bar,
3457
+ progress_callback=progress_callback,
3458
+ )
3459
+
3167
3460
  if self._has_paid_features(exclude_features_sources):
3168
3461
  if calculate_metrics is not None and calculate_metrics:
3169
3462
  msg = self.bundle.get("metrics_with_paid_features")
@@ -3289,10 +3582,11 @@ if response.status_code == 200:
3289
3582
  y: Optional[pd.Series] = None,
3290
3583
  eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]] = None,
3291
3584
  is_transform: bool = False,
3585
+ silent: bool = False,
3292
3586
  ) -> Tuple[pd.DataFrame, pd.Series, Optional[List[Tuple[pd.DataFrame, pd.Series]]]]:
3293
3587
  validated_X = self._validate_X(X, is_transform)
3294
3588
  validated_y = self._validate_y(validated_X, y, enforce_y=not is_transform)
3295
- validated_eval_set = self._validate_eval_set(validated_X, eval_set)
3589
+ validated_eval_set = self._validate_eval_set(validated_X, eval_set, silent=silent)
3296
3590
  return validated_X, validated_y, validated_eval_set
3297
3591
 
3298
3592
  def _encode_id_columns(
@@ -3424,10 +3718,30 @@ if response.status_code == 200:
3424
3718
 
3425
3719
  return validated_y
3426
3720
 
3427
- def _validate_eval_set(self, X: pd.DataFrame, eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]]):
3721
+ def _validate_eval_set(
3722
+ self, X: pd.DataFrame, eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]], silent: bool = False
3723
+ ):
3428
3724
  if eval_set is None:
3429
3725
  return None
3430
- return [self._validate_eval_set_pair(X, eval_pair) for eval_pair in eval_set]
3726
+ validated_eval_set = []
3727
+ has_date = self._get_date_column(self.search_keys) is not None
3728
+ for idx, eval_pair in enumerate(eval_set):
3729
+ validated_pair = self._validate_eval_set_pair(X, eval_pair)
3730
+ if validated_pair[1].isna().all():
3731
+ if not has_date:
3732
+ msg = self.bundle.get("oot_without_date_not_supported").format(idx + 1)
3733
+ elif self.columns_for_online_api:
3734
+ msg = self.bundle.get("oot_with_online_sources_not_supported").format(idx + 1)
3735
+ else:
3736
+ msg = None
3737
+ if msg:
3738
+ if not silent:
3739
+ print(msg)
3740
+ self.logger.warning(msg)
3741
+ continue
3742
+ validated_eval_set.append(validated_pair)
3743
+
3744
+ return validated_eval_set
3431
3745
 
3432
3746
  def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
3433
3747
  if len(eval_pair) != 2:
@@ -3502,16 +3816,18 @@ if response.status_code == 200:
3502
3816
  raise ValidationError(self.bundle.get("unsupported_y_type_eval_set").format(type(eval_y)))
3503
3817
 
3504
3818
  eval_y_nunique = validated_eval_y.nunique()
3505
- if eval_y_nunique < 2:
3819
+ is_oot = validated_eval_y.isna().all()
3820
+ if not is_oot and eval_y_nunique < 2:
3506
3821
  raise ValidationError(self.bundle.get("y_is_constant_eval_set"))
3507
3822
 
3508
- if self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
3823
+ if not is_oot and self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
3509
3824
  raise ValidationError(self.bundle.get("binary_target_eval_unique_count_not_2").format(eval_y_nunique))
3510
3825
 
3511
- # Check for duplicates between train and eval sets by comparing all values
3512
- train_eval_intersection = pd.merge(X, validated_eval_X, how="inner")
3513
- if len(train_eval_intersection) > 0:
3514
- raise ValidationError(self.bundle.get("eval_x_has_train_samples"))
3826
+ if not is_oot:
3827
+ # Check for duplicates between train and eval sets by comparing all values
3828
+ train_eval_intersection = pd.merge(X, validated_eval_X, how="inner")
3829
+ if len(train_eval_intersection) > 0:
3830
+ raise ValidationError(self.bundle.get("eval_x_has_train_samples"))
3515
3831
 
3516
3832
  return validated_eval_X, validated_eval_y
3517
3833
 
@@ -3527,10 +3843,12 @@ if response.status_code == 200:
3527
3843
  if isinstance(eval_set, tuple):
3528
3844
  eval_set = [eval_set]
3529
3845
  for eval in eval_set:
3530
- if self.baseline_score_column not in eval[0].columns:
3531
- raise ValidationError(self.bundle.get("baseline_score_column_not_exists"))
3532
- if eval[0][self.baseline_score_column].isna().any():
3533
- raise ValidationError(self.bundle.get("baseline_score_column_has_na"))
3846
+ is_oot = eval[1].isna().all()
3847
+ if not is_oot:
3848
+ if self.baseline_score_column not in eval[0].columns:
3849
+ raise ValidationError(self.bundle.get("baseline_score_column_not_exists"))
3850
+ if eval[0][self.baseline_score_column].isna().any():
3851
+ raise ValidationError(self.bundle.get("baseline_score_column_has_na"))
3534
3852
 
3535
3853
  @staticmethod
3536
3854
  def _sample_X_and_y(X: pd.DataFrame, y: pd.Series, enriched_X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
@@ -3704,7 +4022,7 @@ if response.status_code == 200:
3704
4022
  return df
3705
4023
 
3706
4024
  def _add_current_date_as_key(
3707
- self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
4025
+ self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], bundle: ResourceBundle, silent: bool = False
3708
4026
  ) -> pd.DataFrame:
3709
4027
  if (
3710
4028
  set(search_keys.values()) == {SearchKey.PHONE}
@@ -3712,7 +4030,8 @@ if response.status_code == 200:
3712
4030
  or set(search_keys.values()) == {SearchKey.HEM}
3713
4031
  or set(search_keys.values()) == {SearchKey.COUNTRY, SearchKey.POSTAL_CODE}
3714
4032
  ):
3715
- self.__log_warning(bundle.get("current_date_added"))
4033
+ if not silent:
4034
+ self.__log_warning(bundle.get("current_date_added"))
3716
4035
  df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
3717
4036
  search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
3718
4037
  converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, generate_cyclical_features=False)
@@ -4047,7 +4366,11 @@ if response.status_code == 200:
4047
4366
  return [f.name for f in features_meta if f.type == "categorical" and f.name not in (self.id_columns or [])]
4048
4367
 
4049
4368
  def __prepare_feature_importances(
4050
- self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
4369
+ self,
4370
+ trace_id: str,
4371
+ clients_features_df: pd.DataFrame,
4372
+ updated_shaps: Optional[Dict[str, float]] = None,
4373
+ silent=False,
4051
4374
  ):
4052
4375
  if self._search_task is None:
4053
4376
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
@@ -4060,11 +4383,12 @@ if response.status_code == 200:
4060
4383
  features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
4061
4384
 
4062
4385
  # To be sure that names with hash suffixes
4063
- df = df.rename(columns=original_names_dict)
4386
+ clients_features_df = clients_features_df.rename(columns=original_names_dict)
4064
4387
 
4065
4388
  self.feature_names_ = []
4066
4389
  self.external_source_feature_names = []
4067
4390
  self.zero_shap_client_features = []
4391
+ self.unstable_client_features = []
4068
4392
  self.feature_importances_ = []
4069
4393
  features_info = []
4070
4394
  features_info_without_links = []
@@ -4073,10 +4397,10 @@ if response.status_code == 200:
4073
4397
  original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}
4074
4398
 
4075
4399
  for feature_meta in features_meta:
4076
- if feature_meta.name in original_names_dict.keys():
4077
- feature_meta.name = original_names_dict[feature_meta.name]
4400
+ original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
4401
+ feature_meta.name = original_name
4078
4402
 
4079
- is_client_feature = original_names_dict.get(feature_meta.name, feature_meta.name) in df.columns
4403
+ is_client_feature = original_name in clients_features_df.columns
4080
4404
 
4081
4405
  # Show and update shap values for client features only if select_features is True
4082
4406
  if updated_shaps is not None and (not is_client_feature or self.fit_select_features):
@@ -4093,12 +4417,21 @@ if response.status_code == 200:
4093
4417
 
4094
4418
  for feature_meta in features_meta:
4095
4419
  original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
4096
- is_client_feature = original_name in df.columns
4420
+ is_client_feature = original_name in clients_features_df.columns
4097
4421
 
4098
4422
  if not is_client_feature:
4099
4423
  self.external_source_feature_names.append(original_name)
4100
4424
 
4425
+ if self.psi_values is not None:
4426
+ if original_name in self.psi_values:
4427
+ feature_meta.psi_value = self.psi_values[original_name]
4428
+ else:
4429
+ if is_client_feature and self.fit_select_features:
4430
+ self.unstable_client_features.append(original_name)
4431
+ continue
4432
+
4101
4433
  # TODO make a decision about selected features based on special flag from mlb
4434
+
4102
4435
  if original_shaps.get(feature_meta.name, 0.0) == 0.0:
4103
4436
  if is_client_feature and self.fit_select_features:
4104
4437
  self.zero_shap_client_features.append(original_name)
@@ -4122,7 +4455,7 @@ if response.status_code == 200:
4122
4455
  self.feature_names_.append(feature_meta.name)
4123
4456
  self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
4124
4457
 
4125
- df_for_sample = features_df if feature_meta.name in features_df.columns else df
4458
+ df_for_sample = features_df if feature_meta.name in features_df.columns else clients_features_df
4126
4459
  feature_info = FeatureInfo.from_metadata(feature_meta, df_for_sample, is_client_feature)
4127
4460
  features_info.append(feature_info.to_row(self.bundle))
4128
4461
  features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
@@ -4130,6 +4463,8 @@ if response.status_code == 200:
4130
4463
 
4131
4464
  if len(features_info) > 0:
4132
4465
  self.features_info = pd.DataFrame(features_info)
4466
+ if self.features_info[self.bundle.get("features_info_psi")].isna().all():
4467
+ self.features_info.drop(columns=[self.bundle.get("features_info_psi")], inplace=True)
4133
4468
  self._features_info_without_links = pd.DataFrame(features_info_without_links)
4134
4469
  self._internal_features_info = pd.DataFrame(internal_features_info)
4135
4470
  if not silent: