upgini 1.2.112__py3-none-any.whl → 1.2.113a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +394 -59
- upgini/metadata.py +1 -0
- upgini/metrics.py +4 -1
- upgini/resource_bundle/strings.properties +4 -1
- upgini/sampler/base.py +3 -1
- upgini/sampler/random_under_sampler.py +18 -8
- upgini/utils/deduplicate_utils.py +43 -7
- upgini/utils/feature_info.py +5 -0
- upgini/utils/psi.py +268 -0
- {upgini-1.2.112.dist-info → upgini-1.2.113a2.dist-info}/METADATA +1 -1
- {upgini-1.2.112.dist-info → upgini-1.2.113a2.dist-info}/RECORD +14 -13
- {upgini-1.2.112.dist-info → upgini-1.2.113a2.dist-info}/WHEEL +0 -0
- {upgini-1.2.112.dist-info → upgini-1.2.113a2.dist-info}/licenses/LICENSE +0 -0
upgini/features_enricher.py
CHANGED
@@ -112,6 +112,7 @@ except Exception:
         CustomFallbackProgressBar as ProgressBar,
     )
 
+from upgini.utils.psi import calculate_features_psi
 from upgini.utils.sample_utils import SampleColumns, SampleConfig, _num_samples, sample
 from upgini.utils.sort import sort_columns
 from upgini.utils.target_utils import calculate_psi, define_task
@@ -297,7 +298,9 @@ class FeaturesEnricher(TransformerMixin):
         self.feature_names_ = []
         self.external_source_feature_names = []
         self.zero_shap_client_features = []
+        self.unstable_client_features = []
         self.feature_importances_ = []
+        self.psi_values: Optional[Dict[str, float]] = None
         self.search_id = search_id
         self.disable_force_downsampling = disable_force_downsampling
         self.print_trace_id = print_trace_id
@@ -398,13 +401,26 @@ class FeaturesEnricher(TransformerMixin):
     @staticmethod
     def _check_eval_set(eval_set, X, bundle: ResourceBundle):
         checked_eval_set = []
-        if eval_set is not None and isinstance(eval_set, tuple):
+        if eval_set is None:
+            return checked_eval_set
+        if isinstance(eval_set, tuple):
             eval_set = [eval_set]
-        if eval_set is not None and not isinstance(eval_set, list):
+        if not isinstance(eval_set, list):
             raise ValidationError(bundle.get("unsupported_type_eval_set").format(type(eval_set)))
         for eval_pair in eval_set or []:
+            # Handle OOT eval sets passed without a target
+            if isinstance(eval_pair, pd.DataFrame):
+                empty_target = pd.Series([np.nan] * len(eval_pair), index=eval_pair.index)
+                eval_pair = (eval_pair, empty_target)
+            elif isinstance(eval_pair, tuple) and len(eval_pair) == 1:
+                empty_target = pd.Series([np.nan] * len(eval_pair[0]), index=eval_pair[0].index)
+                eval_pair = (eval_pair[0], empty_target)
+
             if not isinstance(eval_pair, tuple) or len(eval_pair) != 2:
                 raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
+            if eval_pair[1] is None:
+                empty_target = pd.Series([np.nan] * len(eval_pair[0]), index=eval_pair[0].index)
+                eval_pair = (eval_pair[0], empty_target)
             if not is_frames_equal(X, eval_pair[0], bundle):
                 checked_eval_set.append(eval_pair)
         return checked_eval_set
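With this change an eval_set entry may be supplied without a target: a bare DataFrame, a 1-tuple, or a pair with a None target are all normalized to an all-NaN target Series and treated as an out-of-time (OOT) segment. A minimal usage sketch (enricher, train and oot are illustrative names, not from this diff):

    # Each of the three forms below is accepted and normalized to (X, all-NaN target):
    enricher.fit(
        train.drop(columns=["target"]),
        train["target"],
        eval_set=[
            oot,          # bare DataFrame, no target
            (oot,),       # 1-tuple
            (oot, None),  # explicit None target
        ],
    )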
@@ -426,6 +442,7 @@ class FeaturesEnricher(TransformerMixin):
         search_id_callback: Optional[Callable[[str], Any]] = None,
         select_features: bool = True,
         auto_fe_parameters: Optional[AutoFEParameters] = None,
+        stability_threshold: float = 0.15,
         **kwargs,
     ):
         """Fit to data.
@@ -515,6 +532,7 @@ class FeaturesEnricher(TransformerMixin):
             estimator=estimator,
             scoring=scoring,
             importance_threshold=importance_threshold,
+            stability_threshold=stability_threshold,
             max_features=max_features,
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
             auto_fe_parameters=auto_fe_parameters,
@@ -574,6 +592,7 @@ class FeaturesEnricher(TransformerMixin):
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
         select_features: bool = True,
         auto_fe_parameters: Optional[AutoFEParameters] = None,
+        stability_threshold: float = 0.15,
         **kwargs,
     ) -> pd.DataFrame:
         """Fit to data, then transform it.
@@ -618,6 +637,10 @@ class FeaturesEnricher(TransformerMixin):
             If True, return only selected features both from input and data sources.
             Otherwise, return all features from input and only selected features from data sources.
 
+        stability_threshold: float, optional (default=0.15)
+            PSI stability threshold for selected features. A feature whose PSI exceeds
+            this threshold is considered unstable and will be dropped.
+
         Returns
         -------
         X_new: pandas.DataFrame of shape (n_samples, n_features_new)
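A hedged sketch of the new parameter in use (dataset and column names are illustrative; search keys depend on your data):

    from upgini import FeaturesEnricher, SearchKey

    enricher = FeaturesEnricher(search_keys={"order_date": SearchKey.DATE})
    # Selected features whose PSI on the checking eval segment exceeds
    # stability_threshold are reported as unstable and dropped.
    enricher.fit(X_train, y_train, eval_set=[(X_eval, y_eval)], stability_threshold=0.15)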
@@ -674,6 +697,7 @@ class FeaturesEnricher(TransformerMixin):
             scoring=scoring,
             estimator=estimator,
             importance_threshold=importance_threshold,
+            stability_threshold=stability_threshold,
             max_features=max_features,
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
             auto_fe_parameters=auto_fe_parameters,
@@ -941,7 +965,7 @@ class FeaturesEnricher(TransformerMixin):
             raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
 
         validated_X, validated_y, validated_eval_set = self._validate_train_eval(
-            effective_X, effective_y, effective_eval_set
+            effective_X, effective_y, effective_eval_set, silent=internal_call
         )
 
         if self.X is None:
@@ -1033,6 +1057,7 @@ class FeaturesEnricher(TransformerMixin):
             groups,
             _cv,
             columns_renaming,
+            eval_set_dates,
         ) = prepared_data
 
         # rename cat_features
@@ -1074,9 +1099,9 @@ class FeaturesEnricher(TransformerMixin):
         self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
 
         has_date = self._get_date_column(search_keys) is not None
-        has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
         model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
         cat_features = list(set(client_cat_features + cat_features_from_backend))
+        has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
         baseline_cat_features = [f for f in cat_features if f in fitting_X.columns]
         enriched_cat_features = [f for f in cat_features if f in fitting_enriched_X.columns]
         if len(enriched_cat_features) < len(cat_features):
@@ -1196,8 +1221,6 @@ class FeaturesEnricher(TransformerMixin):
         # max_initial_eval_set_hit_rate = self._search_task.get_max_initial_eval_set_hit_rate_v2()
         if len(fitting_eval_set_dict) > 0:
             for idx in fitting_eval_set_dict.keys():
-                # eval_hit_rate = max_initial_eval_set_hit_rate[idx + 1]
-
                 (
                     eval_X_sorted,
                     eval_y_sorted,
@@ -1205,6 +1228,10 @@ class FeaturesEnricher(TransformerMixin):
                     enriched_eval_y_sorted,
                 ) = fitting_eval_set_dict[idx]
 
+                if eval_y_sorted.isna().all():
+                    # Skip OOT eval set
+                    continue
+
                 if baseline_estimator is not None:
                     self.logger.info(
                         f"Calculate baseline {metric} on eval set {idx + 1} "
@@ -1247,17 +1274,14 @@ class FeaturesEnricher(TransformerMixin):
                         "quality_metrics_eval_segment"
                     ).format(idx + 1),
                     self.bundle.get("quality_metrics_rows_header"): _num_samples(
-                        # effective_eval_set[idx][0]
                         # Use actually used for metrics dataset
                         eval_X_sorted
                     ),
-                    # self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
                 }
                 if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
                     eval_y_sorted
                 ):
                     eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
-                        # np.mean(validated_eval_set[idx][1]), 4
                         # Use actually used for metrics dataset
                         np.mean(eval_y_sorted),
                         4,
@@ -1330,6 +1354,199 @@ class FeaturesEnricher(TransformerMixin):
         finally:
             self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
 
+    def _select_features_by_psi(
+        self,
+        trace_id: str,
+        X: Union[pd.DataFrame, pd.Series, np.ndarray],
+        y: Union[pd.DataFrame, pd.Series, np.ndarray, List],
+        eval_set: Optional[Union[List[tuple], tuple]],
+        stability_threshold: float,
+        cv: Union[BaseCrossValidator, CVType, str, None] = None,
+        estimator=None,
+        exclude_features_sources: Optional[List[str]] = None,
+        importance_threshold: Optional[float] = None,
+        max_features: Optional[int] = None,
+        progress_bar: bool = True,
+        progress_callback: Optional[Callable] = None,
+    ):
+        search_keys = self.search_keys.copy()
+        validated_X, _, validated_eval_set = self._validate_train_eval(X, y, eval_set, silent=True)
+        if isinstance(X, np.ndarray):
+            search_keys = {str(k): v for k, v in search_keys.items()}
+
+        has_date = self._get_date_column(search_keys) is not None
+        if not has_date or not validated_eval_set:
+            self.logger.info("No date column or eval set for OOT psi calculation")
+            return
+
+        cat_features_from_backend = self.__get_categorical_features()
+        client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
+            estimator, validated_X, search_keys
+        )
+        if self.id_columns and self.id_columns_encoder is not None:
+            if cat_features_from_backend:
+                cat_features_from_backend = [
+                    c
+                    for c in cat_features_from_backend
+                    if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
+                ]
+            if client_cat_features:
+                client_cat_features = [
+                    c
+                    for c in client_cat_features
+                    if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
+                ]
+
+        prepared_data = self._prepare_data_for_metrics(
+            trace_id=trace_id,
+            X=X,
+            y=y,
+            eval_set=eval_set,
+            exclude_features_sources=exclude_features_sources,
+            importance_threshold=importance_threshold,
+            max_features=max_features,
+            remove_outliers_calc_metrics=False,
+            cv_override=cv,
+            search_keys_for_metrics=search_keys_for_metrics,
+            progress_bar=progress_bar,
+            progress_callback=progress_callback,
+            client_cat_features=client_cat_features,
+        )
+        if prepared_data is None:
+            return None
+
+        (
+            validated_X,
+            fitting_X,
+            y_sorted,
+            fitting_enriched_X,
+            _,
+            fitting_eval_set_dict,
+            _,
+            _,
+            _,
+            columns_renaming,
+            eval_set_dates,
+        ) = prepared_data
+
+        # rename cat_features
+        if client_cat_features:
+            for new_c, old_c in columns_renaming.items():
+                if old_c in client_cat_features:
+                    client_cat_features.remove(old_c)
+                    client_cat_features.append(new_c)
+            for cat_feature in client_cat_features:
+                if cat_feature not in fitting_X.columns:
+                    self.logger.error(
+                        f"Client cat_feature `{cat_feature}` not found in" f" x columns: {fitting_X.columns.to_list()}"
+                    )
+        else:
+            client_cat_features = []
+
+        model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
+        cat_features = list(set(client_cat_features + cat_features_from_backend))
+
+        # Drop unstable features
+        unstable_features = self._check_stability(
+            validated_X,
+            validated_eval_set,
+            fitting_eval_set_dict,
+            eval_set_dates,
+            search_keys,
+            stability_threshold,
+            cat_features,
+            model_task_type,
+        )
+        client_features_df = self.df_with_original_index.rename(columns=columns_renaming)
+        # decoded_X = self._decode_id_columns(fitting_X, columns_renaming)
+        self._update_report_psi(trace_id, client_features_df)
+
+        if unstable_features:
+            msg = f"Some features are unstable: {unstable_features} and will be dropped"
+            self.logger.warning(msg)
+            print(msg)
+            fitting_X = fitting_X.drop(columns=unstable_features, errors="ignore")
+            fitting_enriched_X = fitting_enriched_X.drop(columns=unstable_features, errors="ignore")
+            msg = f"There are {len(fitting_enriched_X.columns)} stable selected features left"
+            self.logger.info(msg)
+            print(msg)
+            for idx, (
+                eval_X,
+                eval_y,
+                eval_enriched_X,
+                eval_enriched_y,
+            ) in fitting_eval_set_dict.items():
+                eval_X = eval_X.drop(columns=unstable_features, errors="ignore")
+                eval_enriched_X = eval_enriched_X.drop(columns=unstable_features, errors="ignore")
+                fitting_eval_set_dict[idx] = (eval_X, eval_y, eval_enriched_X, eval_enriched_y)
+
+    def _check_stability(
+        self,
+        X: pd.DataFrame,
+        eval_set: List[Tuple[pd.DataFrame, pd.Series]],
+        enriched_eval_set: Dict,
+        eval_set_dates: Dict[int, pd.Series],
+        search_keys: Dict[str, SearchKey],
+        stability_threshold: float,
+        cat_features: List[str],
+        model_task_type: ModelTaskType,
+    ) -> List[str]:
+        # Find the latest eval set (or the latest-starting one if all eval sets are before the train set)
+        date_column = self._get_date_column(search_keys)
+
+        if (
+            date_column is None
+            or not eval_set
+            or not eval_set_dates
+            or (self.cv is not None and self.cv.is_time_series())
+        ):
+            return []
+
+        # Get minimum date from main dataset X
+        main_min_date = X[date_column].min()
+
+        # Find minimum date for each eval_set and compare with main dataset
+        eval_dates = []
+        for i, (eval_x, _) in enumerate(eval_set):
+            if date_column in eval_x.columns:
+                eval_min_date = eval_x[date_column].min()
+                eval_max_date = eval_x[date_column].max()
+                eval_dates.append((i, eval_min_date, eval_max_date))
+
+        if not eval_dates:
+            return []
+
+        # Check if any eval_set has minimum date >= main dataset minimum date
+        later_eval_sets = [(i, min_date, max_date) for i, min_date, max_date in eval_dates if min_date >= main_min_date]
+
+        if later_eval_sets:
+            # If there are eval_sets with date >= main date, choose the one with the highest maximum date
+            selected_eval_set_idx = max(later_eval_sets, key=lambda x: x[2])[0]
+        else:
+            # If all eval_sets have dates < main date, choose the one with the latest minimum date
+            selected_eval_set_idx = max(eval_dates, key=lambda x: x[1])[0]
+
+        checking_eval_set = enriched_eval_set[selected_eval_set_idx]
+
+        checking_eval_set_df = (
+            checking_eval_set[2]
+            if checking_eval_set[1] is None or checking_eval_set[1].isna().all()
+            else pd.concat([checking_eval_set[2], checking_eval_set[1].to_frame(TARGET)], axis=1)
+        )
+        checking_eval_set_df = checking_eval_set_df.copy()
+
+        checking_eval_set_df[date_column] = eval_set_dates[selected_eval_set_idx]
+
+        psi_values = calculate_features_psi(
+            checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
+        )
+
+        self.psi_values = {
+            feature: psi_value for feature, psi_value in psi_values.items() if psi_value <= stability_threshold
+        }
+
+        return [feature for feature, psi in psi_values.items() if psi > stability_threshold]
+
     def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float], silent: bool = False):
         renaming = self.fit_columns_renaming or {}
         self.logger.info(f"Updating SHAP values: {new_shaps}")
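calculate_features_psi itself lives in the new upgini/utils/psi.py (+268 lines, not shown in this diff). For orientation only, a minimal sketch of the standard PSI formula that the threshold above is applied to; the bin count and bucketing strategy here are assumptions, not the package's actual implementation:

    import numpy as np

    def psi(expected: np.ndarray, actual: np.ndarray, bins: int = 10) -> float:
        # Bin edges are derived from the reference (expected) distribution
        edges = np.histogram_bin_edges(expected, bins=bins)
        e_cnt, _ = np.histogram(expected, bins=edges)
        a_cnt, _ = np.histogram(actual, bins=edges)
        # Convert counts to frequencies, flooring at a small epsilon to avoid log(0)
        e_frac = np.clip(e_cnt / max(len(expected), 1), 1e-6, None)
        a_frac = np.clip(a_cnt / max(len(actual), 1), 1e-6, None)
        return float(np.sum((a_frac - e_frac) * np.log(a_frac / e_frac)))

Per the code above, features with PSI <= stability_threshold are retained in self.psi_values; the rest are returned as unstable and dropped.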
@@ -1385,6 +1602,56 @@ class FeaturesEnricher(TransformerMixin):
         except (ImportError, NameError):
             pass
 
+    def _update_report_psi(self, trace_id: str, clients_features_df: pd.DataFrame):
+        self.__prepare_feature_importances(trace_id, clients_features_df)
+
+        if self.features_info_display_handle is not None:
+            try:
+                _ = get_ipython()  # type: ignore
+
+                display_html_dataframe(
+                    self.features_info,
+                    self._features_info_without_links,
+                    self.bundle.get("relevant_features_header"),
+                    display_handle=self.features_info_display_handle,
+                )
+            except (ImportError, NameError):
+                pass
+
+        if self.data_sources_display_handle is not None:
+            try:
+                _ = get_ipython()  # type: ignore
+
+                display_html_dataframe(
+                    self.relevant_data_sources,
+                    self._relevant_data_sources_wo_links,
+                    self.bundle.get("relevant_data_sources_header"),
+                    display_handle=self.data_sources_display_handle,
+                )
+            except (ImportError, NameError):
+                pass
+
+        if self.autofe_features_display_handle is not None:
+            try:
+                _ = get_ipython()  # type: ignore
+                autofe_descriptions_df = self.get_autofe_features_description()
+                if autofe_descriptions_df is not None:
+                    display_html_dataframe(
+                        df=autofe_descriptions_df,
+                        internal_df=autofe_descriptions_df,
+                        header=self.bundle.get("autofe_descriptions_header"),
+                        display_handle=self.autofe_features_display_handle,
+                    )
+            except (ImportError, NameError):
+                pass
+        if self.report_button_handle is not None:
+            try:
+                _ = get_ipython()  # type: ignore
+
+                self.__show_report_button(display_handle=self.report_button_handle)
+            except (ImportError, NameError):
+                pass
+
     def _check_train_and_eval_target_distribution(self, y, eval_set_dict):
         uneven_distribution = False
         for eval_set in eval_set_dict.values():
@@ -1535,7 +1802,7 @@ class FeaturesEnricher(TransformerMixin):
         is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
         is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
         checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
-        validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set)
+        validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set, silent=True)
 
         sampled_data = self._get_enriched_for_metrics(
             trace_id,
@@ -1549,7 +1816,7 @@ class FeaturesEnricher(TransformerMixin):
             progress_bar,
             progress_callback,
         )
-        X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = dataclasses.astuple(
+        (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming) = dataclasses.astuple(
             sampled_data
         )
 
@@ -1572,8 +1839,7 @@ class FeaturesEnricher(TransformerMixin):
                 or c in set(self.feature_names_).union(self.id_columns or [])
                 or (self.fit_columns_renaming or {}).get(c, c) in set(self.feature_names_).union(self.id_columns or [])
             )
-            and c
-            not in (
+            and c not in (
                 excluding_search_keys
                 + list(self.fit_dropped_features)
                 + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
@@ -1672,12 +1938,16 @@ class FeaturesEnricher(TransformerMixin):
         fitting_enriched_X = fitting_enriched_X[fitting_enriched_x_columns]
         fitting_enriched_X, _ = self._encode_id_columns(fitting_enriched_X, self.fit_columns_renaming)
         self.logger.info(f"Final sorted list of fitting enriched X columns: {fitting_enriched_x_columns}")
+        date_column = self._get_date_column(search_keys)
+        eval_set_dates = {}
         for idx, eval_tuple in eval_set_sampled_dict.items():
             eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
             eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
             enriched_eval_X_sorted, enriched_eval_y_sorted = self._sort_by_system_record_id(
                 enriched_eval_X, eval_y_sampled, self.cv
             )
+            if date_column is not None:
+                eval_set_dates[idx] = eval_X_sorted[date_column]
             fitting_eval_X = eval_X_sorted[fitting_x_columns].copy()
             fitting_enriched_eval_X = enriched_eval_X_sorted[fitting_enriched_x_columns].copy()
 
@@ -1722,6 +1992,7 @@ class FeaturesEnricher(TransformerMixin):
             groups,
             cv,
             columns_renaming,
+            eval_set_dates,
         )
 
     @dataclass
@@ -1884,14 +2155,16 @@ class FeaturesEnricher(TransformerMixin):
         remove_outliers_calc_metrics: Optional[bool],
     ) -> _EnrichedDataForMetrics:
         eval_set_sampled_dict = {}
-        search_keys = self.fit_search_keys
+        search_keys = self.fit_search_keys.copy()
 
         rows_to_drop = None
         has_date = self._get_date_column(search_keys) is not None
         self.model_task_type = self.model_task_type or define_task(
             self.df_with_original_index[TARGET], has_date, self.logger, silent=True
         )
-        if self.model_task_type == ModelTaskType.REGRESSION:
+        if remove_outliers_calc_metrics is None:
+            remove_outliers_calc_metrics = True
+        if self.model_task_type == ModelTaskType.REGRESSION and remove_outliers_calc_metrics:
             target_outliers_df = self._search_task.get_target_outliers(trace_id)
             if target_outliers_df is not None and len(target_outliers_df) > 0:
                 outliers = pd.merge(
@@ -1901,11 +2174,8 @@ class FeaturesEnricher(TransformerMixin):
                     how="inner",
                 )
                 top_outliers = outliers.sort_values(by=TARGET, ascending=False)[TARGET].head(3)
-                if remove_outliers_calc_metrics is None or remove_outliers_calc_metrics:
-                    rows_to_drop = outliers
-                    not_msg = ""
-                else:
-                    not_msg = "not "
+                rows_to_drop = outliers
+                not_msg = ""
                 msg = self.bundle.get("target_outliers_warning").format(len(target_outliers_df), top_outliers, not_msg)
                 print(msg)
                 self.logger.warning(msg)
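The two hunks above make the old implicit default explicit: remove_outliers_calc_metrics=None now resolves to True before the regression branch, so target outliers are excluded from metric calculation unless the caller opts out. A hedged call sketch:

    # Default (None -> True): regression target outliers are dropped from metrics.
    enricher.calculate_metrics(X, y)
    # Keep outlier rows in the metrics dataset:
    enricher.calculate_metrics(X, y, remove_outliers_calc_metrics=False)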
@@ -1963,12 +2233,13 @@ class FeaturesEnricher(TransformerMixin):
             enriched_eval_X = enriched_eval_sets[idx + 1][enriched_X_columns].copy()
             eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
 
-        reversed_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
-        X_sampled.rename(columns=reversed_renaming, inplace=True)
-        enriched_X.rename(columns=reversed_renaming, inplace=True)
+        # reversed_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
+        X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
+        enriched_X.rename(columns=self.fit_columns_renaming, inplace=True)
         for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
-            eval_X_sampled.rename(columns=reversed_renaming, inplace=True)
-            enriched_eval_X.rename(columns=reversed_renaming, inplace=True)
+            eval_X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
+            enriched_eval_X.rename(columns=self.fit_columns_renaming, inplace=True)
+        search_keys = {self.fit_columns_renaming.get(k, k): v for k, v in search_keys.items()}
 
         datasets_hash = hash_input(self.X, self.y, self.eval_set)
         return self.__cache_and_return_results(
@@ -2112,7 +2383,7 @@ class FeaturesEnricher(TransformerMixin):
 
     def __extract_eval_data(
         self, enriched_df: pd.DataFrame, x_columns: List[str], enriched_X_columns: List[str], eval_set_len: int
-    ) -> Dict[int, Tuple]:
+    ) -> Tuple[Dict[int, Tuple], Dict[int, pd.Series]]:
         eval_set_sampled_dict = {}
 
         for idx in range(eval_set_len):
@@ -2158,12 +2429,12 @@ class FeaturesEnricher(TransformerMixin):
         columns_renaming: Dict[str, str],
     ):
         # X_sampled - with hash-suffixes
-        reversed_renaming = {v: k for k, v in columns_renaming.items()}
-        search_keys = {
-            reversed_renaming.get(k, k): v
-            for k, v in search_keys.items()
-            if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
-        }
+        # reversed_renaming = {v: k for k, v in columns_renaming.items()}
+        # search_keys = {
+        #     reversed_renaming.get(k, k): v
+        #     for k, v in search_keys.items()
+        #     if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
+        # }
         return FeaturesEnricher._EnrichedDataForMetrics(
             X_sampled=X_sampled,
             y_sampled=y_sampled,
@@ -2313,7 +2584,7 @@ if response.status_code == 200:
         self.logger.info("Start transform")
 
         validated_X, validated_y, validated_eval_set = self._validate_train_eval(
-            X, y, eval_set=None, is_transform=True
+            X, y, eval_set=None, is_transform=True, silent=True
         )
         df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
 
@@ -2415,7 +2686,7 @@ if response.status_code == 200:
         else:
             self.logger.info("Input dataset hasn't date column")
         if self.__should_add_date_column():
-            df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
+            df = self._add_current_date_as_key(df, search_keys, self.bundle, silent=True)
 
         email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
         if email_columns and self.generate_search_key_features:
@@ -2664,7 +2935,8 @@ if response.status_code == 200:
         selecting_columns = [
             c
             for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
-            if c not in self.zero_shap_client_features
+            if (c not in self.zero_shap_client_features and c not in self.unstable_client_features)
+            or c in (self.id_columns or [])
         ]
         selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
         if add_fit_system_record_id:
@@ -2798,6 +3070,7 @@ if response.status_code == 200:
         scoring: Union[Callable, str, None],
         estimator: Optional[Any],
         importance_threshold: Optional[float],
+        stability_threshold: float,
         max_features: Optional[int],
         remove_outliers_calc_metrics: Optional[bool],
         auto_fe_parameters: AutoFEParameters,
@@ -2812,6 +3085,7 @@ if response.status_code == 200:
         self.fit_columns_renaming = None
         self.fit_dropped_features = set()
         self.fit_generated_features = []
+        self.psi_values = None
 
         validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, eval_set)
 
@@ -2908,7 +3182,7 @@ if response.status_code == 200:
             self.logger.info("Input dataset hasn't date column")
             # TODO remove when this logic will be implemented on the back
             if self.__should_add_date_column():
-                df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
+                df = self._add_current_date_as_key(df, self.fit_search_keys, self.bundle)
 
         email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
         if email_columns and self.generate_search_key_features:
@@ -2923,10 +3197,13 @@ if response.status_code == 200:
         except Exception:
             self.logger.exception("Failed to check dates distribution validity")
 
+        self.__adjust_cv(df)
+
         if (
             is_numeric_dtype(df[self.TARGET_NAME])
             and self.model_task_type in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]
             and has_date
+            and not self.cv.is_time_series()
         ):
             self._validate_PSI(df.sort_values(by=maybe_date_column))
 
@@ -2958,8 +3235,9 @@ if response.status_code == 200:
 
         self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
 
+        # Group columns should have normalized names
+        self.cv = None
         self.__adjust_cv(df)
-
         if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
             id_columns = self.__get_renamed_id_columns()
             if id_columns:
@@ -3164,6 +3442,21 @@ if response.status_code == 200:
                 display_id=f"autofe_descriptions_{uuid.uuid4()}",
             )
 
+        self._select_features_by_psi(
+            trace_id=trace_id,
+            X=X,
+            y=y,
+            eval_set=eval_set,
+            stability_threshold=stability_threshold,
+            cv=self.cv,
+            estimator=estimator,
+            exclude_features_sources=exclude_features_sources,
+            importance_threshold=importance_threshold,
+            max_features=max_features,
+            progress_bar=progress_bar,
+            progress_callback=progress_callback,
+        )
+
         if self._has_paid_features(exclude_features_sources):
             if calculate_metrics is not None and calculate_metrics:
                 msg = self.bundle.get("metrics_with_paid_features")
@@ -3289,10 +3582,11 @@ if response.status_code == 200:
         y: Optional[pd.Series] = None,
         eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]] = None,
         is_transform: bool = False,
+        silent: bool = False,
     ) -> Tuple[pd.DataFrame, pd.Series, Optional[List[Tuple[pd.DataFrame, pd.Series]]]]:
         validated_X = self._validate_X(X, is_transform)
         validated_y = self._validate_y(validated_X, y, enforce_y=not is_transform)
-        validated_eval_set = self._validate_eval_set(validated_X, eval_set)
+        validated_eval_set = self._validate_eval_set(validated_X, eval_set, silent=silent)
         return validated_X, validated_y, validated_eval_set
 
     def _encode_id_columns(
@@ -3424,10 +3718,30 @@ if response.status_code == 200:
 
         return validated_y
 
-    def _validate_eval_set(self, X: pd.DataFrame, eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]]):
+    def _validate_eval_set(
+        self, X: pd.DataFrame, eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]], silent: bool = False
+    ):
         if eval_set is None:
             return None
-        return [self._validate_eval_set_pair(X, eval_pair) for eval_pair in eval_set]
+        validated_eval_set = []
+        has_date = self._get_date_column(self.search_keys) is not None
+        for idx, eval_pair in enumerate(eval_set):
+            validated_pair = self._validate_eval_set_pair(X, eval_pair)
+            if validated_pair[1].isna().all():
+                if not has_date:
+                    msg = self.bundle.get("oot_without_date_not_supported").format(idx + 1)
+                elif self.columns_for_online_api:
+                    msg = self.bundle.get("oot_with_online_sources_not_supported").format(idx + 1)
+                else:
+                    msg = None
+                if msg:
+                    if not silent:
+                        print(msg)
+                    self.logger.warning(msg)
+                    continue
+            validated_eval_set.append(validated_pair)
+
+        return validated_eval_set
 
     def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
         if len(eval_pair) != 2:
@@ -3502,16 +3816,18 @@ if response.status_code == 200:
             raise ValidationError(self.bundle.get("unsupported_y_type_eval_set").format(type(eval_y)))
 
         eval_y_nunique = validated_eval_y.nunique()
-        if eval_y_nunique < 2:
+        is_oot = validated_eval_y.isna().all()
+        if not is_oot and eval_y_nunique < 2:
             raise ValidationError(self.bundle.get("y_is_constant_eval_set"))
 
-        if self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
+        if not is_oot and self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
             raise ValidationError(self.bundle.get("binary_target_eval_unique_count_not_2").format(eval_y_nunique))
 
-        # Check for duplicates between train and eval sets by comparing all values
-        train_eval_intersection = pd.merge(X, validated_eval_X, how="inner")
-        if len(train_eval_intersection) > 0:
-            raise ValidationError(self.bundle.get("eval_x_has_train_samples"))
+        if not is_oot:
+            # Check for duplicates between train and eval sets by comparing all values
+            train_eval_intersection = pd.merge(X, validated_eval_X, how="inner")
+            if len(train_eval_intersection) > 0:
+                raise ValidationError(self.bundle.get("eval_x_has_train_samples"))
 
         return validated_eval_X, validated_eval_y
 
@@ -3527,10 +3843,12 @@ if response.status_code == 200:
         if isinstance(eval_set, tuple):
             eval_set = [eval_set]
         for eval in eval_set:
-            if self.baseline_score_column not in eval[0].columns:
-                raise ValidationError(self.bundle.get("baseline_score_column_not_exists"))
-            if eval[0][self.baseline_score_column].isna().any():
-                raise ValidationError(self.bundle.get("baseline_score_column_has_na"))
+            is_oot = eval[1].isna().all()
+            if not is_oot:
+                if self.baseline_score_column not in eval[0].columns:
+                    raise ValidationError(self.bundle.get("baseline_score_column_not_exists"))
+                if eval[0][self.baseline_score_column].isna().any():
+                    raise ValidationError(self.bundle.get("baseline_score_column_has_na"))
 
     @staticmethod
     def _sample_X_and_y(X: pd.DataFrame, y: pd.Series, enriched_X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
@@ -3704,7 +4022,7 @@ if response.status_code == 200:
         return df
 
     def _add_current_date_as_key(
-        self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
+        self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], bundle: ResourceBundle, silent: bool = False
     ) -> pd.DataFrame:
         if (
             set(search_keys.values()) == {SearchKey.PHONE}
@@ -3712,7 +4030,8 @@ if response.status_code == 200:
             or set(search_keys.values()) == {SearchKey.HEM}
             or set(search_keys.values()) == {SearchKey.COUNTRY, SearchKey.POSTAL_CODE}
         ):
-            self.__log_warning(bundle.get("current_date_added"))
+            if not silent:
+                self.__log_warning(bundle.get("current_date_added"))
             df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
             search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
             converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, generate_cyclical_features=False)
@@ -4047,7 +4366,11 @@ if response.status_code == 200:
         return [f.name for f in features_meta if f.type == "categorical" and f.name not in (self.id_columns or [])]
 
     def __prepare_feature_importances(
-        self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
+        self,
+        trace_id: str,
+        clients_features_df: pd.DataFrame,
+        updated_shaps: Optional[Dict[str, float]] = None,
+        silent=False,
     ):
         if self._search_task is None:
             raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
@@ -4060,11 +4383,12 @@ if response.status_code == 200:
         features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
 
         # To be sure that names with hash suffixes
-        df = df.rename(columns=original_names_dict)
+        clients_features_df = clients_features_df.rename(columns=original_names_dict)
 
         self.feature_names_ = []
         self.external_source_feature_names = []
         self.zero_shap_client_features = []
+        self.unstable_client_features = []
        self.feature_importances_ = []
        features_info = []
        features_info_without_links = []
@@ -4073,10 +4397,10 @@ if response.status_code == 200:
         original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}
 
         for feature_meta in features_meta:
-            if feature_meta.name in original_names_dict.keys():
-                feature_meta.name = original_names_dict[feature_meta.name]
+            original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
+            feature_meta.name = original_name
 
-            is_client_feature = feature_meta.name in df.columns
+            is_client_feature = original_name in clients_features_df.columns
 
             # Show and update shap values for client features only if select_features is True
             if updated_shaps is not None and (not is_client_feature or self.fit_select_features):
@@ -4093,12 +4417,21 @@ if response.status_code == 200:
 
         for feature_meta in features_meta:
             original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
-            is_client_feature = original_name in df.columns
+            is_client_feature = original_name in clients_features_df.columns
 
             if not is_client_feature:
                 self.external_source_feature_names.append(original_name)
 
+            if self.psi_values is not None:
+                if original_name in self.psi_values:
+                    feature_meta.psi_value = self.psi_values[original_name]
+                else:
+                    if is_client_feature and self.fit_select_features:
+                        self.unstable_client_features.append(original_name)
+                    continue
+
             # TODO make a decision about selected features based on special flag from mlb
+
             if original_shaps.get(feature_meta.name, 0.0) == 0.0:
                 if is_client_feature and self.fit_select_features:
                     self.zero_shap_client_features.append(original_name)
@@ -4122,7 +4455,7 @@ if response.status_code == 200:
                 self.feature_names_.append(feature_meta.name)
                 self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
 
-                df_for_sample = features_df if feature_meta.name in features_df.columns else df
+                df_for_sample = features_df if feature_meta.name in features_df.columns else clients_features_df
                 feature_info = FeatureInfo.from_metadata(feature_meta, df_for_sample, is_client_feature)
                 features_info.append(feature_info.to_row(self.bundle))
                 features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
@@ -4130,6 +4463,8 @@ if response.status_code == 200:
 
         if len(features_info) > 0:
             self.features_info = pd.DataFrame(features_info)
+            if self.features_info[self.bundle.get("features_info_psi")].isna().all():
+                self.features_info.drop(columns=[self.bundle.get("features_info_psi")], inplace=True)
             self._features_info_without_links = pd.DataFrame(features_info_without_links)
             self._internal_features_info = pd.DataFrame(internal_features_info)
             if not silent: