upgini-1.2.113a1-py3-none-any.whl → upgini-1.2.113a2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +412 -140
- upgini/metadata.py +1 -0
- upgini/metrics.py +4 -1
- upgini/resource_bundle/strings.properties +4 -8
- upgini/sampler/base.py +3 -1
- upgini/sampler/random_under_sampler.py +18 -8
- upgini/utils/deduplicate_utils.py +43 -7
- upgini/utils/feature_info.py +5 -0
- upgini/utils/psi.py +268 -0
- {upgini-1.2.113a1.dist-info → upgini-1.2.113a2.dist-info}/METADATA +1 -1
- {upgini-1.2.113a1.dist-info → upgini-1.2.113a2.dist-info}/RECORD +14 -13
- {upgini-1.2.113a1.dist-info → upgini-1.2.113a2.dist-info}/WHEEL +0 -0
- {upgini-1.2.113a1.dist-info → upgini-1.2.113a2.dist-info}/licenses/LICENSE +0 -0
upgini/features_enricher.py
CHANGED
@@ -112,6 +112,7 @@ except Exception:
         CustomFallbackProgressBar as ProgressBar,
     )

+from upgini.utils.psi import calculate_features_psi
 from upgini.utils.sample_utils import SampleColumns, SampleConfig, _num_samples, sample
 from upgini.utils.sort import sort_columns
 from upgini.utils.target_utils import calculate_psi, define_task
@@ -297,7 +298,9 @@ class FeaturesEnricher(TransformerMixin):
         self.feature_names_ = []
         self.external_source_feature_names = []
         self.zero_shap_client_features = []
+        self.unstable_client_features = []
         self.feature_importances_ = []
+        self.psi_values: Optional[Dict[str, float]] = None
         self.search_id = search_id
         self.disable_force_downsampling = disable_force_downsampling
         self.print_trace_id = print_trace_id
@@ -398,13 +401,26 @@ class FeaturesEnricher(TransformerMixin):
     @staticmethod
     def _check_eval_set(eval_set, X, bundle: ResourceBundle):
         checked_eval_set = []
-        if eval_set is …
+        if eval_set is None:
+            return checked_eval_set
+        if isinstance(eval_set, tuple):
             eval_set = [eval_set]
-        if …
+        if not isinstance(eval_set, list):
             raise ValidationError(bundle.get("unsupported_type_eval_set").format(type(eval_set)))
         for eval_pair in eval_set or []:
+            # Handle OOT
+            if isinstance(eval_pair, pd.DataFrame):
+                empty_target = pd.Series([np.nan] * len(eval_pair), index=eval_pair.index)
+                eval_pair = (eval_pair, empty_target)
+            elif isinstance(eval_pair, tuple) and len(eval_pair) == 1:
+                empty_target = pd.Series([np.nan] * len(eval_pair[0]), index=eval_pair[0].index)
+                eval_pair = (eval_pair[0], empty_target)
+
             if not isinstance(eval_pair, tuple) or len(eval_pair) != 2:
                 raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
+            if eval_pair[1] is None:
+                empty_target = pd.Series([np.nan] * len(eval_pair[0]), index=eval_pair[0].index)
+                eval_pair = (eval_pair[0], empty_target)
             if not is_frames_equal(X, eval_pair[0], bundle):
                 checked_eval_set.append(eval_pair)
         return checked_eval_set
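The reworked _check_eval_set normalizes several new out-of-time (OOT) spellings of an eval_set entry into a plain (X, y) pair. A minimal sketch of the accepted shapes; the variable names here are illustrative, not from the package:

import numpy as np
import pandas as pd

oot_X = pd.DataFrame({"date": ["2024-06-01"], "feature": [2.0]})

eval_set = [
    (oot_X, pd.Series([1])),  # labeled eval pair, validated as before
    oot_X,                    # bare DataFrame -> treated as OOT
    (oot_X,),                 # 1-tuple -> treated as OOT
    (oot_X, None),            # explicit None target -> treated as OOT
]
# For the three OOT forms an all-NaN pd.Series is synthesized as the target,
# so downstream code can detect OOT sets with eval_y.isna().all().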
@@ -415,7 +431,6 @@ class FeaturesEnricher(TransformerMixin):
         y: Union[pd.Series, np.ndarray, List],
         eval_set: Optional[Union[List[tuple], tuple]] = None,
         *args,
-        oot: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
         exclude_features_sources: Optional[List[str]] = None,
         calculate_metrics: Optional[bool] = None,
         estimator: Optional[Any] = None,
@@ -427,6 +442,7 @@ class FeaturesEnricher(TransformerMixin):
         search_id_callback: Optional[Callable[[str], Any]] = None,
         select_features: bool = True,
         auto_fe_parameters: Optional[AutoFEParameters] = None,
+        stability_threshold: float = 0.15,
         **kwargs,
     ):
         """Fit to data.
@@ -444,9 +460,6 @@ class FeaturesEnricher(TransformerMixin):
         eval_set: List[tuple], optional (default=None)
             List of pairs (X, y) for validation.

-        oot: pandas.DataFrame of shape (n_samples, n_features)
-            Out of time data.
-
         importance_threshold: float, optional (default=None)
             Minimum SHAP value to select a feature. Default value is 0.0.

@@ -512,14 +525,14 @@ class FeaturesEnricher(TransformerMixin):
             X,
             y,
             self.eval_set,
-            …
-            progress_bar=progress_bar,
+            progress_bar,
             start_time=start_time,
             exclude_features_sources=exclude_features_sources,
             calculate_metrics=calculate_metrics,
             estimator=estimator,
             scoring=scoring,
             importance_threshold=importance_threshold,
+            stability_threshold=stability_threshold,
             max_features=max_features,
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
             auto_fe_parameters=auto_fe_parameters,
@@ -568,7 +581,6 @@ class FeaturesEnricher(TransformerMixin):
         y: Union[pd.DataFrame, pd.Series, np.ndarray, List],
         eval_set: Optional[Union[List[tuple], tuple]] = None,
         *args,
-        oot: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
         exclude_features_sources: Optional[List[str]] = None,
         keep_input: bool = True,
         importance_threshold: Optional[float] = None,
@@ -580,6 +592,7 @@ class FeaturesEnricher(TransformerMixin):
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
         select_features: bool = True,
         auto_fe_parameters: Optional[AutoFEParameters] = None,
+        stability_threshold: float = 0.15,
         **kwargs,
     ) -> pd.DataFrame:
         """Fit to data, then transform it.
@@ -624,6 +637,10 @@ class FeaturesEnricher(TransformerMixin):
             If True, return only selected features both from input and data sources.
             Otherwise, return all features from input and only selected features from data sources.

+        stability_threshold: float, optional (default=0.15)
+            Stability threshold for selected features PSI calculation. If PSI is less than this threshold,
+            then feature will be dropped.
+
         Returns
         -------
         X_new: pandas.DataFrame of shape (n_samples, n_features_new)
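With oot removed from the public signatures, out-of-time data now travels through eval_set, and the new stability_threshold drives the PSI-based feature drop. A hypothetical call, assuming the usual "from upgini import FeaturesEnricher, SearchKey" and a date column in the data:

enricher = FeaturesEnricher(search_keys={"date": SearchKey.DATE})  # other arguments elided
enricher.fit(
    train_X,
    train_y,
    eval_set=[oot_X],          # unlabeled OOT frame feeds the stability check
    stability_threshold=0.15,  # features whose PSI exceeds this are dropped
)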
@@ -673,14 +690,14 @@ class FeaturesEnricher(TransformerMixin):
             X,
             y,
             self.eval_set,
-            …
-            progress_bar=progress_bar,
+            progress_bar,
             start_time=start_time,
             exclude_features_sources=exclude_features_sources,
             calculate_metrics=calculate_metrics,
             scoring=scoring,
             estimator=estimator,
             importance_threshold=importance_threshold,
+            stability_threshold=stability_threshold,
             max_features=max_features,
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
             auto_fe_parameters=auto_fe_parameters,
@@ -947,8 +964,8 @@ class FeaturesEnricher(TransformerMixin):
         ):
             raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))

-        validated_X, validated_y, validated_eval_set …
-            effective_X, effective_y, effective_eval_set …
+        validated_X, validated_y, validated_eval_set = self._validate_train_eval(
+            effective_X, effective_y, effective_eval_set, silent=internal_call
         )

         if self.X is None:
@@ -1040,6 +1057,7 @@ class FeaturesEnricher(TransformerMixin):
             groups,
             _cv,
             columns_renaming,
+            eval_set_dates,
         ) = prepared_data

         # rename cat_features
@@ -1081,9 +1099,9 @@ class FeaturesEnricher(TransformerMixin):
         self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)

         has_date = self._get_date_column(search_keys) is not None
-        has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
         model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
         cat_features = list(set(client_cat_features + cat_features_from_backend))
+        has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
         baseline_cat_features = [f for f in cat_features if f in fitting_X.columns]
         enriched_cat_features = [f for f in cat_features if f in fitting_enriched_X.columns]
         if len(enriched_cat_features) < len(cat_features):
@@ -1203,8 +1221,6 @@ class FeaturesEnricher(TransformerMixin):
         # max_initial_eval_set_hit_rate = self._search_task.get_max_initial_eval_set_hit_rate_v2()
         if len(fitting_eval_set_dict) > 0:
             for idx in fitting_eval_set_dict.keys():
-                # eval_hit_rate = max_initial_eval_set_hit_rate[idx + 1]
-
                 (
                     eval_X_sorted,
                     eval_y_sorted,
@@ -1212,6 +1228,10 @@ class FeaturesEnricher(TransformerMixin):
                     enriched_eval_y_sorted,
                 ) = fitting_eval_set_dict[idx]

+                if eval_y_sorted.isna().all():
+                    # Skip OOT eval set
+                    continue
+
                 if baseline_estimator is not None:
                     self.logger.info(
                         f"Calculate baseline {metric} on eval set {idx + 1} "
@@ -1254,17 +1274,14 @@ class FeaturesEnricher(TransformerMixin):
                         "quality_metrics_eval_segment"
                     ).format(idx + 1),
                     self.bundle.get("quality_metrics_rows_header"): _num_samples(
-                        # effective_eval_set[idx][0]
                         # Use actually used for metrics dataset
                         eval_X_sorted
                     ),
-                    # self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
                 }
                 if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
                     eval_y_sorted
                 ):
                     eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
-                        # np.mean(validated_eval_set[idx][1]), 4
                         # Use actually used for metrics dataset
                         np.mean(eval_y_sorted),
                         4,
@@ -1337,6 +1354,199 @@ class FeaturesEnricher(TransformerMixin):
         finally:
             self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")

+    def _select_features_by_psi(
+        self,
+        trace_id: str,
+        X: Union[pd.DataFrame, pd.Series, np.ndarray],
+        y: Union[pd.DataFrame, pd.Series, np.ndarray, List],
+        eval_set: Optional[Union[List[tuple], tuple]],
+        stability_threshold: float,
+        cv: Union[BaseCrossValidator, CVType, str, None] = None,
+        estimator=None,
+        exclude_features_sources: Optional[List[str]] = None,
+        importance_threshold: Optional[float] = None,
+        max_features: Optional[int] = None,
+        progress_bar: bool = True,
+        progress_callback: Optional[Callable] = None,
+    ):
+        search_keys = self.search_keys.copy()
+        validated_X, _, validated_eval_set = self._validate_train_eval(X, y, eval_set, silent=True)
+        if isinstance(X, np.ndarray):
+            search_keys = {str(k): v for k, v in search_keys.items()}
+
+        has_date = self._get_date_column(search_keys) is not None
+        if not has_date or not validated_eval_set:
+            self.logger.info("No date column or eval set for OOT psi calculation")
+            return
+
+        cat_features_from_backend = self.__get_categorical_features()
+        client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
+            estimator, validated_X, search_keys
+        )
+        if self.id_columns and self.id_columns_encoder is not None:
+            if cat_features_from_backend:
+                cat_features_from_backend = [
+                    c
+                    for c in cat_features_from_backend
+                    if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
+                ]
+            if client_cat_features:
+                client_cat_features = [
+                    c
+                    for c in client_cat_features
+                    if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
+                ]
+
+        prepared_data = self._prepare_data_for_metrics(
+            trace_id=trace_id,
+            X=X,
+            y=y,
+            eval_set=eval_set,
+            exclude_features_sources=exclude_features_sources,
+            importance_threshold=importance_threshold,
+            max_features=max_features,
+            remove_outliers_calc_metrics=False,
+            cv_override=cv,
+            search_keys_for_metrics=search_keys_for_metrics,
+            progress_bar=progress_bar,
+            progress_callback=progress_callback,
+            client_cat_features=client_cat_features,
+        )
+        if prepared_data is None:
+            return None
+
+        (
+            validated_X,
+            fitting_X,
+            y_sorted,
+            fitting_enriched_X,
+            _,
+            fitting_eval_set_dict,
+            _,
+            _,
+            _,
+            columns_renaming,
+            eval_set_dates,
+        ) = prepared_data
+
+        # rename cat_features
+        if client_cat_features:
+            for new_c, old_c in columns_renaming.items():
+                if old_c in client_cat_features:
+                    client_cat_features.remove(old_c)
+                    client_cat_features.append(new_c)
+            for cat_feature in client_cat_features:
+                if cat_feature not in fitting_X.columns:
+                    self.logger.error(
+                        f"Client cat_feature `{cat_feature}` not found in" f" x columns: {fitting_X.columns.to_list()}"
+                    )
+        else:
+            client_cat_features = []
+
+        model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
+        cat_features = list(set(client_cat_features + cat_features_from_backend))
+
+        # Drop unstable features
+        unstable_features = self._check_stability(
+            validated_X,
+            validated_eval_set,
+            fitting_eval_set_dict,
+            eval_set_dates,
+            search_keys,
+            stability_threshold,
+            cat_features,
+            model_task_type,
+        )
+        client_features_df = self.df_with_original_index.rename(columns=columns_renaming)
+        # decoded_X = self._decode_id_columns(fitting_X, columns_renaming)
+        self._update_report_psi(trace_id, client_features_df)
+
+        if unstable_features:
+            msg = f"Some features are unstable: {unstable_features} and will be dropped"
+            self.logger.warning(msg)
+            print(msg)
+            fitting_X = fitting_X.drop(columns=unstable_features, errors="ignore")
+            fitting_enriched_X = fitting_enriched_X.drop(columns=unstable_features, errors="ignore")
+            msg = f"Threre are {len(fitting_enriched_X.columns)} stable selected features left"
+            self.logger.info(msg)
+            print(msg)
+            for idx, (
+                eval_X,
+                eval_y,
+                eval_enriched_X,
+                eval_enriched_y,
+            ) in fitting_eval_set_dict.items():
+                eval_X = eval_X.drop(columns=unstable_features, errors="ignore")
+                eval_enriched_X = eval_enriched_X.drop(columns=unstable_features, errors="ignore")
+                fitting_eval_set_dict[idx] = (eval_X, eval_y, eval_enriched_X, eval_enriched_y)
+
+    def _check_stability(
+        self,
+        X: pd.DataFrame,
+        eval_set: List[Tuple[pd.DataFrame, pd.Series]],
+        enriched_eval_set: Dict,
+        eval_set_dates: Dict[int, pd.Series],
+        search_keys: Dict[str, SearchKey],
+        stability_threshold: float,
+        cat_features: List[str],
+        model_task_type: ModelTaskType,
+    ) -> List[str]:
+        # Find latest eval set or earliest if all eval sets are before train set
+        date_column = self._get_date_column(search_keys)
+
+        if (
+            date_column is None
+            or not eval_set
+            or not eval_set_dates
+            or (self.cv is not None and self.cv.is_time_series())
+        ):
+            return []
+
+        # Get minimum date from main dataset X
+        main_min_date = X[date_column].min()
+
+        # Find minimum date for each eval_set and compare with main dataset
+        eval_dates = []
+        for i, (eval_x, _) in enumerate(eval_set):
+            if date_column in eval_x.columns:
+                eval_min_date = eval_x[date_column].min()
+                eval_max_date = eval_x[date_column].max()
+                eval_dates.append((i, eval_min_date, eval_max_date))
+
+        if not eval_dates:
+            return []
+
+        # Check if any eval_set has minimum date >= main dataset minimum date
+        later_eval_sets = [(i, min_date, max_date) for i, min_date, max_date in eval_dates if min_date >= main_min_date]
+
+        if later_eval_sets:
+            # If there are eval_sets with date >= main date, choose the one with highest maximum date
+            selected_eval_set_idx = max(later_eval_sets, key=lambda x: x[2])[0]
+        else:
+            # If all eval_sets have dates < main date, choose the one with lowest minimux date
+            selected_eval_set_idx = max(eval_dates, key=lambda x: x[1])[0]
+
+        checking_eval_set = enriched_eval_set[selected_eval_set_idx]
+
+        checking_eval_set_df = (
+            checking_eval_set[2]
+            if checking_eval_set[1] is None or checking_eval_set[1].isna().all()
+            else pd.concat([checking_eval_set[2], checking_eval_set[1].to_frame(TARGET)], axis=1)
+        )
+        checking_eval_set_df = checking_eval_set_df.copy()
+
+        checking_eval_set_df[date_column] = eval_set_dates[selected_eval_set_idx]
+
+        psi_values = calculate_features_psi(
+            checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
+        )
+
+        self.psi_values = {
+            feature: psi_value for feature, psi_value in psi_values.items() if psi_value <= stability_threshold
+        }

+        return [feature for feature, psi in psi_values.items() if psi > stability_threshold]
+
     def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float], silent: bool = False):
         renaming = self.fit_columns_renaming or {}
         self.logger.info(f"Updating SHAP values: {new_shaps}")
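calculate_features_psi itself lives in the new upgini/utils/psi.py (+268 lines, not shown in this diff). For orientation, a sketch of the textbook population stability index that stability_threshold is compared against — this is the standard definition, not the upgini implementation:

import numpy as np

def psi(expected: np.ndarray, actual: np.ndarray, bins: int = 10) -> float:
    # Bin by quantiles of the reference sample, then compare bin frequencies.
    edges = np.quantile(expected, np.linspace(0.0, 1.0, bins + 1))
    actual = np.clip(actual, edges[0], edges[-1])  # keep drifted values in the end bins
    e = np.histogram(expected, edges)[0] / len(expected)
    a = np.histogram(actual, edges)[0] / len(actual)
    e, a = np.clip(e, 1e-6, None), np.clip(a, 1e-6, None)  # avoid log(0)
    return float(np.sum((a - e) * np.log(a / e)))

A feature whose distribution drifts between train and the selected OOT window enough to push PSI above the threshold (0.15 by default) is reported unstable and dropped.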
@@ -1392,6 +1602,56 @@ class FeaturesEnricher(TransformerMixin):
         except (ImportError, NameError):
             pass

+    def _update_report_psi(self, trace_id: str, clients_features_df: pd.DataFrame):
+        self.__prepare_feature_importances(trace_id, clients_features_df)
+
+        if self.features_info_display_handle is not None:
+            try:
+                _ = get_ipython()  # type: ignore
+
+                display_html_dataframe(
+                    self.features_info,
+                    self._features_info_without_links,
+                    self.bundle.get("relevant_features_header"),
+                    display_handle=self.features_info_display_handle,
+                )
+            except (ImportError, NameError):
+                pass
+
+        if self.data_sources_display_handle is not None:
+            try:
+                _ = get_ipython()  # type: ignore
+
+                display_html_dataframe(
+                    self.relevant_data_sources,
+                    self._relevant_data_sources_wo_links,
+                    self.bundle.get("relevant_data_sources_header"),
+                    display_handle=self.data_sources_display_handle,
+                )
+            except (ImportError, NameError):
+                pass
+
+        if self.autofe_features_display_handle is not None:
+            try:
+                _ = get_ipython()  # type: ignore
+                autofe_descriptions_df = self.get_autofe_features_description()
+                if autofe_descriptions_df is not None:
+                    display_html_dataframe(
+                        df=autofe_descriptions_df,
+                        internal_df=autofe_descriptions_df,
+                        header=self.bundle.get("autofe_descriptions_header"),
+                        display_handle=self.autofe_features_display_handle,
+                    )
+            except (ImportError, NameError):
+                pass
+        if self.report_button_handle is not None:
+            try:
+                _ = get_ipython()  # type: ignore
+
+                self.__show_report_button(display_handle=self.report_button_handle)
+            except (ImportError, NameError):
+                pass
+
     def _check_train_and_eval_target_distribution(self, y, eval_set_dict):
         uneven_distribution = False
         for eval_set in eval_set_dict.values():
@@ -1542,7 +1802,7 @@ class FeaturesEnricher(TransformerMixin):
         is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
         is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
         checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
-        validated_X, validated_y, validated_eval_set …
+        validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set, silent=True)

         sampled_data = self._get_enriched_for_metrics(
             trace_id,
@@ -1556,7 +1816,7 @@ class FeaturesEnricher(TransformerMixin):
             progress_bar,
             progress_callback,
         )
-        X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = dataclasses.astuple(
+        (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming) = dataclasses.astuple(
             sampled_data
         )

@@ -1579,8 +1839,7 @@ class FeaturesEnricher(TransformerMixin):
                 or c in set(self.feature_names_).union(self.id_columns or [])
                 or (self.fit_columns_renaming or {}).get(c, c) in set(self.feature_names_).union(self.id_columns or [])
             )
-            and c
-            not in (
+            and c not in (
                 excluding_search_keys
                 + list(self.fit_dropped_features)
                 + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
@@ -1679,12 +1938,16 @@ class FeaturesEnricher(TransformerMixin):
         fitting_enriched_X = fitting_enriched_X[fitting_enriched_x_columns]
         fitting_enriched_X, _ = self._encode_id_columns(fitting_enriched_X, self.fit_columns_renaming)
         self.logger.info(f"Final sorted list of fitting enriched X columns: {fitting_enriched_x_columns}")
+        date_column = self._get_date_column(search_keys)
+        eval_set_dates = {}
         for idx, eval_tuple in eval_set_sampled_dict.items():
             eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
             eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
             enriched_eval_X_sorted, enriched_eval_y_sorted = self._sort_by_system_record_id(
                 enriched_eval_X, eval_y_sampled, self.cv
             )
+            if date_column is not None:
+                eval_set_dates[idx] = eval_X_sorted[date_column]
             fitting_eval_X = eval_X_sorted[fitting_x_columns].copy()
             fitting_enriched_eval_X = enriched_eval_X_sorted[fitting_enriched_x_columns].copy()

@@ -1729,6 +1992,7 @@ class FeaturesEnricher(TransformerMixin):
             groups,
             cv,
             columns_renaming,
+            eval_set_dates,
         )

     @dataclass
@@ -1891,14 +2155,16 @@ class FeaturesEnricher(TransformerMixin):
         remove_outliers_calc_metrics: Optional[bool],
     ) -> _EnrichedDataForMetrics:
         eval_set_sampled_dict = {}
-        search_keys = self.fit_search_keys
+        search_keys = self.fit_search_keys.copy()

         rows_to_drop = None
         has_date = self._get_date_column(search_keys) is not None
         self.model_task_type = self.model_task_type or define_task(
             self.df_with_original_index[TARGET], has_date, self.logger, silent=True
         )
-        if …
+        if remove_outliers_calc_metrics is None:
+            remove_outliers_calc_metrics = True
+        if self.model_task_type == ModelTaskType.REGRESSION and remove_outliers_calc_metrics:
             target_outliers_df = self._search_task.get_target_outliers(trace_id)
             if target_outliers_df is not None and len(target_outliers_df) > 0:
                 outliers = pd.merge(
@@ -1908,11 +2174,8 @@ class FeaturesEnricher(TransformerMixin):
                     how="inner",
                 )
                 top_outliers = outliers.sort_values(by=TARGET, ascending=False)[TARGET].head(3)
-                …
-                    not_msg = ""
-                else:
-                    not_msg = "not "
+                rows_to_drop = outliers
+                not_msg = ""
                 msg = self.bundle.get("target_outliers_warning").format(len(target_outliers_df), top_outliers, not_msg)
                 print(msg)
                 self.logger.warning(msg)
@@ -1938,8 +2201,11 @@ class FeaturesEnricher(TransformerMixin):
         )

         # Handle eval sets extraction based on EVAL_SET_INDEX
-        if EVAL_SET_INDEX in enriched_Xy.columns …
-            …
+        if EVAL_SET_INDEX in enriched_Xy.columns:
+            eval_set_indices = list(enriched_Xy[EVAL_SET_INDEX].unique())
+            if 0 in eval_set_indices:
+                eval_set_indices.remove(0)
+            for eval_set_index in eval_set_indices:
                 enriched_eval_sets[eval_set_index] = enriched_Xy.loc[
                     enriched_Xy[EVAL_SET_INDEX] == eval_set_index
                 ].copy()
@@ -1967,12 +2233,13 @@ class FeaturesEnricher(TransformerMixin):
             enriched_eval_X = enriched_eval_sets[idx + 1][enriched_X_columns].copy()
             eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)

-        reversed_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
-        X_sampled.rename(columns=…
-        enriched_X.rename(columns=…
+        # reversed_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
+        X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
+        enriched_X.rename(columns=self.fit_columns_renaming, inplace=True)
         for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
-            eval_X_sampled.rename(columns=…
-            enriched_eval_X.rename(columns=…
+            eval_X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
+            enriched_eval_X.rename(columns=self.fit_columns_renaming, inplace=True)
+        search_keys = {self.fit_columns_renaming.get(k, k): v for k, v in search_keys.items()}

         datasets_hash = hash_input(self.X, self.y, self.eval_set)
         return self.__cache_and_return_results(
@@ -2051,11 +2318,7 @@ class FeaturesEnricher(TransformerMixin):
         )

     def __combine_train_and_eval_sets(
-        self,
-        X: pd.DataFrame,
-        y: Optional[pd.Series] = None,
-        eval_set: Optional[List[tuple]] = None,
-        oot: Optional[pd.DataFrame] = None,
+        self, X: pd.DataFrame, y: Optional[pd.Series] = None, eval_set: Optional[List[tuple]] = None
     ) -> pd.DataFrame:
         df = X.copy()
         if y is not None:
@@ -2071,11 +2334,6 @@ class FeaturesEnricher(TransformerMixin):
             eval_df_with_index[TARGET] = eval_y
             eval_df_with_index[EVAL_SET_INDEX] = idx + 1
             df = pd.concat([df, eval_df_with_index])
-
-        if oot is not None:
-            oot_df_with_index = oot.copy()
-            oot_df_with_index[EVAL_SET_INDEX] = -1
-            df = pd.concat([df, oot_df_with_index])

         return df
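With the dedicated oot branch gone (it used to tag OOT rows with EVAL_SET_INDEX == -1), out-of-time frames reach __combine_train_and_eval_sets through eval_set like any other pair. A minimal sketch of the combined frame it builds, with TARGET and EVAL_SET_INDEX assumed to be the module-level column-name constants:

import pandas as pd

def combine(X, y, eval_set, TARGET="target", EVAL_SET_INDEX="eval_set_index"):
    df = X.copy()
    df[TARGET] = y
    for idx, (eval_X, eval_y) in enumerate(eval_set):
        eval_df = eval_X.copy()
        eval_df[TARGET] = eval_y           # all-NaN series for an OOT set
        eval_df[EVAL_SET_INDEX] = idx + 1  # eval pairs are tagged 1..n; 0 denotes train rows downstream
        df = pd.concat([df, eval_df])
    return df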
@@ -2125,15 +2383,15 @@ class FeaturesEnricher(TransformerMixin):

     def __extract_eval_data(
         self, enriched_df: pd.DataFrame, x_columns: List[str], enriched_X_columns: List[str], eval_set_len: int
-    ) -> Dict[int, Tuple]:
+    ) -> Tuple[Dict[int, Tuple], Dict[int, pd.Series]]:
         eval_set_sampled_dict = {}

-        for idx in range(…
-            enriched_eval_xy = enriched_df.query(f"{EVAL_SET_INDEX} == {idx}")
+        for idx in range(eval_set_len):
+            enriched_eval_xy = enriched_df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
             eval_x_sampled = enriched_eval_xy[x_columns].copy()
             eval_y_sampled = enriched_eval_xy[TARGET].copy()
             enriched_eval_x = enriched_eval_xy[enriched_X_columns].copy()
-            eval_set_sampled_dict[idx…
+            eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)

         return eval_set_sampled_dict

@@ -2171,12 +2429,12 @@ class FeaturesEnricher(TransformerMixin):
         columns_renaming: Dict[str, str],
     ):
         # X_sampled - with hash-suffixes
-        reversed_renaming = {v: k for k, v in columns_renaming.items()}
-        search_keys = {
-            reversed_renaming.get(k, k): v
-            for k, v in search_keys.items()
-            if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
-        }
+        # reversed_renaming = {v: k for k, v in columns_renaming.items()}
+        # search_keys = {
+        #     reversed_renaming.get(k, k): v
+        #     for k, v in search_keys.items()
+        #     if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
+        # }
         return FeaturesEnricher._EnrichedDataForMetrics(
             X_sampled=X_sampled,
             y_sampled=y_sampled,
@@ -2325,10 +2583,10 @@ if response.status_code == 200:
         with MDC(trace_id=trace_id, search_id=search_id):
             self.logger.info("Start transform")

-            validated_X, validated_y, …
-                X, y, is_transform=True
+            validated_X, validated_y, validated_eval_set = self._validate_train_eval(
+                X, y, eval_set=None, is_transform=True, silent=True
             )
-            df = self.__combine_train_and_eval_sets(validated_X, validated_y)
+            df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)

             validated_Xy = df.copy()

@@ -2428,7 +2686,7 @@ if response.status_code == 200:
         else:
             self.logger.info("Input dataset hasn't date column")
             if self.__should_add_date_column():
-                df = self._add_current_date_as_key(df, search_keys, self.…
+                df = self._add_current_date_as_key(df, search_keys, self.bundle, silent=True)

         email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
         if email_columns and self.generate_search_key_features:
@@ -2677,7 +2935,8 @@ if response.status_code == 200:
             selecting_columns = [
                 c
                 for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
-                if c not in self.zero_shap_client_features
+                if (c not in self.zero_shap_client_features and c not in self.unstable_client_features)
+                or c in (self.id_columns or [])
             ]
             selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
             if add_fit_system_record_id:
@@ -2803,15 +3062,15 @@ if response.status_code == 200:
         X: Union[pd.DataFrame, pd.Series, np.ndarray],
         y: Union[pd.DataFrame, pd.Series, np.ndarray, List, None],
         eval_set: Optional[List[tuple]],
-        *,
-        oot: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
         progress_bar: Optional[ProgressBar],
         start_time: int,
+        *,
         exclude_features_sources: Optional[List[str]] = None,
         calculate_metrics: Optional[bool],
         scoring: Union[Callable, str, None],
         estimator: Optional[Any],
         importance_threshold: Optional[float],
+        stability_threshold: float,
         max_features: Optional[int],
         remove_outliers_calc_metrics: Optional[bool],
         auto_fe_parameters: AutoFEParameters,
@@ -2826,8 +3085,9 @@ if response.status_code == 200:
         self.fit_columns_renaming = None
         self.fit_dropped_features = set()
         self.fit_generated_features = []
+        self.psi_values = None

-        validated_X, validated_y, validated_eval_set …
+        validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, eval_set)

         is_demo_dataset = hash_input(validated_X, validated_y, validated_eval_set) in DEMO_DATASET_HASHES
         if is_demo_dataset:
@@ -2868,7 +3128,6 @@ if response.status_code == 200:
             validated_X,
             validated_y,
             validated_eval_set,
-            validated_oot,
             exclude_features_sources=exclude_features_sources,
             calculate_metrics=calculate_metrics,
             scoring=scoring,
@@ -2876,7 +3135,7 @@ if response.status_code == 200:
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
         )

-        df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set…
+        df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
         self.id_columns_encoder = OrdinalEncoder().fit(df[self.id_columns or []])

         self.fit_search_keys = self.search_keys.copy()
@@ -2923,7 +3182,7 @@ if response.status_code == 200:
         self.logger.info("Input dataset hasn't date column")
         # TODO remove when this logic will be implemented on the back
         if self.__should_add_date_column():
-            df = self._add_current_date_as_key(df, self.fit_search_keys, self.…
+            df = self._add_current_date_as_key(df, self.fit_search_keys, self.bundle)

         email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
         if email_columns and self.generate_search_key_features:
@@ -2938,10 +3197,13 @@ if response.status_code == 200:
         except Exception:
             self.logger.exception("Failed to check dates distribution validity")

+        self.__adjust_cv(df)
+
         if (
             is_numeric_dtype(df[self.TARGET_NAME])
             and self.model_task_type in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]
             and has_date
+            and not self.cv.is_time_series()
         ):
             self._validate_PSI(df.sort_values(by=maybe_date_column))

@@ -2973,8 +3235,9 @@ if response.status_code == 200:

         self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]

+        # Group columns should have normalized names
+        self.cv = None
         self.__adjust_cv(df)
-
         if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
             id_columns = self.__get_renamed_id_columns()
             if id_columns:
@@ -3179,6 +3442,21 @@ if response.status_code == 200:
             display_id=f"autofe_descriptions_{uuid.uuid4()}",
         )

+        self._select_features_by_psi(
+            trace_id=trace_id,
+            X=X,
+            y=y,
+            eval_set=eval_set,
+            stability_threshold=stability_threshold,
+            cv=self.cv,
+            estimator=estimator,
+            exclude_features_sources=exclude_features_sources,
+            importance_threshold=importance_threshold,
+            max_features=max_features,
+            progress_bar=progress_bar,
+            progress_callback=progress_callback,
+        )
+
         if self._has_paid_features(exclude_features_sources):
             if calculate_metrics is not None and calculate_metrics:
                 msg = self.bundle.get("metrics_with_paid_features")
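Once the search completes, __inner_fit invokes the PSI selection step shown above, and its outcome is left on the enricher. A hypothetical inspection after fitting, based on the attributes this diff introduces:

enricher.fit(train_X, train_y, eval_set=[oot_X], stability_threshold=0.15)

print(enricher.psi_values)
# PSI is kept only for features at or below the threshold, e.g. {"feature_a": 0.02}
print(enricher.unstable_client_features)
# client columns found unstable; transform() now excludes them from its output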
@@ -3303,14 +3581,13 @@ if response.status_code == 200:
         X: pd.DataFrame,
         y: Optional[pd.Series] = None,
         eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]] = None,
-        oot: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
         is_transform: bool = False,
+        silent: bool = False,
     ) -> Tuple[pd.DataFrame, pd.Series, Optional[List[Tuple[pd.DataFrame, pd.Series]]]]:
         validated_X = self._validate_X(X, is_transform)
         validated_y = self._validate_y(validated_X, y, enforce_y=not is_transform)
-        validated_eval_set = self._validate_eval_set(validated_X, eval_set)
-
-        return validated_X, validated_y, validated_eval_set, validated_oot
+        validated_eval_set = self._validate_eval_set(validated_X, eval_set, silent=silent)
+        return validated_X, validated_y, validated_eval_set

     def _encode_id_columns(
         self,
@@ -3441,53 +3718,30 @@ if response.status_code == 200:

         return validated_y

-    def _validate_eval_set(…
+    def _validate_eval_set(
+        self, X: pd.DataFrame, eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]], silent: bool = False
+    ):
         if eval_set is None:
             return None
-        …
-        validated_oot = validated_oot.rename(columns=renaming)
-        else:
-            raise ValidationError(self.bundle.get("unsupported_type_oot").format(type(oot)))
-
-        if not validated_oot.index.is_unique:
-            raise ValidationError(self.bundle.get("non_unique_index_oot"))
-
-        if self.exclude_columns is not None:
-            validated_oot = validated_oot.drop(columns=self.exclude_columns, errors="ignore")
-
-        if self.baseline_score_column:
-            validated_oot[self.baseline_score_column] = validated_oot[self.baseline_score_column].astype(
-                "float64", errors="ignore"
-            )
-
-        if validated_oot.columns.to_list() != X.columns.to_list():
-            if set(validated_oot.columns.to_list()) == set(X.columns.to_list()):
-                validated_oot = validated_oot[X.columns.to_list()]
-            else:
-                raise ValidationError(self.bundle.get("oot_and_x_diff_shape"))
-
-        # Check for duplicates between train and eval sets by comparing all values
-        train_eval_intersection = pd.merge(X, validated_oot, how="inner")
-        if len(train_eval_intersection) > 0:
-            raise ValidationError(self.bundle.get("oot_has_train_samples"))
+        validated_eval_set = []
+        has_date = self._get_date_column(self.search_keys) is not None
+        for idx, eval_pair in enumerate(eval_set):
+            validated_pair = self._validate_eval_set_pair(X, eval_pair)
+            if validated_pair[1].isna().all():
+                if not has_date:
+                    msg = self.bundle.get("oot_without_date_not_supported").format(idx + 1)
+                elif self.columns_for_online_api:
+                    msg = self.bundle.get("oot_with_online_sources_not_supported").format(idx + 1)
+                else:
+                    msg = None
+                if msg:
+                    if not silent:
+                        print(msg)
+                    self.logger.warning(msg)
+                    continue
+            validated_eval_set.append(validated_pair)

-        return …
+        return validated_eval_set

     def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
         if len(eval_pair) != 2:
@@ -3562,16 +3816,18 @@ if response.status_code == 200:
             raise ValidationError(self.bundle.get("unsupported_y_type_eval_set").format(type(eval_y)))

         eval_y_nunique = validated_eval_y.nunique()
-        …
+        is_oot = validated_eval_y.isna().all()
+        if not is_oot and eval_y_nunique < 2:
             raise ValidationError(self.bundle.get("y_is_constant_eval_set"))

-        if self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
+        if not is_oot and self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
             raise ValidationError(self.bundle.get("binary_target_eval_unique_count_not_2").format(eval_y_nunique))

-        …
+        if not is_oot:
+            # Check for duplicates between train and eval sets by comparing all values
+            train_eval_intersection = pd.merge(X, validated_eval_X, how="inner")
+            if len(train_eval_intersection) > 0:
+                raise ValidationError(self.bundle.get("eval_x_has_train_samples"))

         return validated_eval_X, validated_eval_y

@@ -3587,10 +3843,12 @@ if response.status_code == 200:
         if isinstance(eval_set, tuple):
             eval_set = [eval_set]
         for eval in eval_set:
-            …
+            is_oot = eval[1].isna().all()
+            if not is_oot:
+                if self.baseline_score_column not in eval[0].columns:
+                    raise ValidationError(self.bundle.get("baseline_score_column_not_exists"))
+                if eval[0][self.baseline_score_column].isna().any():
+                    raise ValidationError(self.bundle.get("baseline_score_column_has_na"))

     @staticmethod
     def _sample_X_and_y(X: pd.DataFrame, y: pd.Series, enriched_X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
@@ -3660,7 +3918,6 @@ if response.status_code == 200:
         X: pd.DataFrame,
         y: Union[pd.Series, np.ndarray, list, None] = None,
         eval_set: Optional[List[tuple]] = None,
-        oot: Optional[pd.DataFrame] = None,
         exclude_features_sources: Optional[List[str]] = None,
         calculate_metrics: Optional[bool] = None,
         cv: Optional[Any] = None,
@@ -3729,8 +3986,6 @@ if response.status_code == 200:
             self.logger.info(
                 f"First 10 rows of the eval_y_{idx} with shape {_num_samples(eval_y)}:\n{sample(eval_y)}"
             )
-            if oot is not None:
-                self.logger.info(f"First 10 rows of the oot with shape {oot.shape}:\n{sample(oot)}")

         do_without_pandas_limits(print_datasets_sample)

@@ -3767,7 +4022,7 @@ if response.status_code == 200:
         return df

     def _add_current_date_as_key(
-        self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], …
+        self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], bundle: ResourceBundle, silent: bool = False
     ) -> pd.DataFrame:
         if (
             set(search_keys.values()) == {SearchKey.PHONE}
@@ -3775,7 +4030,8 @@ if response.status_code == 200:
             or set(search_keys.values()) == {SearchKey.HEM}
             or set(search_keys.values()) == {SearchKey.COUNTRY, SearchKey.POSTAL_CODE}
         ):
-            …
+            if not silent:
+                self.__log_warning(bundle.get("current_date_added"))
             df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
             search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
             converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, generate_cyclical_features=False)
@@ -4110,7 +4366,11 @@ if response.status_code == 200:
         return [f.name for f in features_meta if f.type == "categorical" and f.name not in (self.id_columns or [])]

     def __prepare_feature_importances(
-        self, …
+        self,
+        trace_id: str,
+        clients_features_df: pd.DataFrame,
+        updated_shaps: Optional[Dict[str, float]] = None,
+        silent=False,
     ):
         if self._search_task is None:
             raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
@@ -4123,11 +4383,12 @@ if response.status_code == 200:
         features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)

         # To be sure that names with hash suffixes
-        …
+        clients_features_df = clients_features_df.rename(columns=original_names_dict)

         self.feature_names_ = []
         self.external_source_feature_names = []
         self.zero_shap_client_features = []
+        self.unstable_client_features = []
         self.feature_importances_ = []
         features_info = []
         features_info_without_links = []
@@ -4136,10 +4397,10 @@ if response.status_code == 200:
         original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}

         for feature_meta in features_meta:
-            …
-            …
+            original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
+            feature_meta.name = original_name

-            is_client_feature = …
+            is_client_feature = original_name in clients_features_df.columns

             # Show and update shap values for client features only if select_features is True
             if updated_shaps is not None and (not is_client_feature or self.fit_select_features):
@@ -4156,12 +4417,21 @@ if response.status_code == 200:

         for feature_meta in features_meta:
             original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
-            is_client_feature = original_name in …
+            is_client_feature = original_name in clients_features_df.columns

             if not is_client_feature:
                 self.external_source_feature_names.append(original_name)

+            if self.psi_values is not None:
+                if original_name in self.psi_values:
+                    feature_meta.psi_value = self.psi_values[original_name]
+                else:
+                    if is_client_feature and self.fit_select_features:
+                        self.unstable_client_features.append(original_name)
+                    continue
+
             # TODO make a decision about selected features based on special flag from mlb
+
             if original_shaps.get(feature_meta.name, 0.0) == 0.0:
                 if is_client_feature and self.fit_select_features:
                     self.zero_shap_client_features.append(original_name)
@@ -4185,7 +4455,7 @@ if response.status_code == 200:
             self.feature_names_.append(feature_meta.name)
             self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))

-            df_for_sample = features_df if feature_meta.name in features_df.columns else …
+            df_for_sample = features_df if feature_meta.name in features_df.columns else clients_features_df
             feature_info = FeatureInfo.from_metadata(feature_meta, df_for_sample, is_client_feature)
             features_info.append(feature_info.to_row(self.bundle))
             features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
@@ -4193,6 +4463,8 @@ if response.status_code == 200:

         if len(features_info) > 0:
             self.features_info = pd.DataFrame(features_info)
+            if self.features_info[self.bundle.get("features_info_psi")].isna().all():
+                self.features_info.drop(columns=[self.bundle.get("features_info_psi")], inplace=True)
             self._features_info_without_links = pd.DataFrame(features_info_without_links)
             self._internal_features_info = pd.DataFrame(internal_features_info)
             if not silent:
@@ -4640,7 +4912,7 @@ if response.status_code == 200:
             print(msg)

     def _validate_PSI(self, df: pd.DataFrame):
-        if EVAL_SET_INDEX in df.columns …
+        if EVAL_SET_INDEX in df.columns:
            train = df.query(f"{EVAL_SET_INDEX} == 0")
            eval1 = df.query(f"{EVAL_SET_INDEX} == 1")
        else: