upgini 1.2.113a3974.dev2__py3-none-any.whl → 1.2.114a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/dataset.py +3 -1
- upgini/features_enricher.py +462 -136
- upgini/metadata.py +1 -0
- upgini/metrics.py +6 -2
- upgini/resource_bundle/strings.properties +4 -1
- upgini/sampler/base.py +3 -1
- upgini/sampler/random_under_sampler.py +18 -8
- upgini/utils/deduplicate_utils.py +43 -7
- upgini/utils/feature_info.py +5 -0
- upgini/utils/psi.py +294 -0
- {upgini-1.2.113a3974.dev2.dist-info → upgini-1.2.114a1.dist-info}/METADATA +31 -17
- {upgini-1.2.113a3974.dev2.dist-info → upgini-1.2.114a1.dist-info}/RECORD +15 -14
- {upgini-1.2.113a3974.dev2.dist-info → upgini-1.2.114a1.dist-info}/WHEEL +1 -1
- {upgini-1.2.113a3974.dev2.dist-info → upgini-1.2.114a1.dist-info}/licenses/LICENSE +0 -0
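The headline change in 1.2.114a1 is out-of-time (OOT) stability selection: fit and fit_transform gain a stability_threshold parameter (default 0.15), selected features are scored with PSI against the latest eval segment, and unstable ones are dropped. A minimal usage sketch, assuming illustrative file and column names (only FeaturesEnricher, SearchKey, and the new parameter are taken from this diff):

import pandas as pd
from upgini import FeaturesEnricher, SearchKey

train = pd.read_csv("train.csv")   # has "date" and "target" columns (names illustrative)
oot = pd.read_csv("oot.csv")       # later time period, no target required

enricher = FeaturesEnricher(search_keys={"date": SearchKey.DATE})
enricher.fit(
    train.drop(columns=["target"]),
    train["target"],
    eval_set=[oot],                # a bare DataFrame is now treated as an OOT segment
    stability_threshold=0.15,      # features whose PSI exceeds this value are dropped
)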
upgini/features_enricher.py
CHANGED
@@ -112,6 +112,7 @@ except Exception:
         CustomFallbackProgressBar as ProgressBar,
     )
 
+from upgini.utils.psi import calculate_features_psi, calculate_sparsity_psi
 from upgini.utils.sample_utils import SampleColumns, SampleConfig, _num_samples, sample
 from upgini.utils.sort import sort_columns
 from upgini.utils.target_utils import calculate_psi, define_task
@@ -297,7 +298,9 @@ class FeaturesEnricher(TransformerMixin):
         self.feature_names_ = []
         self.external_source_feature_names = []
         self.zero_shap_client_features = []
+        self.unstable_client_features = []
         self.feature_importances_ = []
+        self.psi_values: Optional[Dict[str, float]] = None
         self.search_id = search_id
         self.disable_force_downsampling = disable_force_downsampling
         self.print_trace_id = print_trace_id
@@ -398,13 +401,26 @@ class FeaturesEnricher(TransformerMixin):
     @staticmethod
     def _check_eval_set(eval_set, X, bundle: ResourceBundle):
         checked_eval_set = []
-        if eval_set is
+        if eval_set is None:
+            return checked_eval_set
+        if isinstance(eval_set, tuple):
             eval_set = [eval_set]
-        if
+        if not isinstance(eval_set, list):
             raise ValidationError(bundle.get("unsupported_type_eval_set").format(type(eval_set)))
         for eval_pair in eval_set or []:
+            # Handle OOT
+            if isinstance(eval_pair, pd.DataFrame):
+                empty_target = pd.Series([np.nan] * len(eval_pair), index=eval_pair.index)
+                eval_pair = (eval_pair, empty_target)
+            elif isinstance(eval_pair, tuple) and len(eval_pair) == 1:
+                empty_target = pd.Series([np.nan] * len(eval_pair[0]), index=eval_pair[0].index)
+                eval_pair = (eval_pair[0], empty_target)
+
             if not isinstance(eval_pair, tuple) or len(eval_pair) != 2:
                 raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
+            if eval_pair[1] is None:
+                empty_target = pd.Series([np.nan] * len(eval_pair[0]), index=eval_pair[0].index)
+                eval_pair = (eval_pair[0], empty_target)
             if not is_frames_equal(X, eval_pair[0], bundle):
                 checked_eval_set.append(eval_pair)
         return checked_eval_set
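Per the hunk above, an out-of-time segment (eval data without a target) can now be passed in any of three equivalent spellings; each is normalized to a pair with an all-NaN target series. Illustrative calls (oot_df is a hypothetical DataFrame of later-period rows):

enricher.fit(X, y, eval_set=[oot_df])          # bare DataFrame
enricher.fit(X, y, eval_set=[(oot_df,)])       # 1-tuple
enricher.fit(X, y, eval_set=[(oot_df, None)])  # explicit None target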
@@ -426,6 +442,7 @@ class FeaturesEnricher(TransformerMixin):
         search_id_callback: Optional[Callable[[str], Any]] = None,
         select_features: bool = True,
         auto_fe_parameters: Optional[AutoFEParameters] = None,
+        stability_threshold: float = 0.15,
         **kwargs,
     ):
         """Fit to data.
@@ -515,6 +532,7 @@ class FeaturesEnricher(TransformerMixin):
             estimator=estimator,
             scoring=scoring,
             importance_threshold=importance_threshold,
+            stability_threshold=stability_threshold,
             max_features=max_features,
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
             auto_fe_parameters=auto_fe_parameters,
@@ -574,6 +592,7 @@ class FeaturesEnricher(TransformerMixin):
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
         select_features: bool = True,
         auto_fe_parameters: Optional[AutoFEParameters] = None,
+        stability_threshold: float = 0.15,
         **kwargs,
     ) -> pd.DataFrame:
         """Fit to data, then transform it.
@@ -618,6 +637,10 @@ class FeaturesEnricher(TransformerMixin):
             If True, return only selected features both from input and data sources.
             Otherwise, return all features from input and only selected features from data sources.
 
+        stability_threshold: float, optional (default=0.15)
+            Stability threshold for PSI calculation on the selected features. If a feature's PSI
+            exceeds this threshold, the feature will be dropped.
+
         Returns
         -------
         X_new: pandas.DataFrame of shape (n_samples, n_features_new)
@@ -674,6 +697,7 @@ class FeaturesEnricher(TransformerMixin):
             scoring=scoring,
             estimator=estimator,
             importance_threshold=importance_threshold,
+            stability_threshold=stability_threshold,
             max_features=max_features,
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
             auto_fe_parameters=auto_fe_parameters,
@@ -941,7 +965,7 @@ class FeaturesEnricher(TransformerMixin):
             raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
 
         validated_X, validated_y, validated_eval_set = self._validate_train_eval(
-            effective_X, effective_y, effective_eval_set
+            effective_X, effective_y, effective_eval_set, silent=internal_call
         )
 
         if self.X is None:
@@ -979,29 +1003,27 @@ class FeaturesEnricher(TransformerMixin):
             return None
 
         cat_features_from_backend = self.__get_categorical_features()
+        # Convert to original names
+        cat_features_from_backend = [self.fit_columns_renaming.get(c, c) for c in cat_features_from_backend]
         client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
             estimator, validated_X, self.search_keys
         )
+        # Exclude id columns from cat_features
         if self.id_columns and self.id_columns_encoder is not None:
             if cat_features_from_backend:
                 cat_features_from_backend = [
-                    c
-                    for c in cat_features_from_backend
-                    if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
+                    c for c in cat_features_from_backend if c not in self.id_columns_encoder.feature_names_in_
                 ]
             if client_cat_features:
                 client_cat_features = [
-                    c
-                    for c in client_cat_features
-                    if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
+                    c for c in client_cat_features if c not in self.id_columns_encoder.feature_names_in_
                 ]
         for cat_feature in cat_features_from_backend:
-                    search_keys_for_metrics.append(original_cat_feature)
+            if cat_feature in self.search_keys:
+                if self.search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
+                    search_keys_for_metrics.append(cat_feature)
             else:
-                self.logger.warning(self.bundle.get("cat_feature_search_key").format(
+                self.logger.warning(self.bundle.get("cat_feature_search_key").format(cat_feature))
         search_keys_for_metrics.extend([c for c in self.id_columns or [] if c not in search_keys_for_metrics])
         self.logger.info(f"Search keys for metrics: {search_keys_for_metrics}")
 
@@ -1033,23 +1055,9 @@ class FeaturesEnricher(TransformerMixin):
             groups,
             _cv,
             columns_renaming,
+            _,
         ) = prepared_data
 
-        # rename cat_features
-        if client_cat_features:
-            for new_c, old_c in columns_renaming.items():
-                if old_c in client_cat_features:
-                    client_cat_features.remove(old_c)
-                    client_cat_features.append(new_c)
-            for cat_feature in client_cat_features:
-                if cat_feature not in fitting_X.columns:
-                    self.logger.error(
-                        f"Client cat_feature `{cat_feature}` not found in"
-                        f" x columns: {fitting_X.columns.to_list()}"
-                    )
-        else:
-            client_cat_features = []
-
         # rename baseline_score_column
         reversed_renaming = {v: k for k, v in columns_renaming.items()}
         baseline_score_column = self.baseline_score_column
@@ -1074,9 +1082,9 @@ class FeaturesEnricher(TransformerMixin):
         self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
 
         has_date = self._get_date_column(search_keys) is not None
-        has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
         model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
         cat_features = list(set(client_cat_features + cat_features_from_backend))
+        has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
         baseline_cat_features = [f for f in cat_features if f in fitting_X.columns]
         enriched_cat_features = [f for f in cat_features if f in fitting_enriched_X.columns]
         if len(enriched_cat_features) < len(cat_features):
@@ -1196,8 +1204,6 @@ class FeaturesEnricher(TransformerMixin):
         # max_initial_eval_set_hit_rate = self._search_task.get_max_initial_eval_set_hit_rate_v2()
         if len(fitting_eval_set_dict) > 0:
             for idx in fitting_eval_set_dict.keys():
-                # eval_hit_rate = max_initial_eval_set_hit_rate[idx + 1]
-
                 (
                     eval_X_sorted,
                     eval_y_sorted,
@@ -1205,6 +1211,10 @@ class FeaturesEnricher(TransformerMixin):
                     enriched_eval_y_sorted,
                 ) = fitting_eval_set_dict[idx]
 
+                if eval_y_sorted.isna().all():
+                    # Skip OOT eval set
+                    continue
+
                 if baseline_estimator is not None:
                     self.logger.info(
                         f"Calculate baseline {metric} on eval set {idx + 1} "
@@ -1247,17 +1257,14 @@ class FeaturesEnricher(TransformerMixin):
                         "quality_metrics_eval_segment"
                     ).format(idx + 1),
                     self.bundle.get("quality_metrics_rows_header"): _num_samples(
-                        # effective_eval_set[idx][0]
                         # Use actually used for metrics dataset
                         eval_X_sorted
                     ),
-                    # self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
                 }
                 if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
                     eval_y_sorted
                 ):
                     eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
-                        # np.mean(validated_eval_set[idx][1]), 4
                         # Use actually used for metrics dataset
                         np.mean(eval_y_sorted),
                         4,
@@ -1279,7 +1286,7 @@ class FeaturesEnricher(TransformerMixin):
                 metrics.append(eval_metrics)
 
             if updating_shaps is not None:
-                decoded_X = self._decode_id_columns(fitting_X
+                decoded_X = self._decode_id_columns(fitting_X)
                 self._update_shap_values(trace_id, decoded_X, updating_shaps, silent=not internal_call)
 
             metrics_df = pd.DataFrame(metrics)
@@ -1330,6 +1337,198 @@ class FeaturesEnricher(TransformerMixin):
         finally:
             self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
 
+    def _select_features_by_psi(
+        self,
+        trace_id: str,
+        X: Union[pd.DataFrame, pd.Series, np.ndarray],
+        y: Union[pd.DataFrame, pd.Series, np.ndarray, List],
+        eval_set: Optional[Union[List[tuple], tuple]],
+        stability_threshold: float,
+        cv: Union[BaseCrossValidator, CVType, str, None] = None,
+        estimator=None,
+        exclude_features_sources: Optional[List[str]] = None,
+        importance_threshold: Optional[float] = None,
+        max_features: Optional[int] = None,
+        progress_bar: bool = True,
+        progress_callback: Optional[Callable] = None,
+    ):
+        search_keys = self.search_keys.copy()
+        validated_X, _, validated_eval_set = self._validate_train_eval(X, y, eval_set, silent=True)
+        if isinstance(X, np.ndarray):
+            search_keys = {str(k): v for k, v in search_keys.items()}
+
+        date_column = self._get_date_column(search_keys)
+        has_date = date_column is not None
+        if not has_date:
+            self.logger.info("No date column for OOT PSI calculation")
+            return
+        if not validated_eval_set:
+            self.logger.info("No eval set for OOT PSI calculation")
+            return
+        if validated_X[date_column].nunique() <= 1:
+            self.logger.warning("Constant date for OOT PSI calculation")
+            return
+        if self.cv is not None and self.cv.is_time_series():
+            self.logger.warning("Time series CV is not supported for OOT PSI calculation")
+            return
+
+        cat_features_from_backend = self.__get_categorical_features()
+        cat_features_from_backend = [self.fit_columns_renaming.get(c, c) for c in cat_features_from_backend]
+        client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
+            estimator, validated_X, search_keys
+        )
+        if self.id_columns and self.id_columns_encoder is not None:
+            if cat_features_from_backend:
+                cat_features_from_backend = [
+                    c for c in cat_features_from_backend if c not in self.id_columns_encoder.feature_names_in_
+                ]
+            if client_cat_features:
+                client_cat_features = [
+                    c for c in client_cat_features if c not in self.id_columns_encoder.feature_names_in_
+                ]
+
+        prepared_data = self._prepare_data_for_metrics(
+            trace_id=trace_id,
+            X=X,
+            y=y,
+            eval_set=eval_set,
+            exclude_features_sources=exclude_features_sources,
+            importance_threshold=importance_threshold,
+            max_features=max_features,
+            remove_outliers_calc_metrics=False,
+            cv_override=cv,
+            search_keys_for_metrics=search_keys_for_metrics,
+            progress_bar=progress_bar,
+            progress_callback=progress_callback,
+            client_cat_features=client_cat_features,
+        )
+        if prepared_data is None:
+            return None
+
+        (
+            validated_X,
+            fitting_X,
+            y_sorted,
+            fitting_enriched_X,
+            _,
+            fitting_eval_set_dict,
+            _,
+            _,
+            _,
+            columns_renaming,
+            eval_set_dates,
+        ) = prepared_data
+
+        model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
+        cat_features = list(set(client_cat_features + cat_features_from_backend))
+
+        # Drop unstable features
+        unstable_features = self._check_stability(
+            validated_X,
+            validated_eval_set,
+            fitting_eval_set_dict,
+            eval_set_dates,
+            search_keys,
+            stability_threshold,
+            cat_features,
+            model_task_type,
+        )
+        client_features_df = self.df_with_original_index.rename(columns=columns_renaming)
+        # decoded_X = self._decode_id_columns(fitting_X, columns_renaming)
+        self._update_report_psi(trace_id, client_features_df)
+
+        if unstable_features:
+            msg = f"Some features are unstable: {unstable_features} and will be dropped"
+            self.logger.warning(msg)
+            print(msg)
+            fitting_X = fitting_X.drop(columns=unstable_features, errors="ignore")
+            fitting_enriched_X = fitting_enriched_X.drop(columns=unstable_features, errors="ignore")
+            msg = f"There are {len(fitting_enriched_X.columns)} stable selected features left"
+            self.logger.info(msg)
+            print(msg)
+            for idx, (
+                eval_X,
+                eval_y,
+                eval_enriched_X,
+                eval_enriched_y,
+            ) in fitting_eval_set_dict.items():
+                eval_X = eval_X.drop(columns=unstable_features, errors="ignore")
+                eval_enriched_X = eval_enriched_X.drop(columns=unstable_features, errors="ignore")
+                fitting_eval_set_dict[idx] = (eval_X, eval_y, eval_enriched_X, eval_enriched_y)
+
+    def _check_stability(
+        self,
+        X: pd.DataFrame,
+        eval_set: List[Tuple[pd.DataFrame, pd.Series]],
+        enriched_eval_set: Dict,
+        eval_set_dates: Dict[int, pd.Series],
+        search_keys: Dict[str, SearchKey],
+        stability_threshold: float,
+        cat_features: List[str],
+        model_task_type: ModelTaskType,
+    ) -> List[str]:
+        # Find latest eval set, or earliest if all eval sets are before the train set
+        date_column = self._get_date_column(search_keys)
+
+        # Get minimum date from main dataset X
+        main_min_date = X[date_column].min()
+
+        # Find minimum date for each eval_set and compare with main dataset
+        eval_dates = []
+        for i, (eval_x, _) in enumerate(eval_set):
+            if date_column in eval_x.columns:
+                eval_min_date = eval_x[date_column].min()
+                eval_max_date = eval_x[date_column].max()
+                eval_dates.append((i, eval_min_date, eval_max_date))
+
+        if not eval_dates:
+            return []
+
+        # Check if any eval_set has minimum date >= main dataset minimum date
+        later_eval_sets = [(i, min_date, max_date) for i, min_date, max_date in eval_dates if min_date >= main_min_date]
+
+        if later_eval_sets:
+            # If there are eval_sets with date >= main date, choose the one with the highest maximum date
+            selected_eval_set_idx = max(later_eval_sets, key=lambda x: x[2])[0]
+        else:
+            # If all eval_sets have dates < main date, choose the one with the lowest minimum date
+            selected_eval_set_idx = max(eval_dates, key=lambda x: x[1])[0]
+
+        checking_eval_set = enriched_eval_set[selected_eval_set_idx]
+
+        checking_eval_set_df = (
+            checking_eval_set[2]
+            if checking_eval_set[1] is None or checking_eval_set[1].isna().all()
+            else pd.concat([checking_eval_set[2], checking_eval_set[1].to_frame(TARGET)], axis=1)
+        )
+        checking_eval_set_df = checking_eval_set_df.copy()
+
+        checking_eval_set_df[date_column] = eval_set_dates[selected_eval_set_idx]
+
+        psi_values_sparse = calculate_sparsity_psi(
+            checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
+        )
+
+        unstable_by_sparsity = [feature for feature, psi in psi_values_sparse.items() if psi > stability_threshold]
+        if unstable_by_sparsity:
+            self.logger.info(f"Unstable by sparsity features: {sorted(unstable_by_sparsity)}")
+
+        psi_values = calculate_features_psi(
+            checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
+        )
+
+        unstable_by_value = [feature for feature, psi in psi_values.items() if psi > stability_threshold]
+        if unstable_by_value:
+            self.logger.info(f"Unstable by value features: {sorted(unstable_by_value)}")
+
+        self.psi_values = {
+            feature: psi_value for feature, psi_value in psi_values.items() if psi_value <= stability_threshold
+        }
+
+        total_unstable_features = sorted(set(unstable_by_sparsity + unstable_by_value))
+
+        return total_unstable_features
+
     def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float], silent: bool = False):
         renaming = self.fit_columns_renaming or {}
         self.logger.info(f"Updating SHAP values: {new_shaps}")
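_check_stability delegates the drift measurement to calculate_features_psi and calculate_sparsity_psi from the new upgini/utils/psi.py (294 added lines, not shown in this diff). For reference, the standard Population Stability Index between a reference distribution with bin shares e_i and a comparison distribution with bin shares a_i is PSI = sum_i (a_i - e_i) * ln(a_i / e_i). A minimal sketch of that textbook formula, for illustration only; the helpers in psi.py may bin and aggregate differently:

import numpy as np

def psi(expected: np.ndarray, actual: np.ndarray, bins: int = 10, eps: float = 1e-6) -> float:
    # Bin edges come from the reference sample; outer bins are open-ended
    edges = np.linspace(expected.min(), expected.max(), bins + 1)
    edges[0], edges[-1] = -np.inf, np.inf
    e = np.histogram(expected, edges)[0] / len(expected) + eps
    a = np.histogram(actual, edges)[0] / len(actual) + eps
    return float(np.sum((a - e) * np.log(a / e)))

Rule-of-thumb PSI cutoffs of 0.1 to 0.25 are commonly used; the default stability_threshold of 0.15 sits in that range.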
@@ -1385,6 +1584,56 @@ class FeaturesEnricher(TransformerMixin):
         except (ImportError, NameError):
             pass
 
+    def _update_report_psi(self, trace_id: str, clients_features_df: pd.DataFrame):
+        self.__prepare_feature_importances(trace_id, clients_features_df)
+
+        if self.features_info_display_handle is not None:
+            try:
+                _ = get_ipython()  # type: ignore
+
+                display_html_dataframe(
+                    self.features_info,
+                    self._features_info_without_links,
+                    self.bundle.get("relevant_features_header"),
+                    display_handle=self.features_info_display_handle,
+                )
+            except (ImportError, NameError):
+                pass
+
+        if self.data_sources_display_handle is not None:
+            try:
+                _ = get_ipython()  # type: ignore
+
+                display_html_dataframe(
+                    self.relevant_data_sources,
+                    self._relevant_data_sources_wo_links,
+                    self.bundle.get("relevant_data_sources_header"),
+                    display_handle=self.data_sources_display_handle,
+                )
+            except (ImportError, NameError):
+                pass
+
+        if self.autofe_features_display_handle is not None:
+            try:
+                _ = get_ipython()  # type: ignore
+                autofe_descriptions_df = self.get_autofe_features_description()
+                if autofe_descriptions_df is not None:
+                    display_html_dataframe(
+                        df=autofe_descriptions_df,
+                        internal_df=autofe_descriptions_df,
+                        header=self.bundle.get("autofe_descriptions_header"),
+                        display_handle=self.autofe_features_display_handle,
+                    )
+            except (ImportError, NameError):
+                pass
+        if self.report_button_handle is not None:
+            try:
+                _ = get_ipython()  # type: ignore
+
+                self.__show_report_button(display_handle=self.report_button_handle)
+            except (ImportError, NameError):
+                pass
+
     def _check_train_and_eval_target_distribution(self, y, eval_set_dict):
         uneven_distribution = False
         for eval_set in eval_set_dict.values():
@@ -1488,7 +1737,7 @@ class FeaturesEnricher(TransformerMixin):
     def _get_and_validate_client_cat_features(
         self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
     ) -> Tuple[Optional[List[str]], List[str]]:
-        cat_features =
+        cat_features = []
         search_keys_for_metrics = []
         if (
             estimator is not None
@@ -1535,7 +1784,7 @@ class FeaturesEnricher(TransformerMixin):
         is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
         is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
         checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
-        validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set)
+        validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set, silent=True)
 
         sampled_data = self._get_enriched_for_metrics(
             trace_id,
@@ -1549,7 +1798,7 @@ class FeaturesEnricher(TransformerMixin):
             progress_bar,
             progress_callback,
         )
-        X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = dataclasses.astuple(
+        (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming) = dataclasses.astuple(
             sampled_data
         )
@@ -1658,7 +1907,7 @@ class FeaturesEnricher(TransformerMixin):
             fitting_X, y_sorted, search_keys, self.model_task_type, sort_all_columns=True, logger=self.logger
         )
         fitting_X = fitting_X[fitting_x_columns]
-        fitting_X, _ = self._encode_id_columns(fitting_X
+        fitting_X, _ = self._encode_id_columns(fitting_X)
         self.logger.info(f"Final sorted list of fitting X columns: {fitting_x_columns}")
         fitting_enriched_x_columns = fitting_enriched_X.columns.to_list()
         fitting_enriched_x_columns = sort_columns(
@@ -1670,14 +1919,18 @@ class FeaturesEnricher(TransformerMixin):
             logger=self.logger,
         )
         fitting_enriched_X = fitting_enriched_X[fitting_enriched_x_columns]
-        fitting_enriched_X, _ = self._encode_id_columns(fitting_enriched_X
+        fitting_enriched_X, _ = self._encode_id_columns(fitting_enriched_X)
         self.logger.info(f"Final sorted list of fitting enriched X columns: {fitting_enriched_x_columns}")
+        date_column = self._get_date_column(search_keys)
+        eval_set_dates = {}
         for idx, eval_tuple in eval_set_sampled_dict.items():
             eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
             eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
             enriched_eval_X_sorted, enriched_eval_y_sorted = self._sort_by_system_record_id(
                 enriched_eval_X, eval_y_sampled, self.cv
             )
+            if date_column is not None:
+                eval_set_dates[idx] = eval_X_sorted[date_column]
             fitting_eval_X = eval_X_sorted[fitting_x_columns].copy()
             fitting_enriched_eval_X = enriched_eval_X_sorted[fitting_enriched_x_columns].copy()
 
@@ -1698,8 +1951,8 @@ class FeaturesEnricher(TransformerMixin):
             .astype(np.float64)
         )
 
-        fitting_eval_X, unknown_dict = self._encode_id_columns(fitting_eval_X
-        fitting_enriched_eval_X, _ = self._encode_id_columns(fitting_enriched_eval_X
+        fitting_eval_X, unknown_dict = self._encode_id_columns(fitting_eval_X)
+        fitting_enriched_eval_X, _ = self._encode_id_columns(fitting_enriched_eval_X)
 
         if len(unknown_dict) > 0:
             print(self.bundle.get("unknown_id_column_value_in_eval_set").format(unknown_dict))
@@ -1722,6 +1975,7 @@ class FeaturesEnricher(TransformerMixin):
             groups,
             cv,
             columns_renaming,
+            eval_set_dates,
         )
 
     @dataclass
@@ -1884,14 +2138,16 @@ class FeaturesEnricher(TransformerMixin):
         remove_outliers_calc_metrics: Optional[bool],
     ) -> _EnrichedDataForMetrics:
         eval_set_sampled_dict = {}
-        search_keys = self.fit_search_keys
+        search_keys = self.fit_search_keys.copy()
 
         rows_to_drop = None
         has_date = self._get_date_column(search_keys) is not None
         self.model_task_type = self.model_task_type or define_task(
             self.df_with_original_index[TARGET], has_date, self.logger, silent=True
        )
-        if
+        if remove_outliers_calc_metrics is None:
+            remove_outliers_calc_metrics = True
+        if self.model_task_type == ModelTaskType.REGRESSION and remove_outliers_calc_metrics:
             target_outliers_df = self._search_task.get_target_outliers(trace_id)
             if target_outliers_df is not None and len(target_outliers_df) > 0:
                 outliers = pd.merge(
@@ -1901,11 +2157,8 @@ class FeaturesEnricher(TransformerMixin):
                     how="inner",
                 )
                 top_outliers = outliers.sort_values(by=TARGET, ascending=False)[TARGET].head(3)
-                    not_msg = ""
-                else:
-                    not_msg = "not "
+                rows_to_drop = outliers
+                not_msg = ""
                 msg = self.bundle.get("target_outliers_warning").format(len(target_outliers_df), top_outliers, not_msg)
                 print(msg)
                 self.logger.warning(msg)
@@ -1963,12 +2216,13 @@ class FeaturesEnricher(TransformerMixin):
             enriched_eval_X = enriched_eval_sets[idx + 1][enriched_X_columns].copy()
             eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
 
-        reversed_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
-        X_sampled.rename(columns=
-        enriched_X.rename(columns=
+        # reversed_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
+        X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
+        enriched_X.rename(columns=self.fit_columns_renaming, inplace=True)
         for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
-            eval_X_sampled.rename(columns=
-            enriched_eval_X.rename(columns=
+            eval_X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
+            enriched_eval_X.rename(columns=self.fit_columns_renaming, inplace=True)
+        search_keys = {self.fit_columns_renaming.get(k, k): v for k, v in search_keys.items()}
 
         datasets_hash = hash_input(self.X, self.y, self.eval_set)
         return self.__cache_and_return_results(
@@ -2026,13 +2280,7 @@ class FeaturesEnricher(TransformerMixin):
             enriched_df, x_columns, enriched_X.columns.tolist(), len(eval_set) if has_eval_set else 0
         )
 
-        reversed_renaming = {v: k for k, v in columns_renaming.items()}
-        X_sampled.rename(columns=reversed_renaming, inplace=True)
-        enriched_X.rename(columns=reversed_renaming, inplace=True)
-        for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
-            eval_X_sampled.rename(columns=reversed_renaming, inplace=True)
-            enriched_eval_X.rename(columns=reversed_renaming, inplace=True)
+        search_keys = {columns_renaming.get(k, k): v for k, v in search_keys.items()}
 
         # Cache and return results
         datasets_hash = hash_input(validated_X, validated_y, eval_set)
@@ -2112,7 +2360,7 @@ class FeaturesEnricher(TransformerMixin):
 
     def __extract_eval_data(
         self, enriched_df: pd.DataFrame, x_columns: List[str], enriched_X_columns: List[str], eval_set_len: int
-    ) -> Dict[int, Tuple]:
+    ) -> Tuple[Dict[int, Tuple], Dict[int, pd.Series]]:
         eval_set_sampled_dict = {}
 
         for idx in range(eval_set_len):
@@ -2158,12 +2406,12 @@ class FeaturesEnricher(TransformerMixin):
         columns_renaming: Dict[str, str],
     ):
         # X_sampled - with hash-suffixes
-        reversed_renaming = {v: k for k, v in columns_renaming.items()}
-        search_keys = {
-        }
+        # reversed_renaming = {v: k for k, v in columns_renaming.items()}
+        # search_keys = {
+        #     reversed_renaming.get(k, k): v
+        #     for k, v in search_keys.items()
+        #     if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
+        # }
         return FeaturesEnricher._EnrichedDataForMetrics(
             X_sampled=X_sampled,
             y_sampled=y_sampled,
@@ -2313,7 +2561,7 @@ if response.status_code == 200:
         self.logger.info("Start transform")
 
         validated_X, validated_y, validated_eval_set = self._validate_train_eval(
-            X, y, eval_set=None, is_transform=True
+            X, y, eval_set=None, is_transform=True, silent=True
         )
         df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
 
@@ -2415,7 +2663,7 @@ if response.status_code == 200:
         else:
             self.logger.info("Input dataset hasn't date column")
             if self.__should_add_date_column():
-                df = self._add_current_date_as_key(df, search_keys, self.
+                df = self._add_current_date_as_key(df, search_keys, self.bundle, silent=True)
 
         email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
         if email_columns and self.generate_search_key_features:
@@ -2664,7 +2912,8 @@ if response.status_code == 200:
         selecting_columns = [
             c
             for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
-            if c not in self.zero_shap_client_features
+            if (c not in self.zero_shap_client_features and c not in self.unstable_client_features)
+            or c in (self.id_columns or [])
         ]
         selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
         if add_fit_system_record_id:
@@ -2798,6 +3047,7 @@ if response.status_code == 200:
         scoring: Union[Callable, str, None],
         estimator: Optional[Any],
         importance_threshold: Optional[float],
+        stability_threshold: float,
         max_features: Optional[int],
         remove_outliers_calc_metrics: Optional[bool],
         auto_fe_parameters: AutoFEParameters,
@@ -2812,6 +3062,7 @@ if response.status_code == 200:
         self.fit_columns_renaming = None
         self.fit_dropped_features = set()
         self.fit_generated_features = []
+        self.psi_values = None
 
         validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, eval_set)
 
@@ -2908,7 +3159,7 @@ if response.status_code == 200:
             self.logger.info("Input dataset hasn't date column")
             # TODO remove when this logic will be implemented on the back
             if self.__should_add_date_column():
-                df = self._add_current_date_as_key(df, self.fit_search_keys, self.
+                df = self._add_current_date_as_key(df, self.fit_search_keys, self.bundle)
 
         email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
         if email_columns and self.generate_search_key_features:
@@ -2923,10 +3174,13 @@ if response.status_code == 200:
         except Exception:
             self.logger.exception("Failed to check dates distribution validity")
 
+        self.__adjust_cv(df)
+
         if (
             is_numeric_dtype(df[self.TARGET_NAME])
             and self.model_task_type in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]
             and has_date
+            and (self.cv is None or not self.cv.is_time_series())
         ):
             self._validate_PSI(df.sort_values(by=maybe_date_column))
 
@@ -2958,7 +3212,15 @@ if response.status_code == 200:
 
         self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
 
-
+        # Group columns should have normalized names
+        if self.runtime_parameters.properties.get("cv_params.group_columns") is not None:
+            original_to_hash = {v: k for k, v in self.fit_columns_renaming.items()}
+            self.runtime_parameters.properties["cv_params.group_columns"] = ",".join(
+                [
+                    original_to_hash.get(c, c)
+                    for c in self.runtime_parameters.properties["cv_params.group_columns"].split(",")
+                ]
+            )
 
         if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
             id_columns = self.__get_renamed_id_columns()
@@ -3164,6 +3426,21 @@ if response.status_code == 200:
             display_id=f"autofe_descriptions_{uuid.uuid4()}",
         )
 
+        self._select_features_by_psi(
+            trace_id=trace_id,
+            X=X,
+            y=y,
+            eval_set=eval_set,
+            stability_threshold=stability_threshold,
+            cv=self.cv,
+            estimator=estimator,
+            exclude_features_sources=exclude_features_sources,
+            importance_threshold=importance_threshold,
+            max_features=max_features,
+            progress_bar=progress_bar,
+            progress_callback=progress_callback,
+        )
+
         if self._has_paid_features(exclude_features_sources):
             if calculate_metrics is not None and calculate_metrics:
                 msg = self.bundle.get("metrics_with_paid_features")
@@ -3250,20 +3527,23 @@ if response.status_code == 200:
         return None if self.id_columns is None else [reverse_renaming.get(c) or c for c in self.id_columns]
 
     def __adjust_cv(self, df: pd.DataFrame):
+        if self.cv is None:
+            date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
+            # Check Multivariate time series
+            if (
+                date_column
+                and self.model_task_type == ModelTaskType.REGRESSION
+                and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys()))
+                == 0
+                and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
+            ):
+                msg = self.bundle.get("multivariate_timeseries_detected")
+                self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
+            elif self.model_task_type != ModelTaskType.REGRESSION:
+                msg = self.bundle.get("group_k_fold_in_classification")
+                self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
+
+        if self.cv == CVType.group_k_fold:
             group_columns = self._get_group_columns(df, self.fit_search_keys)
             self.runtime_parameters.properties["cv_params.group_columns"] = ",".join(group_columns)
             self.runtime_parameters.properties["cv_params.shuffle_kfold"] = "True"
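The restored __adjust_cv body formalizes two automatic CV overrides: a date key plus a regression task with no person-level keys and blocked-series structure switches to blocked time-series CV, while classification tasks fall back to shuffled group k-fold grouped by the search-key columns. The grouping idea, sketched with scikit-learn's GroupKFold purely as an analogy (the enricher's actual splitting runs backend-side via runtime parameters; user_id is an illustrative group column):

from sklearn.model_selection import GroupKFold

gkf = GroupKFold(n_splits=5)
for train_idx, test_idx in gkf.split(X, y, groups=X["user_id"]):
    ...  # rows sharing a user_id never appear in both train and test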
@@ -3289,48 +3569,42 @@ if response.status_code == 200:
         y: Optional[pd.Series] = None,
         eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]] = None,
         is_transform: bool = False,
+        silent: bool = False,
     ) -> Tuple[pd.DataFrame, pd.Series, Optional[List[Tuple[pd.DataFrame, pd.Series]]]]:
         validated_X = self._validate_X(X, is_transform)
         validated_y = self._validate_y(validated_X, y, enforce_y=not is_transform)
-        validated_eval_set = self._validate_eval_set(validated_X, eval_set)
+        validated_eval_set = self._validate_eval_set(validated_X, eval_set, silent=silent)
         return validated_X, validated_y, validated_eval_set
 
     def _encode_id_columns(
         self,
         X: pd.DataFrame,
-        columns_renaming: Optional[Dict[str, str]] = None,
     ) -> Tuple[pd.DataFrame, Dict[str, List[Any]]]:
-        columns_renaming = columns_renaming or {}
         unknown_dict = {}
 
         if self.id_columns and self.id_columns_encoder is not None:
-            if len(unknown_dict) > 0:
-                self.logger.warning(f"Unknown values in id columns: {unknown_dict}")
+            encoding_id_columns = [c for c in self.id_columns if c in X.columns]
+            if len(encoding_id_columns) > 0:
+                self.logger.info(f"Convert id columns to int: {encoding_id_columns}")
+                encoded = self.id_columns_encoder.transform(X[encoding_id_columns])
+                for i, c in enumerate(encoding_id_columns):
+                    unknown_values = X[encoded[:, i] == -1][c].unique().tolist()
+                    if len(unknown_values) > 0:
+                        unknown_dict[c] = unknown_values
+                X[encoding_id_columns] = encoded
+                X = X.loc[(X[encoding_id_columns] != -1).all(axis=1)]
+
+        if len(unknown_dict) > 0:
+            self.logger.warning(f"Unknown values in id columns: {unknown_dict}")
 
         return X, unknown_dict
 
-    def _decode_id_columns(self, X: pd.DataFrame
-        columns_renaming = columns_renaming or {}
+    def _decode_id_columns(self, X: pd.DataFrame):
         if self.id_columns and self.id_columns_encoder is not None:
-            decoded = self.id_columns_encoder.inverse_transform(X[renamed_id_columns].rename(columns=columns_renaming))
-            X[renamed_id_columns] = decoded
+            decoding_id_columns = [c for c in self.id_columns if c in X.columns]
+            if len(decoding_id_columns) > 0:
+                decoded = self.id_columns_encoder.inverse_transform(X[self.id_columns])
+                X[self.id_columns] = decoded
 
         return X
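The rewritten _encode_id_columns/_decode_id_columns pair drops the columns_renaming indirection and works directly on whichever id_columns are present, treating -1 as the encoder's marker for unseen values and filtering those rows out. That contract matches scikit-learn's OrdinalEncoder with handle_unknown="use_encoded_value"; the encoder's construction is outside this diff, so the sketch below is an assumption about its configuration:

import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

train_ids = pd.DataFrame({"store_id": ["A", "B", "C"]})
new_ids = pd.DataFrame({"store_id": ["B", "D"]})   # "D" never seen at fit time

encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
encoder.fit(train_ids)

encoded = encoder.transform(new_ids)               # [[1.], [-1.]]
mask = (encoded != -1).all(axis=1)                 # keep only known ids, as _encode_id_columns does
print(new_ids.loc[~mask, "store_id"].tolist())     # ['D'] reported as unknown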
@@ -3424,10 +3698,30 @@ if response.status_code == 200:
 
         return validated_y
 
-    def _validate_eval_set(
+    def _validate_eval_set(
+        self, X: pd.DataFrame, eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]], silent: bool = False
+    ):
         if eval_set is None:
             return None
-
+        validated_eval_set = []
+        has_date = self._get_date_column(self.search_keys) is not None
+        for idx, eval_pair in enumerate(eval_set):
+            validated_pair = self._validate_eval_set_pair(X, eval_pair)
+            if validated_pair[1].isna().all():
+                if not has_date:
+                    msg = self.bundle.get("oot_without_date_not_supported").format(idx + 1)
+                elif self.columns_for_online_api:
+                    msg = self.bundle.get("oot_with_online_sources_not_supported").format(idx + 1)
+                else:
+                    msg = None
+                if msg:
+                    if not silent:
+                        print(msg)
+                    self.logger.warning(msg)
+                    continue
+            validated_eval_set.append(validated_pair)
+
+        return validated_eval_set
 
     def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
         if len(eval_pair) != 2:
@@ -3502,16 +3796,18 @@ if response.status_code == 200:
             raise ValidationError(self.bundle.get("unsupported_y_type_eval_set").format(type(eval_y)))
 
         eval_y_nunique = validated_eval_y.nunique()
-
+        is_oot = validated_eval_y.isna().all()
+        if not is_oot and eval_y_nunique < 2:
             raise ValidationError(self.bundle.get("y_is_constant_eval_set"))
 
-        if self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
+        if not is_oot and self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
             raise ValidationError(self.bundle.get("binary_target_eval_unique_count_not_2").format(eval_y_nunique))
 
-
+        if not is_oot:
+            # Check for duplicates between train and eval sets by comparing all values
+            train_eval_intersection = pd.merge(X, validated_eval_X, how="inner")
+            if len(train_eval_intersection) > 0:
+                raise ValidationError(self.bundle.get("eval_x_has_train_samples"))
 
         return validated_eval_X, validated_eval_y
 
@@ -3527,10 +3823,12 @@ if response.status_code == 200:
         if isinstance(eval_set, tuple):
             eval_set = [eval_set]
         for eval in eval_set:
-
+            is_oot = eval[1].isna().all()
+            if not is_oot:
+                if self.baseline_score_column not in eval[0].columns:
+                    raise ValidationError(self.bundle.get("baseline_score_column_not_exists"))
+                if eval[0][self.baseline_score_column].isna().any():
+                    raise ValidationError(self.bundle.get("baseline_score_column_has_na"))
 
     @staticmethod
     def _sample_X_and_y(X: pd.DataFrame, y: pd.Series, enriched_X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
@@ -3704,7 +4002,7 @@ if response.status_code == 200:
         return df
 
     def _add_current_date_as_key(
-        self, df: pd.DataFrame, search_keys: Dict[str, SearchKey],
+        self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], bundle: ResourceBundle, silent: bool = False
     ) -> pd.DataFrame:
         if (
             set(search_keys.values()) == {SearchKey.PHONE}
@@ -3712,7 +4010,8 @@ if response.status_code == 200:
             or set(search_keys.values()) == {SearchKey.HEM}
             or set(search_keys.values()) == {SearchKey.COUNTRY, SearchKey.POSTAL_CODE}
         ):
-
+            if not silent:
+                self.__log_warning(bundle.get("current_date_added"))
             df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
             search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
             converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, generate_cyclical_features=False)
@@ -3851,7 +4150,7 @@ if response.status_code == 200:
         columns_to_sort = [date_column] if date_column is not None else []
 
         do_sorting = True
-        if self.id_columns and self.cv.is_time_series():
+        if self.id_columns and self.cv is not None and self.cv.is_time_series():
             # Check duplicates by date and id_columns
             reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
             renamed_id_columns = [reversed_columns_renaming.get(c, c) for c in self.id_columns]
@@ -3978,6 +4277,17 @@ if response.status_code == 200:
 
         # TODO drop system_record_id before merge
         # Merge with result features
+        # Align dtypes for join key to avoid int/float merge warnings
+        if ENTITY_SYSTEM_RECORD_ID in input_df.columns and ENTITY_SYSTEM_RECORD_ID in result_features.columns:
+            input_is_float = pd.api.types.is_float_dtype(input_df[ENTITY_SYSTEM_RECORD_ID])
+            result_is_float = pd.api.types.is_float_dtype(result_features[ENTITY_SYSTEM_RECORD_ID])
+            if input_is_float or result_is_float:
+                input_df[ENTITY_SYSTEM_RECORD_ID] = pd.to_numeric(
+                    input_df[ENTITY_SYSTEM_RECORD_ID], errors="coerce"
+                ).astype("float64")
+                result_features[ENTITY_SYSTEM_RECORD_ID] = pd.to_numeric(
+                    result_features[ENTITY_SYSTEM_RECORD_ID], errors="coerce"
+                ).astype("float64")
         result_features = pd.merge(
             input_df,
             result_features,
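The new guard exists because, as its comment says, pandas merges on mixed int/float keys can warn or behave surprisingly when one side picked up NaN (and therefore float64). Casting both sides to one dtype keeps the ENTITY_SYSTEM_RECORD_ID join exact; a toy repro of the idea with hypothetical frames:

import pandas as pd

left = pd.DataFrame({"key": [1, 2, 3]})                      # int64 after a clean read
right = pd.DataFrame({"key": [1.0, 2.0], "feat": [10, 20]})  # float64, e.g. after NaNs appeared

left["key"] = pd.to_numeric(left["key"], errors="coerce").astype("float64")
merged = left.merge(right, on="key", how="left")             # both sides now share one dtype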
@@ -4047,7 +4357,11 @@ if response.status_code == 200:
         return [f.name for f in features_meta if f.type == "categorical" and f.name not in (self.id_columns or [])]
 
     def __prepare_feature_importances(
-        self,
+        self,
+        trace_id: str,
+        clients_features_df: pd.DataFrame,
+        updated_shaps: Optional[Dict[str, float]] = None,
+        silent=False,
     ):
         if self._search_task is None:
             raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
@@ -4060,11 +4374,12 @@ if response.status_code == 200:
         features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
 
         # To be sure that names with hash suffixes
-
+        clients_features_df = clients_features_df.rename(columns=original_names_dict)
 
         self.feature_names_ = []
         self.external_source_feature_names = []
         self.zero_shap_client_features = []
+        self.unstable_client_features = []
         self.feature_importances_ = []
         features_info = []
         features_info_without_links = []
@@ -4073,10 +4388,10 @@ if response.status_code == 200:
         original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}
 
         for feature_meta in features_meta:
-
+            original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
+            feature_meta.name = original_name
 
-            is_client_feature =
+            is_client_feature = original_name in clients_features_df.columns
 
             # Show and update shap values for client features only if select_features is True
             if updated_shaps is not None and (not is_client_feature or self.fit_select_features):
@@ -4093,12 +4408,21 @@ if response.status_code == 200:
 
         for feature_meta in features_meta:
             original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
-            is_client_feature = original_name in
+            is_client_feature = original_name in clients_features_df.columns
 
             if not is_client_feature:
                 self.external_source_feature_names.append(original_name)
 
+            if self.psi_values is not None:
+                if original_name in self.psi_values:
+                    feature_meta.psi_value = self.psi_values[original_name]
+                else:
+                    if is_client_feature and self.fit_select_features:
+                        self.unstable_client_features.append(original_name)
+                    continue
+
             # TODO make a decision about selected features based on special flag from mlb
+
             if original_shaps.get(feature_meta.name, 0.0) == 0.0:
                 if is_client_feature and self.fit_select_features:
                     self.zero_shap_client_features.append(original_name)
@@ -4122,7 +4446,7 @@ if response.status_code == 200:
             self.feature_names_.append(feature_meta.name)
             self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
 
-            df_for_sample = features_df if feature_meta.name in features_df.columns else
+            df_for_sample = features_df if feature_meta.name in features_df.columns else clients_features_df
             feature_info = FeatureInfo.from_metadata(feature_meta, df_for_sample, is_client_feature)
             features_info.append(feature_info.to_row(self.bundle))
             features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
@@ -4130,6 +4454,8 @@ if response.status_code == 200:
 
         if len(features_info) > 0:
             self.features_info = pd.DataFrame(features_info)
+            if self.features_info[self.bundle.get("features_info_psi")].isna().all():
+                self.features_info.drop(columns=[self.bundle.get("features_info_psi")], inplace=True)
             self._features_info_without_links = pd.DataFrame(features_info_without_links)
             self._internal_features_info = pd.DataFrame(internal_features_info)
             if not silent: