upgini 1.2.113a4__py3-none-any.whl → 1.2.113a3974.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/autofe/date.py +2 -0
- upgini/features_enricher.py +118 -423
- upgini/metadata.py +0 -1
- upgini/metrics.py +1 -4
- upgini/resource_bundle/strings.properties +1 -4
- upgini/sampler/base.py +1 -3
- upgini/sampler/random_under_sampler.py +8 -18
- upgini/utils/deduplicate_utils.py +7 -43
- upgini/utils/feature_info.py +0 -5
- upgini/utils/psi.py +0 -268
- {upgini-1.2.113a4.dist-info → upgini-1.2.113a3974.dev1.dist-info}/METADATA +1 -1
- {upgini-1.2.113a4.dist-info → upgini-1.2.113a3974.dev1.dist-info}/RECORD +14 -15
- {upgini-1.2.113a4.dist-info → upgini-1.2.113a3974.dev1.dist-info}/WHEEL +1 -1
- {upgini-1.2.113a4.dist-info → upgini-1.2.113a3974.dev1.dist-info}/licenses/LICENSE +0 -0
upgini/features_enricher.py
CHANGED
@@ -112,7 +112,6 @@ except Exception:
         CustomFallbackProgressBar as ProgressBar,
     )
 
-from upgini.utils.psi import calculate_features_psi
 from upgini.utils.sample_utils import SampleColumns, SampleConfig, _num_samples, sample
 from upgini.utils.sort import sort_columns
 from upgini.utils.target_utils import calculate_psi, define_task
@@ -298,9 +297,7 @@ class FeaturesEnricher(TransformerMixin):
         self.feature_names_ = []
         self.external_source_feature_names = []
         self.zero_shap_client_features = []
-        self.unstable_client_features = []
         self.feature_importances_ = []
-        self.psi_values: Optional[Dict[str, float]] = None
         self.search_id = search_id
         self.disable_force_downsampling = disable_force_downsampling
         self.print_trace_id = print_trace_id
@@ -401,26 +398,13 @@ class FeaturesEnricher(TransformerMixin):
     @staticmethod
     def _check_eval_set(eval_set, X, bundle: ResourceBundle):
         checked_eval_set = []
-        if eval_set is None:
-            return checked_eval_set
-        if isinstance(eval_set, tuple):
+        if eval_set is not None and isinstance(eval_set, tuple):
             eval_set = [eval_set]
-        if not isinstance(eval_set, list):
+        if eval_set is not None and not isinstance(eval_set, list):
            raise ValidationError(bundle.get("unsupported_type_eval_set").format(type(eval_set)))
         for eval_pair in eval_set or []:
-            # Handle OOT
-            if isinstance(eval_pair, pd.DataFrame):
-                empty_target = pd.Series([np.nan] * len(eval_pair), index=eval_pair.index)
-                eval_pair = (eval_pair, empty_target)
-            elif isinstance(eval_pair, tuple) and len(eval_pair) == 1:
-                empty_target = pd.Series([np.nan] * len(eval_pair[0]), index=eval_pair[0].index)
-                eval_pair = (eval_pair[0], empty_target)
-
             if not isinstance(eval_pair, tuple) or len(eval_pair) != 2:
                 raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
-            if eval_pair[1] is None:
-                empty_target = pd.Series([np.nan] * len(eval_pair[0]), index=eval_pair[0].index)
-                eval_pair = (eval_pair[0], empty_target)
             if not is_frames_equal(X, eval_pair[0], bundle):
                 checked_eval_set.append(eval_pair)
         return checked_eval_set
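
With the OOT handling gone, `_check_eval_set` is back to accepting only `(X_eval, y_eval)` pairs: a bare tuple, a list of tuples, or `None`. A standalone sketch of the accepted shapes (plain `ValueError`s stand in for upgini's `ValidationError` and resource bundle, and `DataFrame.equals` stands in for `is_frames_equal`):

```python
import pandas as pd

def check_eval_set(eval_set, X):
    """Sketch: wrap a bare tuple, reject non-lists, require exact (X_eval, y_eval) pairs."""
    checked = []
    if eval_set is not None and isinstance(eval_set, tuple):
        eval_set = [eval_set]
    if eval_set is not None and not isinstance(eval_set, list):
        raise ValueError(f"Unsupported eval_set type: {type(eval_set)}")
    for pair in eval_set or []:
        if not isinstance(pair, tuple) or len(pair) != 2:
            raise ValueError(f"eval_set entry must be a 2-tuple, got {len(pair)} element(s)")
        if not X.equals(pair[0]):  # drop eval sets identical to the train frame
            checked.append(pair)
    return checked

X = pd.DataFrame({"f": [1, 2]})
eval_pair = (pd.DataFrame({"f": [3, 4]}), pd.Series([0, 1]))
assert check_eval_set(eval_pair, X) == [eval_pair]    # bare tuple is wrapped
assert check_eval_set([eval_pair], X) == [eval_pair]  # list of tuples passes through
assert check_eval_set(None, X) == []                  # None means no eval sets
```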
@@ -442,7 +426,6 @@ class FeaturesEnricher(TransformerMixin):
         search_id_callback: Optional[Callable[[str], Any]] = None,
         select_features: bool = True,
         auto_fe_parameters: Optional[AutoFEParameters] = None,
-        stability_threshold: float = 0.15,
         **kwargs,
     ):
         """Fit to data.
@@ -532,7 +515,6 @@ class FeaturesEnricher(TransformerMixin):
             estimator=estimator,
             scoring=scoring,
             importance_threshold=importance_threshold,
-            stability_threshold=stability_threshold,
             max_features=max_features,
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
             auto_fe_parameters=auto_fe_parameters,
@@ -592,7 +574,6 @@ class FeaturesEnricher(TransformerMixin):
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
         select_features: bool = True,
         auto_fe_parameters: Optional[AutoFEParameters] = None,
-        stability_threshold: float = 0.15,
         **kwargs,
     ) -> pd.DataFrame:
         """Fit to data, then transform it.
@@ -637,10 +618,6 @@ class FeaturesEnricher(TransformerMixin):
             If True, return only selected features both from input and data sources.
             Otherwise, return all features from input and only selected features from data sources.
 
-        stability_threshold: float, optional (default=0.15)
-            Stability threshold for selected features PSI calculation. If PSI is less than this threshold,
-            then feature will be dropped.
-
         Returns
         -------
         X_new: pandas.DataFrame of shape (n_samples, n_features_new)
@@ -697,7 +674,6 @@ class FeaturesEnricher(TransformerMixin):
             scoring=scoring,
             estimator=estimator,
             importance_threshold=importance_threshold,
-            stability_threshold=stability_threshold,
             max_features=max_features,
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
             auto_fe_parameters=auto_fe_parameters,
@@ -965,7 +941,7 @@ class FeaturesEnricher(TransformerMixin):
             raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
 
         validated_X, validated_y, validated_eval_set = self._validate_train_eval(
-            effective_X, effective_y, effective_eval_set
+            effective_X, effective_y, effective_eval_set
         )
 
         if self.X is None:
@@ -1003,31 +979,29 @@ class FeaturesEnricher(TransformerMixin):
             return None
 
         cat_features_from_backend = self.__get_categorical_features()
-        # Convert to original names
-        cat_features_from_backend = [self.fit_columns_renaming.get(c, c) for c in cat_features_from_backend]
         client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
             estimator, validated_X, self.search_keys
         )
-        # Exclude id columns from cat_features
         if self.id_columns and self.id_columns_encoder is not None:
             if cat_features_from_backend:
                 cat_features_from_backend = [
                     c
                     for c in cat_features_from_backend
-                    if c not in self.id_columns_encoder.feature_names_in_
+                    if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
                 ]
             if client_cat_features:
                 client_cat_features = [
                     c
                     for c in client_cat_features
-                    if c not in self.id_columns_encoder.feature_names_in_
+                    if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
                 ]
         for cat_feature in cat_features_from_backend:
-            if cat_feature in self.search_keys:
-                if self.search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
-                    search_keys_for_metrics.append(cat_feature)
+            original_cat_feature = self.fit_columns_renaming.get(cat_feature)
+            if original_cat_feature in self.search_keys:
+                if self.search_keys[original_cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
+                    search_keys_for_metrics.append(original_cat_feature)
                 else:
-                    self.logger.warning(self.bundle.get("cat_feature_search_key").format(cat_feature))
+                    self.logger.warning(self.bundle.get("cat_feature_search_key").format(original_cat_feature))
         search_keys_for_metrics.extend([c for c in self.id_columns or [] if c not in search_keys_for_metrics])
         self.logger.info(f"Search keys for metrics: {search_keys_for_metrics}")
 
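
The pattern running through this hunk: the backend operates on sanitized, hash-suffixed column names, while `fit_columns_renaming` maps each internal name back to the client's original one. The new code keeps the internal names and translates only inside the membership tests. A toy illustration of the mapping direction (all names invented):

```python
# fit_columns_renaming maps internal (hash-suffixed) names -> original client names
fit_columns_renaming = {"country_a1b2": "country", "postal_c3d4": "postal_code"}
id_encoder_inputs = ["postal_code"]  # the id-column encoder was fit on original names

cat_features_from_backend = ["country_a1b2", "postal_c3d4", "merchant_e5f6"]

# keep the internal names; translate only for the membership test
cat_features_from_backend = [
    c for c in cat_features_from_backend
    if fit_columns_renaming.get(c, c) not in id_encoder_inputs
]
assert cat_features_from_backend == ["country_a1b2", "merchant_e5f6"]
```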
@@ -1059,9 +1033,23 @@ class FeaturesEnricher(TransformerMixin):
             groups,
             _cv,
             columns_renaming,
-            _,
         ) = prepared_data
 
+        # rename cat_features
+        if client_cat_features:
+            for new_c, old_c in columns_renaming.items():
+                if old_c in client_cat_features:
+                    client_cat_features.remove(old_c)
+                    client_cat_features.append(new_c)
+            for cat_feature in client_cat_features:
+                if cat_feature not in fitting_X.columns:
+                    self.logger.error(
+                        f"Client cat_feature `{cat_feature}` not found in"
+                        f" x columns: {fitting_X.columns.to_list()}"
+                    )
+        else:
+            client_cat_features = []
+
         # rename baseline_score_column
         reversed_renaming = {v: k for k, v in columns_renaming.items()}
         baseline_score_column = self.baseline_score_column
@@ -1086,9 +1074,9 @@ class FeaturesEnricher(TransformerMixin):
         self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
 
         has_date = self._get_date_column(search_keys) is not None
+        has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
         model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
         cat_features = list(set(client_cat_features + cat_features_from_backend))
-        has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
         baseline_cat_features = [f for f in cat_features if f in fitting_X.columns]
         enriched_cat_features = [f for f in cat_features if f in fitting_enriched_X.columns]
         if len(enriched_cat_features) < len(cat_features):
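
The relocated `has_time` line is unchanged by the move, but its precedence is easy to misread: Python's `and` binds tighter than `or`, so the expression parses as `(has_date and isinstance(_cv, TimeSeriesSplit)) or isinstance(_cv, BlockedTimeSeriesSplit)`, meaning a blocked time-series CV counts as temporal even without a date column. A quick check with plain booleans standing in for the `isinstance` calls:

```python
# precedence illustration: `and` binds tighter than `or`
has_date, is_ts, is_blocked_ts = False, False, True
assert (has_date and is_ts or is_blocked_ts) is True  # parses as (a and b) or c
```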
@@ -1208,6 +1196,8 @@ class FeaturesEnricher(TransformerMixin):
             # max_initial_eval_set_hit_rate = self._search_task.get_max_initial_eval_set_hit_rate_v2()
             if len(fitting_eval_set_dict) > 0:
                 for idx in fitting_eval_set_dict.keys():
+                    # eval_hit_rate = max_initial_eval_set_hit_rate[idx + 1]
+
                     (
                         eval_X_sorted,
                         eval_y_sorted,
@@ -1215,10 +1205,6 @@ class FeaturesEnricher(TransformerMixin):
                         enriched_eval_y_sorted,
                     ) = fitting_eval_set_dict[idx]
 
-                    if eval_y_sorted.isna().all():
-                        # Skip OOT eval set
-                        continue
-
                     if baseline_estimator is not None:
                         self.logger.info(
                             f"Calculate baseline {metric} on eval set {idx + 1} "
@@ -1261,14 +1247,17 @@ class FeaturesEnricher(TransformerMixin):
                             "quality_metrics_eval_segment"
                         ).format(idx + 1),
                         self.bundle.get("quality_metrics_rows_header"): _num_samples(
+                            # effective_eval_set[idx][0]
                             # Use actually used for metrics dataset
                             eval_X_sorted
                         ),
+                        # self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
                     }
                     if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
                         eval_y_sorted
                     ):
                         eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
+                            # np.mean(validated_eval_set[idx][1]), 4
                             # Use actually used for metrics dataset
                             np.mean(eval_y_sorted),
                             4,
@@ -1290,7 +1279,7 @@ class FeaturesEnricher(TransformerMixin):
                     metrics.append(eval_metrics)
 
             if updating_shaps is not None:
-                decoded_X = self._decode_id_columns(fitting_X)
+                decoded_X = self._decode_id_columns(fitting_X, columns_renaming)
                 self._update_shap_values(trace_id, decoded_X, updating_shaps, silent=not internal_call)
 
             metrics_df = pd.DataFrame(metrics)
@@ -1341,188 +1330,6 @@ class FeaturesEnricher(TransformerMixin):
         finally:
             self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
 
-    def _select_features_by_psi(
-        self,
-        trace_id: str,
-        X: Union[pd.DataFrame, pd.Series, np.ndarray],
-        y: Union[pd.DataFrame, pd.Series, np.ndarray, List],
-        eval_set: Optional[Union[List[tuple], tuple]],
-        stability_threshold: float,
-        cv: Union[BaseCrossValidator, CVType, str, None] = None,
-        estimator=None,
-        exclude_features_sources: Optional[List[str]] = None,
-        importance_threshold: Optional[float] = None,
-        max_features: Optional[int] = None,
-        progress_bar: bool = True,
-        progress_callback: Optional[Callable] = None,
-    ):
-        search_keys = self.search_keys.copy()
-        validated_X, _, validated_eval_set = self._validate_train_eval(X, y, eval_set, silent=True)
-        if isinstance(X, np.ndarray):
-            search_keys = {str(k): v for k, v in search_keys.items()}
-
-        date_column = self._get_date_column(search_keys)
-        has_date = date_column is not None
-        if not has_date:
-            self.logger.info("No date column for OOT PSI calculation")
-            return
-        if not validated_eval_set:
-            self.logger.info("No eval set for OOT PSI calculation")
-            return
-        if validated_X[date_column].nunique() <= 1:
-            self.logger.warning("Constant date for OOT PSI calculation")
-            return
-        if self.cv is not None and self.cv.is_time_series():
-            self.logger.warning("Time series CV is not supported for OOT PSI calculation")
-            return
-
-        cat_features_from_backend = self.__get_categorical_features()
-        cat_features_from_backend = [self.fit_columns_renaming.get(c, c) for c in cat_features_from_backend]
-        client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
-            estimator, validated_X, search_keys
-        )
-        if self.id_columns and self.id_columns_encoder is not None:
-            if cat_features_from_backend:
-                cat_features_from_backend = [
-                    c
-                    for c in cat_features_from_backend
-                    if c not in self.id_columns_encoder.feature_names_in_
-                ]
-            if client_cat_features:
-                client_cat_features = [
-                    c
-                    for c in client_cat_features
-                    if c not in self.id_columns_encoder.feature_names_in_
-                ]
-
-        prepared_data = self._prepare_data_for_metrics(
-            trace_id=trace_id,
-            X=X,
-            y=y,
-            eval_set=eval_set,
-            exclude_features_sources=exclude_features_sources,
-            importance_threshold=importance_threshold,
-            max_features=max_features,
-            remove_outliers_calc_metrics=False,
-            cv_override=cv,
-            search_keys_for_metrics=search_keys_for_metrics,
-            progress_bar=progress_bar,
-            progress_callback=progress_callback,
-            client_cat_features=client_cat_features,
-        )
-        if prepared_data is None:
-            return None
-
-        (
-            validated_X,
-            fitting_X,
-            y_sorted,
-            fitting_enriched_X,
-            _,
-            fitting_eval_set_dict,
-            _,
-            _,
-            _,
-            columns_renaming,
-            eval_set_dates,
-        ) = prepared_data
-
-        model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
-        cat_features = list(set(client_cat_features + cat_features_from_backend))
-
-        # Drop unstable features
-        unstable_features = self._check_stability(
-            validated_X,
-            validated_eval_set,
-            fitting_eval_set_dict,
-            eval_set_dates,
-            search_keys,
-            stability_threshold,
-            cat_features,
-            model_task_type,
-        )
-        client_features_df = self.df_with_original_index.rename(columns=columns_renaming)
-        # decoded_X = self._decode_id_columns(fitting_X, columns_renaming)
-        self._update_report_psi(trace_id, client_features_df)
-
-        if unstable_features:
-            msg = f"Some features are unstable: {unstable_features} and will be dropped"
-            self.logger.warning(msg)
-            print(msg)
-            fitting_X = fitting_X.drop(columns=unstable_features, errors="ignore")
-            fitting_enriched_X = fitting_enriched_X.drop(columns=unstable_features, errors="ignore")
-            msg = f"Threre are {len(fitting_enriched_X.columns)} stable selected features left"
-            self.logger.info(msg)
-            print(msg)
-            for idx, (
-                eval_X,
-                eval_y,
-                eval_enriched_X,
-                eval_enriched_y,
-            ) in fitting_eval_set_dict.items():
-                eval_X = eval_X.drop(columns=unstable_features, errors="ignore")
-                eval_enriched_X = eval_enriched_X.drop(columns=unstable_features, errors="ignore")
-                fitting_eval_set_dict[idx] = (eval_X, eval_y, eval_enriched_X, eval_enriched_y)
-
-    def _check_stability(
-        self,
-        X: pd.DataFrame,
-        eval_set: List[Tuple[pd.DataFrame, pd.Series]],
-        enriched_eval_set: Dict,
-        eval_set_dates: Dict[int, pd.Series],
-        search_keys: Dict[str, SearchKey],
-        stability_threshold: float,
-        cat_features: List[str],
-        model_task_type: ModelTaskType,
-    ) -> List[str]:
-        # Find latest eval set or earliest if all eval sets are before train set
-        date_column = self._get_date_column(search_keys)
-
-        # Get minimum date from main dataset X
-        main_min_date = X[date_column].min()
-
-        # Find minimum date for each eval_set and compare with main dataset
-        eval_dates = []
-        for i, (eval_x, _) in enumerate(eval_set):
-            if date_column in eval_x.columns:
-                eval_min_date = eval_x[date_column].min()
-                eval_max_date = eval_x[date_column].max()
-                eval_dates.append((i, eval_min_date, eval_max_date))
-
-        if not eval_dates:
-            return []
-
-        # Check if any eval_set has minimum date >= main dataset minimum date
-        later_eval_sets = [(i, min_date, max_date) for i, min_date, max_date in eval_dates if min_date >= main_min_date]
-
-        if later_eval_sets:
-            # If there are eval_sets with date >= main date, choose the one with highest maximum date
-            selected_eval_set_idx = max(later_eval_sets, key=lambda x: x[2])[0]
-        else:
-            # If all eval_sets have dates < main date, choose the one with lowest minimux date
-            selected_eval_set_idx = max(eval_dates, key=lambda x: x[1])[0]
-
-        checking_eval_set = enriched_eval_set[selected_eval_set_idx]
-
-        checking_eval_set_df = (
-            checking_eval_set[2]
-            if checking_eval_set[1] is None or checking_eval_set[1].isna().all()
-            else pd.concat([checking_eval_set[2], checking_eval_set[1].to_frame(TARGET)], axis=1)
-        )
-        checking_eval_set_df = checking_eval_set_df.copy()
-
-        checking_eval_set_df[date_column] = eval_set_dates[selected_eval_set_idx]
-
-        psi_values = calculate_features_psi(
-            checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
-        )
-
-        self.psi_values = {
-            feature: psi_value for feature, psi_value in psi_values.items() if psi_value <= stability_threshold
-        }
-
-        return [feature for feature, psi in psi_values.items() if psi > stability_threshold]
-
     def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float], silent: bool = False):
         renaming = self.fit_columns_renaming or {}
         self.logger.info(f"Updating SHAP values: {new_shaps}")
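
For reference, since this version deletes both `_select_features_by_psi` above and the whole `upgini/utils/psi.py` module: the quantity the removed code thresholded is the standard population stability index, PSI = sum over bins of (actual_share - expected_share) * ln(actual_share / expected_share). A minimal self-contained sketch of that formula, not the removed `calculate_features_psi` implementation:

```python
import numpy as np
import pandas as pd

def psi(expected: pd.Series, actual: pd.Series, bins: int = 10, eps: float = 1e-6) -> float:
    """Population Stability Index between two numeric samples over shared bins."""
    edges = np.histogram_bin_edges(pd.concat([expected, actual]).dropna(), bins=bins)
    e_share = np.histogram(expected.dropna(), bins=edges)[0] / max(len(expected), 1)
    a_share = np.histogram(actual.dropna(), bins=edges)[0] / max(len(actual), 1)
    e_share = np.clip(e_share, eps, None)  # avoid log(0) on empty bins
    a_share = np.clip(a_share, eps, None)
    return float(np.sum((a_share - e_share) * np.log(a_share / e_share)))

rng = np.random.default_rng(0)
train = pd.Series(rng.normal(0, 1, 5_000))
shifted = pd.Series(rng.normal(0.5, 1, 5_000))
assert psi(train, train.sample(2_500, random_state=1)) < 0.1  # same distribution: stable
assert psi(train, shifted) > 0.15  # shifted: would exceed the old default threshold
```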
@@ -1578,56 +1385,6 @@ class FeaturesEnricher(TransformerMixin):
             except (ImportError, NameError):
                 pass
 
-    def _update_report_psi(self, trace_id: str, clients_features_df: pd.DataFrame):
-        self.__prepare_feature_importances(trace_id, clients_features_df)
-
-        if self.features_info_display_handle is not None:
-            try:
-                _ = get_ipython()  # type: ignore
-
-                display_html_dataframe(
-                    self.features_info,
-                    self._features_info_without_links,
-                    self.bundle.get("relevant_features_header"),
-                    display_handle=self.features_info_display_handle,
-                )
-            except (ImportError, NameError):
-                pass
-
-        if self.data_sources_display_handle is not None:
-            try:
-                _ = get_ipython()  # type: ignore
-
-                display_html_dataframe(
-                    self.relevant_data_sources,
-                    self._relevant_data_sources_wo_links,
-                    self.bundle.get("relevant_data_sources_header"),
-                    display_handle=self.data_sources_display_handle,
-                )
-            except (ImportError, NameError):
-                pass
-
-        if self.autofe_features_display_handle is not None:
-            try:
-                _ = get_ipython()  # type: ignore
-                autofe_descriptions_df = self.get_autofe_features_description()
-                if autofe_descriptions_df is not None:
-                    display_html_dataframe(
-                        df=autofe_descriptions_df,
-                        internal_df=autofe_descriptions_df,
-                        header=self.bundle.get("autofe_descriptions_header"),
-                        display_handle=self.autofe_features_display_handle,
-                    )
-            except (ImportError, NameError):
-                pass
-        if self.report_button_handle is not None:
-            try:
-                _ = get_ipython()  # type: ignore
-
-                self.__show_report_button(display_handle=self.report_button_handle)
-            except (ImportError, NameError):
-                pass
-
     def _check_train_and_eval_target_distribution(self, y, eval_set_dict):
         uneven_distribution = False
         for eval_set in eval_set_dict.values():
@@ -1731,7 +1488,7 @@ class FeaturesEnricher(TransformerMixin):
     def _get_and_validate_client_cat_features(
         self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
     ) -> Tuple[Optional[List[str]], List[str]]:
-        cat_features =
+        cat_features = None
         search_keys_for_metrics = []
         if (
             estimator is not None
@@ -1778,7 +1535,7 @@ class FeaturesEnricher(TransformerMixin):
         is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
         is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
         checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
-        validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set
+        validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set)
 
         sampled_data = self._get_enriched_for_metrics(
             trace_id,
@@ -1792,7 +1549,7 @@ class FeaturesEnricher(TransformerMixin):
             progress_bar,
             progress_callback,
         )
-
+        X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = dataclasses.astuple(
             sampled_data
         )
 
@@ -1815,7 +1572,8 @@ class FeaturesEnricher(TransformerMixin):
                 or c in set(self.feature_names_).union(self.id_columns or [])
                 or (self.fit_columns_renaming or {}).get(c, c) in set(self.feature_names_).union(self.id_columns or [])
             )
-            and c not in (
+            and c
+            not in (
                 excluding_search_keys
                 + list(self.fit_dropped_features)
                 + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
@@ -1900,7 +1658,7 @@ class FeaturesEnricher(TransformerMixin):
             fitting_X, y_sorted, search_keys, self.model_task_type, sort_all_columns=True, logger=self.logger
         )
         fitting_X = fitting_X[fitting_x_columns]
-        fitting_X, _ = self._encode_id_columns(fitting_X)
+        fitting_X, _ = self._encode_id_columns(fitting_X, self.fit_columns_renaming)
         self.logger.info(f"Final sorted list of fitting X columns: {fitting_x_columns}")
         fitting_enriched_x_columns = fitting_enriched_X.columns.to_list()
         fitting_enriched_x_columns = sort_columns(
@@ -1912,18 +1670,14 @@ class FeaturesEnricher(TransformerMixin):
             logger=self.logger,
         )
         fitting_enriched_X = fitting_enriched_X[fitting_enriched_x_columns]
-        fitting_enriched_X, _ = self._encode_id_columns(fitting_enriched_X)
+        fitting_enriched_X, _ = self._encode_id_columns(fitting_enriched_X, self.fit_columns_renaming)
         self.logger.info(f"Final sorted list of fitting enriched X columns: {fitting_enriched_x_columns}")
-        date_column = self._get_date_column(search_keys)
-        eval_set_dates = {}
         for idx, eval_tuple in eval_set_sampled_dict.items():
             eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
             eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
             enriched_eval_X_sorted, enriched_eval_y_sorted = self._sort_by_system_record_id(
                 enriched_eval_X, eval_y_sampled, self.cv
             )
-            if date_column is not None:
-                eval_set_dates[idx] = eval_X_sorted[date_column]
             fitting_eval_X = eval_X_sorted[fitting_x_columns].copy()
             fitting_enriched_eval_X = enriched_eval_X_sorted[fitting_enriched_x_columns].copy()
 
@@ -1944,8 +1698,8 @@ class FeaturesEnricher(TransformerMixin):
                 .astype(np.float64)
             )
 
-            fitting_eval_X, unknown_dict = self._encode_id_columns(fitting_eval_X)
-            fitting_enriched_eval_X, _ = self._encode_id_columns(fitting_enriched_eval_X)
+            fitting_eval_X, unknown_dict = self._encode_id_columns(fitting_eval_X, self.fit_columns_renaming)
+            fitting_enriched_eval_X, _ = self._encode_id_columns(fitting_enriched_eval_X, self.fit_columns_renaming)
 
             if len(unknown_dict) > 0:
                 print(self.bundle.get("unknown_id_column_value_in_eval_set").format(unknown_dict))
@@ -1968,7 +1722,6 @@ class FeaturesEnricher(TransformerMixin):
             groups,
             cv,
             columns_renaming,
-            eval_set_dates,
         )
 
     @dataclass
@@ -2131,16 +1884,14 @@ class FeaturesEnricher(TransformerMixin):
         remove_outliers_calc_metrics: Optional[bool],
     ) -> _EnrichedDataForMetrics:
         eval_set_sampled_dict = {}
-        search_keys = self.fit_search_keys
+        search_keys = self.fit_search_keys
 
         rows_to_drop = None
         has_date = self._get_date_column(search_keys) is not None
         self.model_task_type = self.model_task_type or define_task(
             self.df_with_original_index[TARGET], has_date, self.logger, silent=True
         )
-        if remove_outliers_calc_metrics is None:
-            remove_outliers_calc_metrics = True
-        if self.model_task_type == ModelTaskType.REGRESSION and remove_outliers_calc_metrics:
+        if self.model_task_type == ModelTaskType.REGRESSION:
             target_outliers_df = self._search_task.get_target_outliers(trace_id)
             if target_outliers_df is not None and len(target_outliers_df) > 0:
                 outliers = pd.merge(
@@ -2150,8 +1901,11 @@ class FeaturesEnricher(TransformerMixin):
                     how="inner",
                 )
                 top_outliers = outliers.sort_values(by=TARGET, ascending=False)[TARGET].head(3)
-
-
+                if remove_outliers_calc_metrics is None or remove_outliers_calc_metrics is True:
+                    rows_to_drop = outliers
+                    not_msg = ""
+                else:
+                    not_msg = "not "
                 msg = self.bundle.get("target_outliers_warning").format(len(target_outliers_df), top_outliers, not_msg)
                 print(msg)
                 self.logger.warning(msg)
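
The rewritten outlier block folds the old "None means True" normalization into a single condition: outlier rows are dropped unless the caller explicitly passed `remove_outliers_calc_metrics=False`, and the warning reports whether rows were or were "not " removed. A condensed sketch of that control flow (function name and message text are illustrative, not upgini's bundle string):

```python
def outlier_handling(remove_outliers_calc_metrics, outliers):
    """None or True -> drop outlier rows; False -> keep them but still warn."""
    rows_to_drop = None
    if remove_outliers_calc_metrics is None or remove_outliers_calc_metrics is True:
        rows_to_drop = outliers
        not_msg = ""
    else:
        not_msg = "not "
    return rows_to_drop, f"{len(outliers)} target outliers were {not_msg}removed"

assert outlier_handling(None, [1, 2])[0] == [1, 2]  # default behaviour drops
assert outlier_handling(True, [1, 2])[1] == "2 target outliers were removed"
assert outlier_handling(False, [1, 2]) == (None, "2 target outliers were not removed")
```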
@@ -2209,13 +1963,12 @@ class FeaturesEnricher(TransformerMixin):
             enriched_eval_X = enriched_eval_sets[idx + 1][enriched_X_columns].copy()
             eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
 
-
-        X_sampled.rename(columns=
-        enriched_X.rename(columns=
+        reversed_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
+        X_sampled.rename(columns=reversed_renaming, inplace=True)
+        enriched_X.rename(columns=reversed_renaming, inplace=True)
         for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
-            eval_X_sampled.rename(columns=
-            enriched_eval_X.rename(columns=
-            search_keys = {self.fit_columns_renaming.get(k, k): v for k, v in search_keys.items()}
+            eval_X_sampled.rename(columns=reversed_renaming, inplace=True)
+            enriched_eval_X.rename(columns=reversed_renaming, inplace=True)
 
         datasets_hash = hash_input(self.X, self.y, self.eval_set)
         return self.__cache_and_return_results(
@@ -2359,7 +2112,7 @@ class FeaturesEnricher(TransformerMixin):
 
     def __extract_eval_data(
         self, enriched_df: pd.DataFrame, x_columns: List[str], enriched_X_columns: List[str], eval_set_len: int
-    ) ->
+    ) -> Dict[int, Tuple]:
         eval_set_sampled_dict = {}
 
         for idx in range(eval_set_len):
@@ -2405,12 +2158,12 @@ class FeaturesEnricher(TransformerMixin):
         columns_renaming: Dict[str, str],
     ):
         # X_sampled - with hash-suffixes
-
-
-
-
-
-
+        reversed_renaming = {v: k for k, v in columns_renaming.items()}
+        search_keys = {
+            reversed_renaming.get(k, k): v
+            for k, v in search_keys.items()
+            if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
+        }
         return FeaturesEnricher._EnrichedDataForMetrics(
             X_sampled=X_sampled,
             y_sampled=y_sampled,
@@ -2560,7 +2313,7 @@ if response.status_code == 200:
         self.logger.info("Start transform")
 
         validated_X, validated_y, validated_eval_set = self._validate_train_eval(
-            X, y, eval_set=None, is_transform=True
+            X, y, eval_set=None, is_transform=True
         )
         df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
 
@@ -2662,7 +2415,7 @@ if response.status_code == 200:
         else:
             self.logger.info("Input dataset hasn't date column")
             if self.__should_add_date_column():
-                df = self._add_current_date_as_key(df, search_keys, self.bundle)
+                df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
 
         email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
         if email_columns and self.generate_search_key_features:
@@ -2911,8 +2664,7 @@ if response.status_code == 200:
         selecting_columns = [
             c
             for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
-            if
-            or c in (self.id_columns or [])
+            if c not in self.zero_shap_client_features or c in (self.id_columns or [])
         ]
         selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
         if add_fit_system_record_id:
@@ -3046,7 +2798,6 @@ if response.status_code == 200:
         scoring: Union[Callable, str, None],
         estimator: Optional[Any],
         importance_threshold: Optional[float],
-        stability_threshold: float,
         max_features: Optional[int],
         remove_outliers_calc_metrics: Optional[bool],
         auto_fe_parameters: AutoFEParameters,
@@ -3061,7 +2812,6 @@ if response.status_code == 200:
         self.fit_columns_renaming = None
         self.fit_dropped_features = set()
         self.fit_generated_features = []
-        self.psi_values = None
 
         validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, eval_set)
 
@@ -3158,7 +2908,7 @@ if response.status_code == 200:
             self.logger.info("Input dataset hasn't date column")
             # TODO remove when this logic will be implemented on the back
             if self.__should_add_date_column():
-                df = self._add_current_date_as_key(df, self.fit_search_keys, self.bundle)
+                df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
 
         email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
         if email_columns and self.generate_search_key_features:
@@ -3173,13 +2923,10 @@ if response.status_code == 200:
         except Exception:
             self.logger.exception("Failed to check dates distribution validity")
 
-        self.__adjust_cv(df)
-
         if (
             is_numeric_dtype(df[self.TARGET_NAME])
             and self.model_task_type in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]
             and has_date
-            and (self.cv is None or not self.cv.is_time_series())
         ):
             self._validate_PSI(df.sort_values(by=maybe_date_column))
 
@@ -3211,8 +2958,8 @@ if response.status_code == 200:
 
         self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
 
-
-
+        self.__adjust_cv(df)
+
         if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
             id_columns = self.__get_renamed_id_columns()
             if id_columns:
@@ -3417,21 +3164,6 @@ if response.status_code == 200:
                 display_id=f"autofe_descriptions_{uuid.uuid4()}",
             )
 
-        self._select_features_by_psi(
-            trace_id=trace_id,
-            X=X,
-            y=y,
-            eval_set=eval_set,
-            stability_threshold=stability_threshold,
-            cv=self.cv,
-            estimator=estimator,
-            exclude_features_sources=exclude_features_sources,
-            importance_threshold=importance_threshold,
-            max_features=max_features,
-            progress_bar=progress_bar,
-            progress_callback=progress_callback,
-        )
-
         if self._has_paid_features(exclude_features_sources):
             if calculate_metrics is not None and calculate_metrics:
                 msg = self.bundle.get("metrics_with_paid_features")
@@ -3517,21 +3249,19 @@ if response.status_code == 200:
         reverse_renaming = {v: k for k, v in renaming.items()}
         return None if self.id_columns is None else [reverse_renaming.get(c) or c for c in self.id_columns]
 
-    def __adjust_cv(self, df: pd.DataFrame, force: bool = False):
-        if self.cv is not None and not force:
-            return
-
+    def __adjust_cv(self, df: pd.DataFrame):
         date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
         # Check Multivariate time series
         if (
-            date_column
+            self.cv is None
+            and date_column
             and self.model_task_type == ModelTaskType.REGRESSION
             and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys())) == 0
             and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
         ):
             msg = self.bundle.get("multivariate_timeseries_detected")
             self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
-        elif self.model_task_type != ModelTaskType.REGRESSION:
+        elif self.cv is None and self.model_task_type != ModelTaskType.REGRESSION:
             msg = self.bundle.get("group_k_fold_in_classification")
             self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
             group_columns = self._get_group_columns(df, self.fit_search_keys)
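
The `force` escape hatch is gone from `__adjust_cv`; instead, `self.cv is None` is checked in each branch, so an explicitly configured CV is never overridden. A simplified sketch of the decision order (the real blocked-time-series branch additionally requires a date key and the absence of PHONE/EMAIL/HEM keys):

```python
from enum import Enum

class CVType(Enum):
    blocked_time_series = "blocked_time_series"
    group_k_fold = "group_k_fold"

def adjust_cv(user_cv, is_regression: bool, looks_like_blocked_ts: bool):
    """Sketch: auto-pick a CV strategy only when the user left cv=None."""
    if user_cv is None and is_regression and looks_like_blocked_ts:
        return CVType.blocked_time_series  # multivariate time series detected
    if user_cv is None and not is_regression:
        return CVType.group_k_fold  # classification defaults to group k-fold
    return user_cv  # never override an explicit choice

assert adjust_cv(None, True, True) is CVType.blocked_time_series
assert adjust_cv(None, False, False) is CVType.group_k_fold
assert adjust_cv(CVType.group_k_fold, True, True) is CVType.group_k_fold
```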
@@ -3559,42 +3289,48 @@ if response.status_code == 200:
         y: Optional[pd.Series] = None,
         eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]] = None,
         is_transform: bool = False,
-        silent: bool = False,
     ) -> Tuple[pd.DataFrame, pd.Series, Optional[List[Tuple[pd.DataFrame, pd.Series]]]]:
         validated_X = self._validate_X(X, is_transform)
         validated_y = self._validate_y(validated_X, y, enforce_y=not is_transform)
-        validated_eval_set = self._validate_eval_set(validated_X, eval_set
+        validated_eval_set = self._validate_eval_set(validated_X, eval_set)
         return validated_X, validated_y, validated_eval_set
 
     def _encode_id_columns(
         self,
         X: pd.DataFrame,
+        columns_renaming: Optional[Dict[str, str]] = None,
     ) -> Tuple[pd.DataFrame, Dict[str, List[Any]]]:
+        columns_renaming = columns_renaming or {}
         unknown_dict = {}
 
         if self.id_columns and self.id_columns_encoder is not None:
-
-
-
-
-
-
-
-
-
-
-
-
-
+            inverse_columns_renaming = {v: k for k, v in columns_renaming.items()}
+            renamed_id_columns = [
+                inverse_columns_renaming.get(col, col) for col in self.id_columns_encoder.feature_names_in_
+            ]
+            self.logger.info(f"Convert id columns to int: {renamed_id_columns}")
+            encoded = self.id_columns_encoder.transform(X[renamed_id_columns].rename(columns=columns_renaming))
+            for i, c in enumerate(renamed_id_columns):
+                unknown_values = X[encoded[:, i] == -1][c].unique().tolist()
+                if len(unknown_values) > 0:
+                    unknown_dict[c] = unknown_values
+            X[renamed_id_columns] = encoded
+            X = X.loc[(X[renamed_id_columns] != -1).all(axis=1)]
+
+            if len(unknown_dict) > 0:
+                self.logger.warning(f"Unknown values in id columns: {unknown_dict}")
 
         return X, unknown_dict
 
-    def _decode_id_columns(self, X: pd.DataFrame):
+    def _decode_id_columns(self, X: pd.DataFrame, columns_renaming: Dict[str, str]):
+        columns_renaming = columns_renaming or {}
         if self.id_columns and self.id_columns_encoder is not None:
-
-
-
-
+            inverse_columns_renaming = {v: k for k, v in columns_renaming.items()}
+            renamed_id_columns = [
+                inverse_columns_renaming.get(col, col) for col in self.id_columns_encoder.feature_names_in_
+            ]
+            decoded = self.id_columns_encoder.inverse_transform(X[renamed_id_columns].rename(columns=columns_renaming))
+            X[renamed_id_columns] = decoded
 
         return X
 
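
`_encode_id_columns` and `_decode_id_columns` now take `columns_renaming` so that the id-column encoder, fit on original client column names, can be applied to frames carrying internal names. A round-trip sketch with scikit-learn's `OrdinalEncoder`; the `handle_unknown="use_encoded_value", unknown_value=-1` setup is an assumption that matches the `== -1` checks above, and all column names are invented:

```python
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# encoder fit on ORIGINAL column names, as during FeaturesEnricher.fit
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
encoder.fit(pd.DataFrame({"store_id": ["a", "b", "c"]}))

# at metrics time the frame carries internal (hash-suffixed) column names
columns_renaming = {"store_id_9f3a": "store_id"}  # internal -> original
X = pd.DataFrame({"store_id_9f3a": ["b", "z", "a"]})

inverse = {v: k for k, v in columns_renaming.items()}
renamed_id_columns = [inverse.get(c, c) for c in encoder.feature_names_in_]

# rename back to original names only for the encoder call itself
encoded = encoder.transform(X[renamed_id_columns].rename(columns=columns_renaming))
unknown = {c: X[encoded[:, i] == -1][c].unique().tolist()
           for i, c in enumerate(renamed_id_columns)}
X[renamed_id_columns] = encoded
X = X.loc[(X[renamed_id_columns] != -1).all(axis=1)]  # drop rows with unknown ids

assert unknown == {"store_id_9f3a": ["z"]}
assert X["store_id_9f3a"].tolist() == [1.0, 0.0]
```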
@@ -3688,30 +3424,10 @@ if response.status_code == 200:
 
         return validated_y
 
-    def _validate_eval_set(
-        self, X: pd.DataFrame, eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]], silent: bool = False
-    ):
+    def _validate_eval_set(self, X: pd.DataFrame, eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]]):
         if eval_set is None:
             return None
-
-        has_date = self._get_date_column(self.search_keys) is not None
-        for idx, eval_pair in enumerate(eval_set):
-            validated_pair = self._validate_eval_set_pair(X, eval_pair)
-            if validated_pair[1].isna().all():
-                if not has_date:
-                    msg = self.bundle.get("oot_without_date_not_supported").format(idx + 1)
-                elif self.columns_for_online_api:
-                    msg = self.bundle.get("oot_with_online_sources_not_supported").format(idx + 1)
-                else:
-                    msg = None
-                if msg:
-                    if not silent:
-                        print(msg)
-                    self.logger.warning(msg)
-                    continue
-            validated_eval_set.append(validated_pair)
-
-        return validated_eval_set
+        return [self._validate_eval_set_pair(X, eval_pair) for eval_pair in eval_set]
 
     def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
         if len(eval_pair) != 2:
@@ -3786,18 +3502,16 @@ if response.status_code == 200:
             raise ValidationError(self.bundle.get("unsupported_y_type_eval_set").format(type(eval_y)))
 
         eval_y_nunique = validated_eval_y.nunique()
-
-        if not is_oot and eval_y_nunique < 2:
+        if eval_y_nunique < 2:
             raise ValidationError(self.bundle.get("y_is_constant_eval_set"))
 
-        if
+        if self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
             raise ValidationError(self.bundle.get("binary_target_eval_unique_count_not_2").format(eval_y_nunique))
 
-
-
-
-
-            raise ValidationError(self.bundle.get("eval_x_has_train_samples"))
+        # Check for duplicates between train and eval sets by comparing all values
+        train_eval_intersection = pd.merge(X, validated_eval_X, how="inner")
+        if len(train_eval_intersection) > 0:
+            raise ValidationError(self.bundle.get("eval_x_has_train_samples"))
 
         return validated_eval_X, validated_eval_y
 
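
The new duplicate check is a plain inner join over all shared columns: any eval row whose full value tuple also appears in the train frame fails validation. A small demonstration of why `pd.merge(..., how="inner")` works as an exact-row intersection detector:

```python
import pandas as pd

X_train = pd.DataFrame({"f1": [1, 2, 3], "f2": ["a", "b", "c"]})
X_eval = pd.DataFrame({"f1": [3, 4], "f2": ["c", "d"]})

# with no `on=`, merge joins on all common columns, keeping only rows
# whose full value tuple appears in both frames
overlap = pd.merge(X_train, X_eval, how="inner")
assert len(overlap) == 1 and overlap.iloc[0].tolist() == [3, "c"]
# a non-empty intersection triggers ValidationError("eval_x_has_train_samples")
```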
@@ -3813,12 +3527,10 @@ if response.status_code == 200:
         if isinstance(eval_set, tuple):
             eval_set = [eval_set]
         for eval in eval_set:
-
-
-
-
-            if eval[0][self.baseline_score_column].isna().any():
-                raise ValidationError(self.bundle.get("baseline_score_column_has_na"))
+            if self.baseline_score_column not in eval[0].columns:
+                raise ValidationError(self.bundle.get("baseline_score_column_not_exists"))
+            if eval[0][self.baseline_score_column].isna().any():
+                raise ValidationError(self.bundle.get("baseline_score_column_has_na"))
 
     @staticmethod
     def _sample_X_and_y(X: pd.DataFrame, y: pd.Series, enriched_X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
@@ -3992,7 +3704,7 @@ if response.status_code == 200:
         return df
 
     def _add_current_date_as_key(
-        self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], bundle: ResourceBundle
+        self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
     ) -> pd.DataFrame:
         if (
             set(search_keys.values()) == {SearchKey.PHONE}
@@ -4000,8 +3712,7 @@ if response.status_code == 200:
             or set(search_keys.values()) == {SearchKey.HEM}
             or set(search_keys.values()) == {SearchKey.COUNTRY, SearchKey.POSTAL_CODE}
         ):
-
-            self.__log_warning(bundle.get("current_date_added"))
+            self.__log_warning(bundle.get("current_date_added"))
             df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
             search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
             converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, generate_cyclical_features=False)
@@ -4140,7 +3851,7 @@ if response.status_code == 200:
         columns_to_sort = [date_column] if date_column is not None else []
 
         do_sorting = True
-        if self.id_columns and self.cv
+        if self.id_columns and self.cv.is_time_series():
             # Check duplicates by date and id_columns
             reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
             renamed_id_columns = [reversed_columns_renaming.get(c, c) for c in self.id_columns]
@@ -4336,11 +4047,7 @@ if response.status_code == 200:
         return [f.name for f in features_meta if f.type == "categorical" and f.name not in (self.id_columns or [])]
 
     def __prepare_feature_importances(
-        self,
-        trace_id: str,
-        clients_features_df: pd.DataFrame,
-        updated_shaps: Optional[Dict[str, float]] = None,
-        silent=False,
+        self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
     ):
         if self._search_task is None:
             raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
@@ -4353,12 +4060,11 @@ if response.status_code == 200:
         features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
 
         # To be sure that names with hash suffixes
-        clients_features_df = clients_features_df.rename(columns=original_names_dict)
+        df = df.rename(columns=original_names_dict)
 
         self.feature_names_ = []
         self.external_source_feature_names = []
         self.zero_shap_client_features = []
-        self.unstable_client_features = []
         self.feature_importances_ = []
         features_info = []
         features_info_without_links = []
@@ -4367,10 +4073,10 @@ if response.status_code == 200:
         original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}
 
         for feature_meta in features_meta:
-
-
+            if feature_meta.name in original_names_dict.keys():
+                feature_meta.name = original_names_dict[feature_meta.name]
 
-            is_client_feature = original_names_dict.get(feature_meta.name, feature_meta.name) in clients_features_df.columns
+            is_client_feature = original_names_dict.get(feature_meta.name, feature_meta.name) in df.columns
 
             # Show and update shap values for client features only if select_features is True
             if updated_shaps is not None and (not is_client_feature or self.fit_select_features):
@@ -4387,21 +4093,12 @@ if response.status_code == 200:
 
         for feature_meta in features_meta:
             original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
-            is_client_feature = original_name in clients_features_df.columns
+            is_client_feature = original_name in df.columns
 
             if not is_client_feature:
                 self.external_source_feature_names.append(original_name)
 
-            if self.psi_values is not None:
-                if original_name in self.psi_values:
-                    feature_meta.psi_value = self.psi_values[original_name]
-                else:
-                    if is_client_feature and self.fit_select_features:
-                        self.unstable_client_features.append(original_name)
-                    continue
-
             # TODO make a decision about selected features based on special flag from mlb
-
             if original_shaps.get(feature_meta.name, 0.0) == 0.0:
                 if is_client_feature and self.fit_select_features:
                     self.zero_shap_client_features.append(original_name)
@@ -4425,7 +4122,7 @@ if response.status_code == 200:
             self.feature_names_.append(feature_meta.name)
             self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
 
-            df_for_sample = features_df if feature_meta.name in features_df.columns else clients_features_df
+            df_for_sample = features_df if feature_meta.name in features_df.columns else df
             feature_info = FeatureInfo.from_metadata(feature_meta, df_for_sample, is_client_feature)
             features_info.append(feature_info.to_row(self.bundle))
             features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
@@ -4433,8 +4130,6 @@ if response.status_code == 200:
 
         if len(features_info) > 0:
             self.features_info = pd.DataFrame(features_info)
-            if self.features_info[self.bundle.get("features_info_psi")].isna().all():
-                self.features_info.drop(columns=[self.bundle.get("features_info_psi")], inplace=True)
             self._features_info_without_links = pd.DataFrame(features_info_without_links)
             self._internal_features_info = pd.DataFrame(internal_features_info)
             if not silent: