upgini 1.2.113a5__py3-none-any.whl → 1.2.113a3974.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/autofe/date.py +8 -2
- upgini/features_enricher.py +125 -438
- upgini/metadata.py +0 -1
- upgini/metrics.py +1 -4
- upgini/resource_bundle/strings.properties +1 -4
- upgini/sampler/base.py +1 -3
- upgini/sampler/random_under_sampler.py +8 -18
- upgini/utils/deduplicate_utils.py +7 -43
- upgini/utils/feature_info.py +0 -5
- {upgini-1.2.113a5.dist-info → upgini-1.2.113a3974.dev2.dist-info}/METADATA +1 -1
- {upgini-1.2.113a5.dist-info → upgini-1.2.113a3974.dev2.dist-info}/RECORD +14 -15
- {upgini-1.2.113a5.dist-info → upgini-1.2.113a3974.dev2.dist-info}/WHEEL +1 -1
- upgini/utils/psi.py +0 -294
- {upgini-1.2.113a5.dist-info → upgini-1.2.113a3974.dev2.dist-info}/licenses/LICENSE +0 -0
upgini/features_enricher.py
CHANGED
@@ -112,7 +112,6 @@ except Exception:
         CustomFallbackProgressBar as ProgressBar,
     )

-from upgini.utils.psi import calculate_features_psi, calculate_sparsity_psi
 from upgini.utils.sample_utils import SampleColumns, SampleConfig, _num_samples, sample
 from upgini.utils.sort import sort_columns
 from upgini.utils.target_utils import calculate_psi, define_task
@@ -298,9 +297,7 @@ class FeaturesEnricher(TransformerMixin):
         self.feature_names_ = []
         self.external_source_feature_names = []
         self.zero_shap_client_features = []
-        self.unstable_client_features = []
         self.feature_importances_ = []
-        self.psi_values: Optional[Dict[str, float]] = None
         self.search_id = search_id
         self.disable_force_downsampling = disable_force_downsampling
         self.print_trace_id = print_trace_id
@@ -401,26 +398,13 @@ class FeaturesEnricher(TransformerMixin):
     @staticmethod
     def _check_eval_set(eval_set, X, bundle: ResourceBundle):
         checked_eval_set = []
-        if eval_set is None:
-            return checked_eval_set
-        if isinstance(eval_set, tuple):
+        if eval_set is not None and isinstance(eval_set, tuple):
             eval_set = [eval_set]
-        if not isinstance(eval_set, list):
+        if eval_set is not None and not isinstance(eval_set, list):
             raise ValidationError(bundle.get("unsupported_type_eval_set").format(type(eval_set)))
         for eval_pair in eval_set or []:
-            # Handle OOT
-            if isinstance(eval_pair, pd.DataFrame):
-                empty_target = pd.Series([np.nan] * len(eval_pair), index=eval_pair.index)
-                eval_pair = (eval_pair, empty_target)
-            elif isinstance(eval_pair, tuple) and len(eval_pair) == 1:
-                empty_target = pd.Series([np.nan] * len(eval_pair[0]), index=eval_pair[0].index)
-                eval_pair = (eval_pair[0], empty_target)
-
             if not isinstance(eval_pair, tuple) or len(eval_pair) != 2:
                 raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
-            if eval_pair[1] is None:
-                empty_target = pd.Series([np.nan] * len(eval_pair[0]), index=eval_pair[0].index)
-                eval_pair = (eval_pair[0], empty_target)
             if not is_frames_equal(X, eval_pair[0], bundle):
                 checked_eval_set.append(eval_pair)
         return checked_eval_set
@@ -442,7 +426,6 @@ class FeaturesEnricher(TransformerMixin):
         search_id_callback: Optional[Callable[[str], Any]] = None,
         select_features: bool = True,
         auto_fe_parameters: Optional[AutoFEParameters] = None,
-        stability_threshold: float = 0.15,
         **kwargs,
     ):
         """Fit to data.
@@ -532,7 +515,6 @@ class FeaturesEnricher(TransformerMixin):
             estimator=estimator,
             scoring=scoring,
             importance_threshold=importance_threshold,
-            stability_threshold=stability_threshold,
             max_features=max_features,
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
             auto_fe_parameters=auto_fe_parameters,
@@ -592,7 +574,6 @@ class FeaturesEnricher(TransformerMixin):
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
         select_features: bool = True,
         auto_fe_parameters: Optional[AutoFEParameters] = None,
-        stability_threshold: float = 0.15,
         **kwargs,
     ) -> pd.DataFrame:
         """Fit to data, then transform it.
@@ -637,10 +618,6 @@ class FeaturesEnricher(TransformerMixin):
             If True, return only selected features both from input and data sources.
             Otherwise, return all features from input and only selected features from data sources.

-        stability_threshold: float, optional (default=0.15)
-            Stability threshold for selected features PSI calculation. If PSI is less than this threshold,
-            then feature will be dropped.
-
         Returns
         -------
         X_new: pandas.DataFrame of shape (n_samples, n_features_new)
@@ -697,7 +674,6 @@ class FeaturesEnricher(TransformerMixin):
             scoring=scoring,
             estimator=estimator,
             importance_threshold=importance_threshold,
-            stability_threshold=stability_threshold,
             max_features=max_features,
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
             auto_fe_parameters=auto_fe_parameters,
@@ -965,7 +941,7 @@ class FeaturesEnricher(TransformerMixin):
             raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))

         validated_X, validated_y, validated_eval_set = self._validate_train_eval(
-            effective_X, effective_y, effective_eval_set
+            effective_X, effective_y, effective_eval_set
         )

         if self.X is None:
@@ -1003,31 +979,29 @@ class FeaturesEnricher(TransformerMixin):
             return None

         cat_features_from_backend = self.__get_categorical_features()
-        # Convert to original names
-        cat_features_from_backend = [self.fit_columns_renaming.get(c, c) for c in cat_features_from_backend]
         client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
             estimator, validated_X, self.search_keys
         )
-        # Exclude id columns from cat_features
         if self.id_columns and self.id_columns_encoder is not None:
             if cat_features_from_backend:
                 cat_features_from_backend = [
                     c
                     for c in cat_features_from_backend
-                    if c not in self.id_columns_encoder.feature_names_in_
+                    if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
                 ]
             if client_cat_features:
                 client_cat_features = [
                     c
                     for c in client_cat_features
-                    if c not in self.id_columns_encoder.feature_names_in_
+                    if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
                 ]
         for cat_feature in cat_features_from_backend:
-
-
-
+            original_cat_feature = self.fit_columns_renaming.get(cat_feature)
+            if original_cat_feature in self.search_keys:
+                if self.search_keys[original_cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
+                    search_keys_for_metrics.append(original_cat_feature)
             else:
-                self.logger.warning(self.bundle.get("cat_feature_search_key").format(
+                self.logger.warning(self.bundle.get("cat_feature_search_key").format(original_cat_feature))
         search_keys_for_metrics.extend([c for c in self.id_columns or [] if c not in search_keys_for_metrics])
         self.logger.info(f"Search keys for metrics: {search_keys_for_metrics}")

@@ -1059,9 +1033,23 @@ class FeaturesEnricher(TransformerMixin):
             groups,
             _cv,
             columns_renaming,
-            _,
         ) = prepared_data

+        # rename cat_features
+        if client_cat_features:
+            for new_c, old_c in columns_renaming.items():
+                if old_c in client_cat_features:
+                    client_cat_features.remove(old_c)
+                    client_cat_features.append(new_c)
+            for cat_feature in client_cat_features:
+                if cat_feature not in fitting_X.columns:
+                    self.logger.error(
+                        f"Client cat_feature `{cat_feature}` not found in"
+                        f" x columns: {fitting_X.columns.to_list()}"
+                    )
+        else:
+            client_cat_features = []
+
         # rename baseline_score_column
         reversed_renaming = {v: k for k, v in columns_renaming.items()}
         baseline_score_column = self.baseline_score_column
@@ -1086,9 +1074,9 @@ class FeaturesEnricher(TransformerMixin):
         self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)

         has_date = self._get_date_column(search_keys) is not None
+        has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
         model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
         cat_features = list(set(client_cat_features + cat_features_from_backend))
-        has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
         baseline_cat_features = [f for f in cat_features if f in fitting_X.columns]
         enriched_cat_features = [f for f in cat_features if f in fitting_enriched_X.columns]
         if len(enriched_cat_features) < len(cat_features):
@@ -1208,6 +1196,8 @@ class FeaturesEnricher(TransformerMixin):
         # max_initial_eval_set_hit_rate = self._search_task.get_max_initial_eval_set_hit_rate_v2()
         if len(fitting_eval_set_dict) > 0:
             for idx in fitting_eval_set_dict.keys():
+                # eval_hit_rate = max_initial_eval_set_hit_rate[idx + 1]
+
                 (
                     eval_X_sorted,
                     eval_y_sorted,
@@ -1215,10 +1205,6 @@ class FeaturesEnricher(TransformerMixin):
                     enriched_eval_y_sorted,
                 ) = fitting_eval_set_dict[idx]

-                if eval_y_sorted.isna().all():
-                    # Skip OOT eval set
-                    continue
-
                 if baseline_estimator is not None:
                     self.logger.info(
                         f"Calculate baseline {metric} on eval set {idx + 1} "
@@ -1261,14 +1247,17 @@ class FeaturesEnricher(TransformerMixin):
                         "quality_metrics_eval_segment"
                     ).format(idx + 1),
                     self.bundle.get("quality_metrics_rows_header"): _num_samples(
+                        # effective_eval_set[idx][0]
                         # Use actually used for metrics dataset
                         eval_X_sorted
                     ),
+                    # self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
                 }
                 if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
                     eval_y_sorted
                 ):
                     eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
+                        # np.mean(validated_eval_set[idx][1]), 4
                         # Use actually used for metrics dataset
                         np.mean(eval_y_sorted),
                         4,
@@ -1290,7 +1279,7 @@ class FeaturesEnricher(TransformerMixin):
                 metrics.append(eval_metrics)

         if updating_shaps is not None:
-            decoded_X = self._decode_id_columns(fitting_X)
+            decoded_X = self._decode_id_columns(fitting_X, columns_renaming)
             self._update_shap_values(trace_id, decoded_X, updating_shaps, silent=not internal_call)

         metrics_df = pd.DataFrame(metrics)
@@ -1341,202 +1330,6 @@ class FeaturesEnricher(TransformerMixin):
         finally:
             self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")

-    def _select_features_by_psi(
-        self,
-        trace_id: str,
-        X: Union[pd.DataFrame, pd.Series, np.ndarray],
-        y: Union[pd.DataFrame, pd.Series, np.ndarray, List],
-        eval_set: Optional[Union[List[tuple], tuple]],
-        stability_threshold: float,
-        cv: Union[BaseCrossValidator, CVType, str, None] = None,
-        estimator=None,
-        exclude_features_sources: Optional[List[str]] = None,
-        importance_threshold: Optional[float] = None,
-        max_features: Optional[int] = None,
-        progress_bar: bool = True,
-        progress_callback: Optional[Callable] = None,
-    ):
-        search_keys = self.search_keys.copy()
-        validated_X, _, validated_eval_set = self._validate_train_eval(X, y, eval_set, silent=True)
-        if isinstance(X, np.ndarray):
-            search_keys = {str(k): v for k, v in search_keys.items()}
-
-        date_column = self._get_date_column(search_keys)
-        has_date = date_column is not None
-        if not has_date:
-            self.logger.info("No date column for OOT PSI calculation")
-            return
-        if not validated_eval_set:
-            self.logger.info("No eval set for OOT PSI calculation")
-            return
-        if validated_X[date_column].nunique() <= 1:
-            self.logger.warning("Constant date for OOT PSI calculation")
-            return
-        if self.cv is not None and self.cv.is_time_series():
-            self.logger.warning("Time series CV is not supported for OOT PSI calculation")
-            return
-
-        cat_features_from_backend = self.__get_categorical_features()
-        cat_features_from_backend = [self.fit_columns_renaming.get(c, c) for c in cat_features_from_backend]
-        client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
-            estimator, validated_X, search_keys
-        )
-        if self.id_columns and self.id_columns_encoder is not None:
-            if cat_features_from_backend:
-                cat_features_from_backend = [
-                    c
-                    for c in cat_features_from_backend
-                    if c not in self.id_columns_encoder.feature_names_in_
-                ]
-            if client_cat_features:
-                client_cat_features = [
-                    c
-                    for c in client_cat_features
-                    if c not in self.id_columns_encoder.feature_names_in_
-                ]
-
-        prepared_data = self._prepare_data_for_metrics(
-            trace_id=trace_id,
-            X=X,
-            y=y,
-            eval_set=eval_set,
-            exclude_features_sources=exclude_features_sources,
-            importance_threshold=importance_threshold,
-            max_features=max_features,
-            remove_outliers_calc_metrics=False,
-            cv_override=cv,
-            search_keys_for_metrics=search_keys_for_metrics,
-            progress_bar=progress_bar,
-            progress_callback=progress_callback,
-            client_cat_features=client_cat_features,
-        )
-        if prepared_data is None:
-            return None
-
-        (
-            validated_X,
-            fitting_X,
-            y_sorted,
-            fitting_enriched_X,
-            _,
-            fitting_eval_set_dict,
-            _,
-            _,
-            _,
-            columns_renaming,
-            eval_set_dates,
-        ) = prepared_data
-
-        model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
-        cat_features = list(set(client_cat_features + cat_features_from_backend))
-
-        # Drop unstable features
-        unstable_features = self._check_stability(
-            validated_X,
-            validated_eval_set,
-            fitting_eval_set_dict,
-            eval_set_dates,
-            search_keys,
-            stability_threshold,
-            cat_features,
-            model_task_type,
-        )
-        client_features_df = self.df_with_original_index.rename(columns=columns_renaming)
-        # decoded_X = self._decode_id_columns(fitting_X, columns_renaming)
-        self._update_report_psi(trace_id, client_features_df)
-
-        if unstable_features:
-            msg = f"Some features are unstable: {unstable_features} and will be dropped"
-            self.logger.warning(msg)
-            print(msg)
-            fitting_X = fitting_X.drop(columns=unstable_features, errors="ignore")
-            fitting_enriched_X = fitting_enriched_X.drop(columns=unstable_features, errors="ignore")
-            msg = f"Threre are {len(fitting_enriched_X.columns)} stable selected features left"
-            self.logger.info(msg)
-            print(msg)
-            for idx, (
-                eval_X,
-                eval_y,
-                eval_enriched_X,
-                eval_enriched_y,
-            ) in fitting_eval_set_dict.items():
-                eval_X = eval_X.drop(columns=unstable_features, errors="ignore")
-                eval_enriched_X = eval_enriched_X.drop(columns=unstable_features, errors="ignore")
-                fitting_eval_set_dict[idx] = (eval_X, eval_y, eval_enriched_X, eval_enriched_y)
-
-    def _check_stability(
-        self,
-        X: pd.DataFrame,
-        eval_set: List[Tuple[pd.DataFrame, pd.Series]],
-        enriched_eval_set: Dict,
-        eval_set_dates: Dict[int, pd.Series],
-        search_keys: Dict[str, SearchKey],
-        stability_threshold: float,
-        cat_features: List[str],
-        model_task_type: ModelTaskType,
-    ) -> List[str]:
-        # Find latest eval set or earliest if all eval sets are before train set
-        date_column = self._get_date_column(search_keys)
-
-        # Get minimum date from main dataset X
-        main_min_date = X[date_column].min()
-
-        # Find minimum date for each eval_set and compare with main dataset
-        eval_dates = []
-        for i, (eval_x, _) in enumerate(eval_set):
-            if date_column in eval_x.columns:
-                eval_min_date = eval_x[date_column].min()
-                eval_max_date = eval_x[date_column].max()
-                eval_dates.append((i, eval_min_date, eval_max_date))
-
-        if not eval_dates:
-            return []
-
-        # Check if any eval_set has minimum date >= main dataset minimum date
-        later_eval_sets = [(i, min_date, max_date) for i, min_date, max_date in eval_dates if min_date >= main_min_date]
-
-        if later_eval_sets:
-            # If there are eval_sets with date >= main date, choose the one with highest maximum date
-            selected_eval_set_idx = max(later_eval_sets, key=lambda x: x[2])[0]
-        else:
-            # If all eval_sets have dates < main date, choose the one with lowest minimux date
-            selected_eval_set_idx = max(eval_dates, key=lambda x: x[1])[0]
-
-        checking_eval_set = enriched_eval_set[selected_eval_set_idx]
-
-        checking_eval_set_df = (
-            checking_eval_set[2]
-            if checking_eval_set[1] is None or checking_eval_set[1].isna().all()
-            else pd.concat([checking_eval_set[2], checking_eval_set[1].to_frame(TARGET)], axis=1)
-        )
-        checking_eval_set_df = checking_eval_set_df.copy()
-
-        checking_eval_set_df[date_column] = eval_set_dates[selected_eval_set_idx]
-
-        psi_values_sparse = calculate_sparsity_psi(
-            checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
-        )
-
-        unstable_by_sparsity = [feature for feature, psi in psi_values_sparse.items() if psi > stability_threshold]
-        if unstable_by_sparsity:
-            self.logger.info(f"Unstable by sparsity features: {sorted(unstable_by_sparsity)}")
-
-        psi_values = calculate_features_psi(
-            checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
-        )
-
-        unstable_by_value = [feature for feature, psi in psi_values.items() if psi > stability_threshold]
-        if unstable_by_value:
-            self.logger.info(f"Unstable by value features: {sorted(unstable_by_value)}")
-
-        self.psi_values = {
-            feature: psi_value for feature, psi_value in psi_values.items() if psi_value <= stability_threshold
-        }
-
-        total_unstable_features = sorted(set(unstable_by_sparsity + unstable_by_value))
-
-        return total_unstable_features
-
     def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float], silent: bool = False):
         renaming = self.fit_columns_renaming or {}
         self.logger.info(f"Updating SHAP values: {new_shaps}")
@@ -1592,56 +1385,6 @@ class FeaturesEnricher(TransformerMixin):
         except (ImportError, NameError):
             pass

-    def _update_report_psi(self, trace_id: str, clients_features_df: pd.DataFrame):
-        self.__prepare_feature_importances(trace_id, clients_features_df)
-
-        if self.features_info_display_handle is not None:
-            try:
-                _ = get_ipython() # type: ignore
-
-                display_html_dataframe(
-                    self.features_info,
-                    self._features_info_without_links,
-                    self.bundle.get("relevant_features_header"),
-                    display_handle=self.features_info_display_handle,
-                )
-            except (ImportError, NameError):
-                pass
-
-        if self.data_sources_display_handle is not None:
-            try:
-                _ = get_ipython() # type: ignore
-
-                display_html_dataframe(
-                    self.relevant_data_sources,
-                    self._relevant_data_sources_wo_links,
-                    self.bundle.get("relevant_data_sources_header"),
-                    display_handle=self.data_sources_display_handle,
-                )
-            except (ImportError, NameError):
-                pass
-
-        if self.autofe_features_display_handle is not None:
-            try:
-                _ = get_ipython() # type: ignore
-                autofe_descriptions_df = self.get_autofe_features_description()
-                if autofe_descriptions_df is not None:
-                    display_html_dataframe(
-                        df=autofe_descriptions_df,
-                        internal_df=autofe_descriptions_df,
-                        header=self.bundle.get("autofe_descriptions_header"),
-                        display_handle=self.autofe_features_display_handle,
-                    )
-            except (ImportError, NameError):
-                pass
-        if self.report_button_handle is not None:
-            try:
-                _ = get_ipython() # type: ignore
-
-                self.__show_report_button(display_handle=self.report_button_handle)
-            except (ImportError, NameError):
-                pass
-
     def _check_train_and_eval_target_distribution(self, y, eval_set_dict):
         uneven_distribution = False
         for eval_set in eval_set_dict.values():
@@ -1745,7 +1488,7 @@ class FeaturesEnricher(TransformerMixin):
     def _get_and_validate_client_cat_features(
         self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
     ) -> Tuple[Optional[List[str]], List[str]]:
-        cat_features =
+        cat_features = None
         search_keys_for_metrics = []
         if (
             estimator is not None
@@ -1792,7 +1535,7 @@ class FeaturesEnricher(TransformerMixin):
         is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
         is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
         checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
-        validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set
+        validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set)

         sampled_data = self._get_enriched_for_metrics(
             trace_id,
@@ -1806,7 +1549,7 @@ class FeaturesEnricher(TransformerMixin):
             progress_bar,
             progress_callback,
         )
-
+        X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = dataclasses.astuple(
             sampled_data
         )

@@ -1829,7 +1572,8 @@ class FeaturesEnricher(TransformerMixin):
                 or c in set(self.feature_names_).union(self.id_columns or [])
                 or (self.fit_columns_renaming or {}).get(c, c) in set(self.feature_names_).union(self.id_columns or [])
             )
-            and c
+            and c
+            not in (
                 excluding_search_keys
                 + list(self.fit_dropped_features)
                 + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
@@ -1914,7 +1658,7 @@ class FeaturesEnricher(TransformerMixin):
             fitting_X, y_sorted, search_keys, self.model_task_type, sort_all_columns=True, logger=self.logger
         )
         fitting_X = fitting_X[fitting_x_columns]
-        fitting_X, _ = self._encode_id_columns(fitting_X)
+        fitting_X, _ = self._encode_id_columns(fitting_X, self.fit_columns_renaming)
         self.logger.info(f"Final sorted list of fitting X columns: {fitting_x_columns}")
         fitting_enriched_x_columns = fitting_enriched_X.columns.to_list()
         fitting_enriched_x_columns = sort_columns(
@@ -1926,18 +1670,14 @@ class FeaturesEnricher(TransformerMixin):
             logger=self.logger,
         )
         fitting_enriched_X = fitting_enriched_X[fitting_enriched_x_columns]
-        fitting_enriched_X, _ = self._encode_id_columns(fitting_enriched_X)
+        fitting_enriched_X, _ = self._encode_id_columns(fitting_enriched_X, self.fit_columns_renaming)
         self.logger.info(f"Final sorted list of fitting enriched X columns: {fitting_enriched_x_columns}")
-        date_column = self._get_date_column(search_keys)
-        eval_set_dates = {}
         for idx, eval_tuple in eval_set_sampled_dict.items():
             eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
             eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
             enriched_eval_X_sorted, enriched_eval_y_sorted = self._sort_by_system_record_id(
                 enriched_eval_X, eval_y_sampled, self.cv
             )
-            if date_column is not None:
-                eval_set_dates[idx] = eval_X_sorted[date_column]
             fitting_eval_X = eval_X_sorted[fitting_x_columns].copy()
             fitting_enriched_eval_X = enriched_eval_X_sorted[fitting_enriched_x_columns].copy()

@@ -1958,8 +1698,8 @@ class FeaturesEnricher(TransformerMixin):
                 .astype(np.float64)
             )

-            fitting_eval_X, unknown_dict = self._encode_id_columns(fitting_eval_X)
-            fitting_enriched_eval_X, _ = self._encode_id_columns(fitting_enriched_eval_X)
+            fitting_eval_X, unknown_dict = self._encode_id_columns(fitting_eval_X, self.fit_columns_renaming)
+            fitting_enriched_eval_X, _ = self._encode_id_columns(fitting_enriched_eval_X, self.fit_columns_renaming)

             if len(unknown_dict) > 0:
                 print(self.bundle.get("unknown_id_column_value_in_eval_set").format(unknown_dict))
@@ -1982,7 +1722,6 @@ class FeaturesEnricher(TransformerMixin):
             groups,
             cv,
             columns_renaming,
-            eval_set_dates,
         )

     @dataclass
@@ -2145,16 +1884,14 @@ class FeaturesEnricher(TransformerMixin):
         remove_outliers_calc_metrics: Optional[bool],
     ) -> _EnrichedDataForMetrics:
         eval_set_sampled_dict = {}
-        search_keys = self.fit_search_keys
+        search_keys = self.fit_search_keys

         rows_to_drop = None
         has_date = self._get_date_column(search_keys) is not None
         self.model_task_type = self.model_task_type or define_task(
             self.df_with_original_index[TARGET], has_date, self.logger, silent=True
         )
-        if
-            remove_outliers_calc_metrics = True
-        if self.model_task_type == ModelTaskType.REGRESSION and remove_outliers_calc_metrics:
+        if self.model_task_type == ModelTaskType.REGRESSION:
            target_outliers_df = self._search_task.get_target_outliers(trace_id)
            if target_outliers_df is not None and len(target_outliers_df) > 0:
                outliers = pd.merge(
@@ -2164,8 +1901,11 @@ class FeaturesEnricher(TransformerMixin):
                     how="inner",
                 )
                 top_outliers = outliers.sort_values(by=TARGET, ascending=False)[TARGET].head(3)
-
-
+                if remove_outliers_calc_metrics is None or remove_outliers_calc_metrics is True:
+                    rows_to_drop = outliers
+                    not_msg = ""
+                else:
+                    not_msg = "not "
                 msg = self.bundle.get("target_outliers_warning").format(len(target_outliers_df), top_outliers, not_msg)
                 print(msg)
                 self.logger.warning(msg)
@@ -2223,13 +1963,12 @@ class FeaturesEnricher(TransformerMixin):
             enriched_eval_X = enriched_eval_sets[idx + 1][enriched_X_columns].copy()
             eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)

-
-        X_sampled.rename(columns=
-        enriched_X.rename(columns=
+        reversed_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
+        X_sampled.rename(columns=reversed_renaming, inplace=True)
+        enriched_X.rename(columns=reversed_renaming, inplace=True)
         for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
-            eval_X_sampled.rename(columns=
-            enriched_eval_X.rename(columns=
-        search_keys = {self.fit_columns_renaming.get(k, k): v for k, v in search_keys.items()}
+            eval_X_sampled.rename(columns=reversed_renaming, inplace=True)
+            enriched_eval_X.rename(columns=reversed_renaming, inplace=True)

         datasets_hash = hash_input(self.X, self.y, self.eval_set)
         return self.__cache_and_return_results(
@@ -2287,7 +2026,13 @@ class FeaturesEnricher(TransformerMixin):
             enriched_df, x_columns, enriched_X.columns.tolist(), len(eval_set) if has_eval_set else 0
         )

-
+        # Add hash-suffixes because output of transform has original names
+        reversed_renaming = {v: k for k, v in columns_renaming.items()}
+        X_sampled.rename(columns=reversed_renaming, inplace=True)
+        enriched_X.rename(columns=reversed_renaming, inplace=True)
+        for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
+            eval_X_sampled.rename(columns=reversed_renaming, inplace=True)
+            enriched_eval_X.rename(columns=reversed_renaming, inplace=True)

         # Cache and return results
         datasets_hash = hash_input(validated_X, validated_y, eval_set)
@@ -2367,7 +2112,7 @@ class FeaturesEnricher(TransformerMixin):

     def __extract_eval_data(
         self, enriched_df: pd.DataFrame, x_columns: List[str], enriched_X_columns: List[str], eval_set_len: int
-    ) ->
+    ) -> Dict[int, Tuple]:
         eval_set_sampled_dict = {}

         for idx in range(eval_set_len):
@@ -2413,12 +2158,12 @@ class FeaturesEnricher(TransformerMixin):
         columns_renaming: Dict[str, str],
     ):
         # X_sampled - with hash-suffixes
-
-
-
-
-
-
+        reversed_renaming = {v: k for k, v in columns_renaming.items()}
+        search_keys = {
+            reversed_renaming.get(k, k): v
+            for k, v in search_keys.items()
+            if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
+        }
         return FeaturesEnricher._EnrichedDataForMetrics(
             X_sampled=X_sampled,
             y_sampled=y_sampled,
@@ -2568,7 +2313,7 @@ if response.status_code == 200:
         self.logger.info("Start transform")

         validated_X, validated_y, validated_eval_set = self._validate_train_eval(
-            X, y, eval_set=None, is_transform=True
+            X, y, eval_set=None, is_transform=True
         )
         df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)

@@ -2670,7 +2415,7 @@ if response.status_code == 200:
         else:
             self.logger.info("Input dataset hasn't date column")
         if self.__should_add_date_column():
-            df = self._add_current_date_as_key(df, search_keys, self.
+            df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)

         email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
         if email_columns and self.generate_search_key_features:
@@ -2919,8 +2664,7 @@ if response.status_code == 200:
         selecting_columns = [
             c
             for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
-            if
-            or c in (self.id_columns or [])
+            if c not in self.zero_shap_client_features or c in (self.id_columns or [])
         ]
         selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
         if add_fit_system_record_id:
@@ -3054,7 +2798,6 @@ if response.status_code == 200:
         scoring: Union[Callable, str, None],
         estimator: Optional[Any],
         importance_threshold: Optional[float],
-        stability_threshold: float,
         max_features: Optional[int],
         remove_outliers_calc_metrics: Optional[bool],
         auto_fe_parameters: AutoFEParameters,
@@ -3069,7 +2812,6 @@ if response.status_code == 200:
         self.fit_columns_renaming = None
         self.fit_dropped_features = set()
         self.fit_generated_features = []
-        self.psi_values = None

         validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, eval_set)

@@ -3166,7 +2908,7 @@ if response.status_code == 200:
             self.logger.info("Input dataset hasn't date column")
         # TODO remove when this logic will be implemented on the back
         if self.__should_add_date_column():
-            df = self._add_current_date_as_key(df, self.fit_search_keys, self.bundle)
+            df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)

         email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
         if email_columns and self.generate_search_key_features:
@@ -3181,13 +2923,10 @@ if response.status_code == 200:
         except Exception:
             self.logger.exception("Failed to check dates distribution validity")

-        self.__adjust_cv(df)
-
         if (
             is_numeric_dtype(df[self.TARGET_NAME])
             and self.model_task_type in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]
             and has_date
-            and (self.cv is None or not self.cv.is_time_series())
         ):
             self._validate_PSI(df.sort_values(by=maybe_date_column))

@@ -3219,8 +2958,8 @@ if response.status_code == 200:

         self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]

-
-
+        self.__adjust_cv(df)
+
         if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
             id_columns = self.__get_renamed_id_columns()
             if id_columns:
@@ -3425,21 +3164,6 @@ if response.status_code == 200:
                 display_id=f"autofe_descriptions_{uuid.uuid4()}",
             )

-        self._select_features_by_psi(
-            trace_id=trace_id,
-            X=X,
-            y=y,
-            eval_set=eval_set,
-            stability_threshold=stability_threshold,
-            cv=self.cv,
-            estimator=estimator,
-            exclude_features_sources=exclude_features_sources,
-            importance_threshold=importance_threshold,
-            max_features=max_features,
-            progress_bar=progress_bar,
-            progress_callback=progress_callback,
-        )
-
         if self._has_paid_features(exclude_features_sources):
             if calculate_metrics is not None and calculate_metrics:
                 msg = self.bundle.get("metrics_with_paid_features")
@@ -3525,21 +3249,19 @@ if response.status_code == 200:
         reverse_renaming = {v: k for k, v in renaming.items()}
         return None if self.id_columns is None else [reverse_renaming.get(c) or c for c in self.id_columns]

-    def __adjust_cv(self, df: pd.DataFrame
-        if self.cv is not None and not force:
-            return
-
+    def __adjust_cv(self, df: pd.DataFrame):
         date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
         # Check Multivariate time series
         if (
-
+            self.cv is None
+            and date_column
             and self.model_task_type == ModelTaskType.REGRESSION
             and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys())) == 0
             and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
         ):
             msg = self.bundle.get("multivariate_timeseries_detected")
             self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
-        elif self.model_task_type != ModelTaskType.REGRESSION:
+        elif self.cv is None and self.model_task_type != ModelTaskType.REGRESSION:
             msg = self.bundle.get("group_k_fold_in_classification")
             self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
             group_columns = self._get_group_columns(df, self.fit_search_keys)
@@ -3567,42 +3289,48 @@ if response.status_code == 200:
         y: Optional[pd.Series] = None,
         eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]] = None,
         is_transform: bool = False,
-        silent: bool = False,
     ) -> Tuple[pd.DataFrame, pd.Series, Optional[List[Tuple[pd.DataFrame, pd.Series]]]]:
         validated_X = self._validate_X(X, is_transform)
         validated_y = self._validate_y(validated_X, y, enforce_y=not is_transform)
-        validated_eval_set = self._validate_eval_set(validated_X, eval_set
+        validated_eval_set = self._validate_eval_set(validated_X, eval_set)
         return validated_X, validated_y, validated_eval_set

     def _encode_id_columns(
         self,
         X: pd.DataFrame,
+        columns_renaming: Optional[Dict[str, str]] = None,
     ) -> Tuple[pd.DataFrame, Dict[str, List[Any]]]:
+        columns_renaming = columns_renaming or {}
         unknown_dict = {}

         if self.id_columns and self.id_columns_encoder is not None:
-
-
-
-
-
-
-
-
-
-
-
-
-
+            inverse_columns_renaming = {v: k for k, v in columns_renaming.items()}
+            renamed_id_columns = [
+                inverse_columns_renaming.get(col, col) for col in self.id_columns_encoder.feature_names_in_
+            ]
+            self.logger.info(f"Convert id columns to int: {renamed_id_columns}")
+            encoded = self.id_columns_encoder.transform(X[renamed_id_columns].rename(columns=columns_renaming))
+            for i, c in enumerate(renamed_id_columns):
+                unknown_values = X[encoded[:, i] == -1][c].unique().tolist()
+                if len(unknown_values) > 0:
+                    unknown_dict[c] = unknown_values
+            X[renamed_id_columns] = encoded
+            X = X.loc[(X[renamed_id_columns] != -1).all(axis=1)]
+
+            if len(unknown_dict) > 0:
+                self.logger.warning(f"Unknown values in id columns: {unknown_dict}")

         return X, unknown_dict

-    def _decode_id_columns(self, X: pd.DataFrame):
+    def _decode_id_columns(self, X: pd.DataFrame, columns_renaming: Dict[str, str]):
+        columns_renaming = columns_renaming or {}
         if self.id_columns and self.id_columns_encoder is not None:
-
-
-
-
+            inverse_columns_renaming = {v: k for k, v in columns_renaming.items()}
+            renamed_id_columns = [
+                inverse_columns_renaming.get(col, col) for col in self.id_columns_encoder.feature_names_in_
+            ]
+            decoded = self.id_columns_encoder.inverse_transform(X[renamed_id_columns].rename(columns=columns_renaming))
+            X[renamed_id_columns] = decoded

         return X

@@ -3696,30 +3424,10 @@ if response.status_code == 200:

         return validated_y

-    def _validate_eval_set(
-        self, X: pd.DataFrame, eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]], silent: bool = False
-    ):
+    def _validate_eval_set(self, X: pd.DataFrame, eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]]):
         if eval_set is None:
             return None
-
-        has_date = self._get_date_column(self.search_keys) is not None
-        for idx, eval_pair in enumerate(eval_set):
-            validated_pair = self._validate_eval_set_pair(X, eval_pair)
-            if validated_pair[1].isna().all():
-                if not has_date:
-                    msg = self.bundle.get("oot_without_date_not_supported").format(idx + 1)
-                elif self.columns_for_online_api:
-                    msg = self.bundle.get("oot_with_online_sources_not_supported").format(idx + 1)
-                else:
-                    msg = None
-                if msg:
-                    if not silent:
-                        print(msg)
-                    self.logger.warning(msg)
-                    continue
-            validated_eval_set.append(validated_pair)
-
-        return validated_eval_set
+        return [self._validate_eval_set_pair(X, eval_pair) for eval_pair in eval_set]

     def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
         if len(eval_pair) != 2:
@@ -3794,18 +3502,16 @@ if response.status_code == 200:
             raise ValidationError(self.bundle.get("unsupported_y_type_eval_set").format(type(eval_y)))

         eval_y_nunique = validated_eval_y.nunique()
-
-        if not is_oot and eval_y_nunique < 2:
+        if eval_y_nunique < 2:
             raise ValidationError(self.bundle.get("y_is_constant_eval_set"))

-        if
+        if self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
             raise ValidationError(self.bundle.get("binary_target_eval_unique_count_not_2").format(eval_y_nunique))

-
-
-
-
-            raise ValidationError(self.bundle.get("eval_x_has_train_samples"))
+        # Check for duplicates between train and eval sets by comparing all values
+        train_eval_intersection = pd.merge(X, validated_eval_X, how="inner")
+        if len(train_eval_intersection) > 0:
+            raise ValidationError(self.bundle.get("eval_x_has_train_samples"))

         return validated_eval_X, validated_eval_y

@@ -3821,12 +3527,10 @@ if response.status_code == 200:
         if isinstance(eval_set, tuple):
             eval_set = [eval_set]
         for eval in eval_set:
-
-
-
-
-            if eval[0][self.baseline_score_column].isna().any():
-                raise ValidationError(self.bundle.get("baseline_score_column_has_na"))
+            if self.baseline_score_column not in eval[0].columns:
+                raise ValidationError(self.bundle.get("baseline_score_column_not_exists"))
+            if eval[0][self.baseline_score_column].isna().any():
+                raise ValidationError(self.bundle.get("baseline_score_column_has_na"))

     @staticmethod
     def _sample_X_and_y(X: pd.DataFrame, y: pd.Series, enriched_X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
@@ -4000,7 +3704,7 @@ if response.status_code == 200:
         return df

     def _add_current_date_as_key(
-        self, df: pd.DataFrame, search_keys: Dict[str, SearchKey],
+        self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
     ) -> pd.DataFrame:
         if (
             set(search_keys.values()) == {SearchKey.PHONE}
@@ -4008,8 +3712,7 @@ if response.status_code == 200:
             or set(search_keys.values()) == {SearchKey.HEM}
             or set(search_keys.values()) == {SearchKey.COUNTRY, SearchKey.POSTAL_CODE}
         ):
-
-            self.__log_warning(bundle.get("current_date_added"))
+            self.__log_warning(bundle.get("current_date_added"))
             df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
             search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
             converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, generate_cyclical_features=False)
@@ -4148,7 +3851,7 @@ if response.status_code == 200:
         columns_to_sort = [date_column] if date_column is not None else []

         do_sorting = True
-        if self.id_columns and self.cv
+        if self.id_columns and self.cv.is_time_series():
             # Check duplicates by date and id_columns
             reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
             renamed_id_columns = [reversed_columns_renaming.get(c, c) for c in self.id_columns]
@@ -4344,11 +4047,7 @@ if response.status_code == 200:
         return [f.name for f in features_meta if f.type == "categorical" and f.name not in (self.id_columns or [])]

     def __prepare_feature_importances(
-        self,
-        trace_id: str,
-        clients_features_df: pd.DataFrame,
-        updated_shaps: Optional[Dict[str, float]] = None,
-        silent=False,
+        self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
     ):
         if self._search_task is None:
             raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
@@ -4361,12 +4060,11 @@ if response.status_code == 200:
         features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)

         # To be sure that names with hash suffixes
-
+        df = df.rename(columns=original_names_dict)

         self.feature_names_ = []
         self.external_source_feature_names = []
         self.zero_shap_client_features = []
-        self.unstable_client_features = []
         self.feature_importances_ = []
         features_info = []
         features_info_without_links = []
@@ -4375,10 +4073,10 @@ if response.status_code == 200:
         original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}

         for feature_meta in features_meta:
-
-
+            if feature_meta.name in original_names_dict.keys():
+                feature_meta.name = original_names_dict[feature_meta.name]

-            is_client_feature =
+            is_client_feature = original_names_dict.get(feature_meta.name, feature_meta.name) in df.columns

             # Show and update shap values for client features only if select_features is True
             if updated_shaps is not None and (not is_client_feature or self.fit_select_features):
@@ -4395,21 +4093,12 @@ if response.status_code == 200:

         for feature_meta in features_meta:
             original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
-            is_client_feature = original_name in
+            is_client_feature = original_name in df.columns

             if not is_client_feature:
                 self.external_source_feature_names.append(original_name)

-            if self.psi_values is not None:
-                if original_name in self.psi_values:
-                    feature_meta.psi_value = self.psi_values[original_name]
-                else:
-                    if is_client_feature and self.fit_select_features:
-                        self.unstable_client_features.append(original_name)
-                    continue
-
             # TODO make a decision about selected features based on special flag from mlb
-
             if original_shaps.get(feature_meta.name, 0.0) == 0.0:
                 if is_client_feature and self.fit_select_features:
                     self.zero_shap_client_features.append(original_name)
@@ -4433,7 +4122,7 @@ if response.status_code == 200:
             self.feature_names_.append(feature_meta.name)
             self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))

-            df_for_sample = features_df if feature_meta.name in features_df.columns else
+            df_for_sample = features_df if feature_meta.name in features_df.columns else df
             feature_info = FeatureInfo.from_metadata(feature_meta, df_for_sample, is_client_feature)
             features_info.append(feature_info.to_row(self.bundle))
             features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
@@ -4441,8 +4130,6 @@ if response.status_code == 200:

         if len(features_info) > 0:
             self.features_info = pd.DataFrame(features_info)
-            if self.features_info[self.bundle.get("features_info_psi")].isna().all():
-                self.features_info.drop(columns=[self.bundle.get("features_info_psi")], inplace=True)
             self._features_info_without_links = pd.DataFrame(features_info_without_links)
             self._internal_features_info = pd.DataFrame(internal_features_info)
             if not silent: