upgini 1.2.80__py3-none-any.whl → 1.2.81__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/autofe/binary.py +2 -2
- upgini/autofe/timeseries/volatility.py +6 -4
- upgini/features_enricher.py +155 -91
- upgini/http.py +21 -21
- upgini/mdc/__init__.py +1 -1
- upgini/metadata.py +1 -1
- upgini/metrics.py +289 -228
- upgini/resource_bundle/strings.properties +1 -1
- upgini/search_task.py +1 -0
- upgini/utils/display_utils.py +12 -7
- upgini/utils/target_utils.py +9 -6
- {upgini-1.2.80.dist-info → upgini-1.2.81.dist-info}/METADATA +3 -1
- {upgini-1.2.80.dist-info → upgini-1.2.81.dist-info}/RECORD +16 -16
- {upgini-1.2.80.dist-info → upgini-1.2.81.dist-info}/WHEEL +0 -0
- {upgini-1.2.80.dist-info → upgini-1.2.81.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.80"
|
1
|
+
__version__ = "1.2.81"
|
upgini/autofe/binary.py
CHANGED
@@ -146,8 +146,8 @@ class Distance(PandasOperator):
|
|
146
146
|
|
147
147
|
# row-wise dot product, handling None values
|
148
148
|
def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
149
|
-
left = left.apply(lambda x: np.array(x))
|
150
|
-
right = right.apply(lambda x: np.array(x))
|
149
|
+
left = left.apply(lambda x: np.array(x).astype(np.float64))
|
150
|
+
right = right.apply(lambda x: np.array(x).astype(np.float64))
|
151
151
|
res = (left.dropna() * right.dropna()).apply(np.sum)
|
152
152
|
res = res.reindex(left.index.union(right.index))
|
153
153
|
return res
|
upgini/autofe/timeseries/volatility.py
CHANGED
@@ -60,12 +60,14 @@ class EWMAVolatility(VolatilityBase, ParametrizedOperator):
|
|
60
60
|
return res
|
61
61
|
|
62
62
|
def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
|
63
|
-
return ts.apply(self._ewma_vol)
|
63
|
+
return ts.apply(self._ewma_vol).iloc[:, [-1]]
|
64
64
|
|
65
65
|
def _ewma_vol(self, x):
|
66
|
-
|
67
|
-
|
68
|
-
|
66
|
+
return_series = isinstance(x, pd.Series)
|
67
|
+
x = pd.DataFrame(x)
|
68
|
+
returns = self._get_returns(x.iloc[:, -1], f"{self.step_size}{self.step_unit}")
|
69
|
+
x.iloc[:, -1] = returns.ewm(span=self.window_size).std()
|
70
|
+
return x.iloc[:, -1] if return_series else x
|
69
71
|
|
70
72
|
|
71
73
|
class RollingVolBase(VolatilityBase):
|
upgini/features_enricher.py
CHANGED
@@ -63,7 +63,7 @@ from upgini.metadata import (
|
|
63
63
|
RuntimeParameters,
|
64
64
|
SearchKey,
|
65
65
|
)
|
66
|
-
from upgini.metrics import EstimatorWrapper, validate_scoring_argument
|
66
|
+
from upgini.metrics import EstimatorWrapper, define_scorer, validate_scoring_argument
|
67
67
|
from upgini.normalizer.normalize_utils import Normalizer
|
68
68
|
from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
|
69
69
|
from upgini.search_task import SearchTask
|
@@ -310,6 +310,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
310
310
|
self._search_task = search_task.poll_result(trace_id, quiet=True, check_fit=True)
|
311
311
|
file_metadata = self._search_task.get_file_metadata(trace_id)
|
312
312
|
x_columns = [c.originalName or c.name for c in file_metadata.columns]
|
313
|
+
self.fit_columns_renaming = {c.name: c.originalName for c in file_metadata.columns}
|
313
314
|
df = pd.DataFrame(columns=x_columns)
|
314
315
|
self.__prepare_feature_importances(trace_id, df, silent=True)
|
315
316
|
# TODO validate search_keys with search_keys from file_metadata
|
@@ -452,6 +453,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
452
453
|
"""
|
453
454
|
trace_id = str(uuid.uuid4())
|
454
455
|
start_time = time.time()
|
456
|
+
auto_fe_parameters = AutoFEParameters() if auto_fe_parameters is None else auto_fe_parameters
|
455
457
|
search_progress = SearchProgress(0.0, ProgressStage.START_FIT)
|
456
458
|
if progress_callback is not None:
|
457
459
|
progress_callback(search_progress)
|
@@ -476,7 +478,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
476
478
|
self.__validate_search_keys(self.search_keys)
|
477
479
|
|
478
480
|
# Validate client estimator params
|
479
|
-
self.
|
481
|
+
self._get_and_validate_client_cat_features(estimator, X, self.search_keys)
|
480
482
|
|
481
483
|
try:
|
482
484
|
self.X = X
|
@@ -606,6 +608,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
606
608
|
"""
|
607
609
|
|
608
610
|
self.warning_counter.reset()
|
611
|
+
auto_fe_parameters = AutoFEParameters() if auto_fe_parameters is None else auto_fe_parameters
|
609
612
|
trace_id = str(uuid.uuid4())
|
610
613
|
start_time = time.time()
|
611
614
|
with MDC(trace_id=trace_id):
|
@@ -957,9 +960,17 @@ class FeaturesEnricher(TransformerMixin):
|
|
957
960
|
self.__display_support_link(msg)
|
958
961
|
return None
|
959
962
|
|
960
|
-
|
963
|
+
cat_features_from_backend = self.__get_categorical_features()
|
964
|
+
client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
|
961
965
|
estimator, validated_X, self.search_keys
|
962
966
|
)
|
967
|
+
for cat_feature in cat_features_from_backend:
|
968
|
+
original_cat_feature = self.fit_columns_renaming.get(cat_feature)
|
969
|
+
if original_cat_feature in self.search_keys:
|
970
|
+
if self.search_keys[original_cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
|
971
|
+
search_keys_for_metrics.append(original_cat_feature)
|
972
|
+
else:
|
973
|
+
self.logger.warning(self.bundle.get("cat_feature_search_key").format(original_cat_feature))
|
963
974
|
search_keys_for_metrics.extend([c for c in self.id_columns or [] if c not in search_keys_for_metrics])
|
964
975
|
self.logger.info(f"Search keys for metrics: {search_keys_for_metrics}")
|
965
976
|
|
@@ -976,7 +987,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
976
987
|
search_keys_for_metrics=search_keys_for_metrics,
|
977
988
|
progress_bar=progress_bar,
|
978
989
|
progress_callback=progress_callback,
|
979
|
-
|
990
|
+
client_cat_features=client_cat_features,
|
980
991
|
)
|
981
992
|
if prepared_data is None:
|
982
993
|
return None
|
@@ -994,11 +1005,25 @@ class FeaturesEnricher(TransformerMixin):
|
|
994
1005
|
) = prepared_data
|
995
1006
|
|
996
1007
|
# rename cat_features
|
997
|
-
if
|
1008
|
+
if client_cat_features:
|
998
1009
|
for new_c, old_c in columns_renaming.items():
|
999
|
-
if old_c in
|
1000
|
-
|
1001
|
-
|
1010
|
+
if old_c in client_cat_features:
|
1011
|
+
client_cat_features.remove(old_c)
|
1012
|
+
client_cat_features.append(new_c)
|
1013
|
+
for cat_feature in client_cat_features:
|
1014
|
+
if cat_feature not in fitting_X.columns:
|
1015
|
+
self.logger.error(
|
1016
|
+
f"Client cat_feature `{cat_feature}` not found in"
|
1017
|
+
f" x columns: {fitting_X.columns.to_list()}"
|
1018
|
+
)
|
1019
|
+
else:
|
1020
|
+
client_cat_features = []
|
1021
|
+
|
1022
|
+
# rename baseline_score_column
|
1023
|
+
reversed_renaming = {v: k for k, v in columns_renaming.items()}
|
1024
|
+
baseline_score_column = self.baseline_score_column
|
1025
|
+
if baseline_score_column is not None:
|
1026
|
+
baseline_score_column = reversed_renaming[baseline_score_column]
|
1002
1027
|
|
1003
1028
|
gc.collect()
|
1004
1029
|
|
@@ -1006,12 +1031,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
1006
1031
|
self.__log_warning(self.bundle.get("metrics_no_important_free_features"))
|
1007
1032
|
return None
|
1008
1033
|
|
1009
|
-
|
1010
|
-
text_features
|
1011
|
-
|
1012
|
-
|
1013
|
-
|
1014
|
-
|
1034
|
+
text_features = self.generate_features.copy() if self.generate_features else None
|
1035
|
+
if text_features:
|
1036
|
+
for renamed, original in columns_renaming.items():
|
1037
|
+
if original in text_features:
|
1038
|
+
text_features.remove(original)
|
1039
|
+
text_features.append(renamed)
|
1015
1040
|
|
1016
1041
|
print(self.bundle.get("metrics_start"))
|
1017
1042
|
with Spinner():
|
@@ -1019,25 +1044,20 @@ class FeaturesEnricher(TransformerMixin):
|
|
1019
1044
|
|
1020
1045
|
has_date = self._get_date_column(search_keys) is not None
|
1021
1046
|
model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
|
1047
|
+
cat_features = list(set(client_cat_features + cat_features_from_backend))
|
1048
|
+
baseline_cat_features = [f for f in cat_features if f in fitting_X.columns]
|
1049
|
+
enriched_cat_features = [f for f in cat_features if f in fitting_enriched_X.columns]
|
1050
|
+
if len(enriched_cat_features) < len(cat_features):
|
1051
|
+
missing_cat_features = [f for f in cat_features if f not in fitting_enriched_X.columns]
|
1052
|
+
self.logger.warning(f"Some cat_features were not found in enriched_X: {missing_cat_features}")
|
1022
1053
|
|
1023
|
-
|
1024
|
-
estimator,
|
1025
|
-
self.logger,
|
1026
|
-
model_task_type,
|
1027
|
-
_cv,
|
1028
|
-
fitting_enriched_X,
|
1029
|
-
scoring,
|
1030
|
-
groups=groups,
|
1031
|
-
text_features=text_features,
|
1032
|
-
has_date=has_date,
|
1033
|
-
)
|
1034
|
-
metric = wrapper.metric_name
|
1035
|
-
multiplier = wrapper.multiplier
|
1054
|
+
_, metric, multiplier = define_scorer(model_task_type, scoring)
|
1036
1055
|
|
1037
1056
|
# 1 If client features are presented - fit and predict with KFold estimator
|
1038
1057
|
# on etalon features and calculate baseline metric
|
1039
|
-
|
1058
|
+
baseline_metric = None
|
1040
1059
|
baseline_estimator = None
|
1060
|
+
updating_shaps = None
|
1041
1061
|
custom_loss_add_params = get_additional_params_custom_loss(
|
1042
1062
|
self.loss, model_task_type, logger=self.logger
|
1043
1063
|
)
|
@@ -1050,25 +1070,25 @@ class FeaturesEnricher(TransformerMixin):
|
|
1050
1070
|
self.logger,
|
1051
1071
|
model_task_type,
|
1052
1072
|
_cv,
|
1053
|
-
|
1054
|
-
|
1055
|
-
cat_features,
|
1073
|
+
scoring=scoring,
|
1074
|
+
cat_features=baseline_cat_features,
|
1056
1075
|
add_params=custom_loss_add_params,
|
1057
1076
|
groups=groups,
|
1058
1077
|
text_features=text_features,
|
1059
1078
|
has_date=has_date,
|
1060
1079
|
)
|
1061
|
-
|
1062
|
-
fitting_X, y_sorted,
|
1080
|
+
baseline_cv_result = baseline_estimator.cross_val_predict(
|
1081
|
+
fitting_X, y_sorted, baseline_score_column
|
1063
1082
|
)
|
1064
|
-
|
1065
|
-
if
|
1083
|
+
baseline_metric = baseline_cv_result.get_display_metric()
|
1084
|
+
if baseline_metric is None:
|
1066
1085
|
self.logger.info(
|
1067
1086
|
f"Baseline {metric} on train client features is None (maybe all features was removed)"
|
1068
1087
|
)
|
1069
1088
|
baseline_estimator = None
|
1070
1089
|
else:
|
1071
|
-
self.logger.info(f"Baseline {metric} on train client features: {
|
1090
|
+
self.logger.info(f"Baseline {metric} on train client features: {baseline_metric}")
|
1091
|
+
updating_shaps = baseline_cv_result.shap_values
|
1072
1092
|
|
1073
1093
|
# 2 Fit and predict with KFold estimator on enriched tds
|
1074
1094
|
# and calculate final metric (and uplift)
|
@@ -1085,9 +1105,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
1085
1105
|
self.logger,
|
1086
1106
|
model_task_type,
|
1087
1107
|
_cv,
|
1088
|
-
|
1089
|
-
|
1090
|
-
cat_features,
|
1108
|
+
scoring=scoring,
|
1109
|
+
cat_features=enriched_cat_features,
|
1091
1110
|
add_params=custom_loss_add_params,
|
1092
1111
|
groups=groups,
|
1093
1112
|
text_features=text_features,
|
@@ -1095,10 +1114,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1095
1114
|
)
|
1096
1115
|
enriched_cv_result = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
|
1097
1116
|
enriched_metric = enriched_cv_result.get_display_metric()
|
1098
|
-
|
1099
|
-
|
1100
|
-
if enriched_shaps is not None:
|
1101
|
-
self._update_shap_values(trace_id, fitting_X, enriched_shaps, silent=not internal_call)
|
1117
|
+
updating_shaps = enriched_cv_result.shap_values
|
1102
1118
|
|
1103
1119
|
if enriched_metric is None:
|
1104
1120
|
self.logger.warning(
|
@@ -1107,8 +1123,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
1107
1123
|
enriched_estimator = None
|
1108
1124
|
else:
|
1109
1125
|
self.logger.info(f"Enriched {metric} on train combined features: {enriched_metric}")
|
1110
|
-
if
|
1111
|
-
uplift = (enriched_cv_result.metric -
|
1126
|
+
if baseline_metric is not None and enriched_metric is not None:
|
1127
|
+
uplift = (enriched_cv_result.metric - baseline_cv_result.metric) * multiplier
|
1112
1128
|
|
1113
1129
|
train_metrics = {
|
1114
1130
|
self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
|
@@ -1126,8 +1142,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
1126
1142
|
np.mean(y_sorted),
|
1127
1143
|
4,
|
1128
1144
|
)
|
1129
|
-
if
|
1130
|
-
train_metrics[self.bundle.get("quality_metrics_baseline_header").format(metric)] =
|
1145
|
+
if baseline_metric is not None:
|
1146
|
+
train_metrics[self.bundle.get("quality_metrics_baseline_header").format(metric)] = (
|
1147
|
+
baseline_metric
|
1148
|
+
)
|
1131
1149
|
if enriched_metric is not None:
|
1132
1150
|
train_metrics[self.bundle.get("quality_metrics_enriched_header").format(metric)] = (
|
1133
1151
|
enriched_metric
|
@@ -1156,7 +1174,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1156
1174
|
f"on client features: {eval_X_sorted.columns.to_list()}"
|
1157
1175
|
)
|
1158
1176
|
etalon_eval_results = baseline_estimator.calculate_metric(
|
1159
|
-
eval_X_sorted, eval_y_sorted,
|
1177
|
+
eval_X_sorted, eval_y_sorted, baseline_score_column
|
1160
1178
|
)
|
1161
1179
|
etalon_eval_metric = etalon_eval_results.get_display_metric()
|
1162
1180
|
self.logger.info(
|
@@ -1218,6 +1236,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
1218
1236
|
|
1219
1237
|
metrics.append(eval_metrics)
|
1220
1238
|
|
1239
|
+
if updating_shaps is not None:
|
1240
|
+
self._update_shap_values(trace_id, fitting_X, updating_shaps, silent=not internal_call)
|
1241
|
+
|
1221
1242
|
metrics_df = pd.DataFrame(metrics)
|
1222
1243
|
mean_target_hdr = self.bundle.get("quality_metrics_mean_target_header")
|
1223
1244
|
if mean_target_hdr in metrics_df.columns:
|
@@ -1268,6 +1289,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1268
1289
|
|
1269
1290
|
def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float], silent: bool = False):
|
1270
1291
|
renaming = self.fit_columns_renaming or {}
|
1292
|
+
self.logger.info(f"Updating SHAP values: {new_shaps}")
|
1271
1293
|
new_shaps = {
|
1272
1294
|
renaming.get(feature, feature): _round_shap_value(shap)
|
1273
1295
|
for feature, shap in new_shaps.items()
|
@@ -1420,7 +1442,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1420
1442
|
|
1421
1443
|
return _cv, groups
|
1422
1444
|
|
1423
|
-
def
|
1445
|
+
def _get_and_validate_client_cat_features(
|
1424
1446
|
self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
1425
1447
|
) -> Tuple[Optional[List[str]], List[str]]:
|
1426
1448
|
cat_features = None
|
@@ -1428,12 +1450,20 @@ class FeaturesEnricher(TransformerMixin):
|
|
1428
1450
|
if (
|
1429
1451
|
estimator is not None
|
1430
1452
|
and hasattr(estimator, "get_param")
|
1453
|
+
and hasattr(estimator, "_init_params")
|
1431
1454
|
and estimator.get_param("cat_features") is not None
|
1432
1455
|
):
|
1433
|
-
|
1434
|
-
if
|
1435
|
-
|
1436
|
-
|
1456
|
+
estimator_cat_features = estimator.get_param("cat_features")
|
1457
|
+
if all([isinstance(c, int) for c in estimator_cat_features]):
|
1458
|
+
cat_features = [X.columns[idx] for idx in estimator_cat_features]
|
1459
|
+
elif all([isinstance(c, str) for c in estimator_cat_features]):
|
1460
|
+
cat_features = estimator_cat_features
|
1461
|
+
else:
|
1462
|
+
print(f"WARNING: Unsupported type of cat_features in CatBoost estimator: {estimator_cat_features}")
|
1463
|
+
|
1464
|
+
del estimator._init_params["cat_features"]
|
1465
|
+
|
1466
|
+
if cat_features:
|
1437
1467
|
self.logger.info(f"Collected categorical features {cat_features} from user estimator")
|
1438
1468
|
for cat_feature in cat_features:
|
1439
1469
|
if cat_feature in search_keys:
|
@@ -1457,7 +1487,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1457
1487
|
search_keys_for_metrics: Optional[List[str]] = None,
|
1458
1488
|
progress_bar: Optional[ProgressBar] = None,
|
1459
1489
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
1460
|
-
|
1490
|
+
client_cat_features: Optional[List[str]] = None,
|
1461
1491
|
):
|
1462
1492
|
is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
|
1463
1493
|
is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
|
@@ -1531,7 +1561,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1531
1561
|
|
1532
1562
|
# Detect and drop high cardinality columns in train
|
1533
1563
|
columns_with_high_cardinality = FeaturesValidator.find_high_cardinality(fitting_X)
|
1534
|
-
non_excluding_columns = (self.generate_features or []) + (
|
1564
|
+
non_excluding_columns = (self.generate_features or []) + (client_cat_features or [])
|
1535
1565
|
columns_with_high_cardinality = [c for c in columns_with_high_cardinality if c not in non_excluding_columns]
|
1536
1566
|
if len(columns_with_high_cardinality) > 0:
|
1537
1567
|
self.logger.warning(
|
@@ -1751,10 +1781,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
1751
1781
|
df = generator.generate(df)
|
1752
1782
|
generated_features.extend(generator.generated_features)
|
1753
1783
|
|
1754
|
-
|
1755
|
-
|
1756
|
-
|
1757
|
-
columns_renaming = {c: c for c in df.columns}
|
1784
|
+
normalizer = Normalizer(self.bundle, self.logger)
|
1785
|
+
df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
|
1786
|
+
columns_renaming = normalizer.columns_renaming
|
1787
|
+
# columns_renaming = {c: c for c in df.columns}
|
1758
1788
|
|
1759
1789
|
df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
|
1760
1790
|
|
@@ -1886,6 +1916,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
1886
1916
|
enriched_eval_X = enriched_eval_sets[idx + 1][enriched_X_columns].copy()
|
1887
1917
|
eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
|
1888
1918
|
|
1919
|
+
reversed_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
|
1920
|
+
X_sampled.rename(columns=reversed_renaming, inplace=True)
|
1921
|
+
enriched_X.rename(columns=reversed_renaming, inplace=True)
|
1922
|
+
for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
|
1923
|
+
eval_X_sampled.rename(columns=reversed_renaming, inplace=True)
|
1924
|
+
enriched_eval_X.rename(columns=reversed_renaming, inplace=True)
|
1925
|
+
|
1889
1926
|
datasets_hash = hash_input(self.X, self.y, self.eval_set)
|
1890
1927
|
return self.__cache_and_return_results(
|
1891
1928
|
datasets_hash,
|
@@ -1942,6 +1979,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
1942
1979
|
enriched_df, x_columns, enriched_X.columns.tolist(), len(eval_set) if has_eval_set else 0
|
1943
1980
|
)
|
1944
1981
|
|
1982
|
+
# Add hash-suffixes because output of transform has original names
|
1983
|
+
reversed_renaming = {v: k for k, v in columns_renaming.items()}
|
1984
|
+
X_sampled.rename(columns=reversed_renaming, inplace=True)
|
1985
|
+
enriched_X.rename(columns=reversed_renaming, inplace=True)
|
1986
|
+
for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
|
1987
|
+
eval_X_sampled.rename(columns=reversed_renaming, inplace=True)
|
1988
|
+
enriched_eval_X.rename(columns=reversed_renaming, inplace=True)
|
1989
|
+
|
1945
1990
|
# Cache and return results
|
1946
1991
|
datasets_hash = hash_input(validated_X, validated_y, eval_set)
|
1947
1992
|
return self.__cache_and_return_results(
|
@@ -2069,10 +2114,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
2069
2114
|
search_keys: Dict,
|
2070
2115
|
columns_renaming: Dict[str, str],
|
2071
2116
|
):
|
2117
|
+
# X_sampled - with hash-suffixes
|
2118
|
+
reversed_renaming = {v: k for k, v in columns_renaming.items()}
|
2072
2119
|
search_keys = {
|
2073
|
-
|
2120
|
+
reversed_renaming.get(k, k): v
|
2074
2121
|
for k, v in search_keys.items()
|
2075
|
-
if
|
2122
|
+
if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
|
2076
2123
|
}
|
2077
2124
|
return FeaturesEnricher._SampledDataForMetrics(
|
2078
2125
|
X_sampled=X_sampled,
|
@@ -2334,6 +2381,25 @@ if response.status_code == 200:
|
|
2334
2381
|
df[columns_for_system_record_id], index=False
|
2335
2382
|
).astype("float64")
|
2336
2383
|
|
2384
|
+
features_not_to_pass = []
|
2385
|
+
if add_fit_system_record_id:
|
2386
|
+
df = self.__add_fit_system_record_id(
|
2387
|
+
df,
|
2388
|
+
search_keys,
|
2389
|
+
SYSTEM_RECORD_ID,
|
2390
|
+
TARGET,
|
2391
|
+
columns_renaming,
|
2392
|
+
silent=True,
|
2393
|
+
)
|
2394
|
+
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
2395
|
+
features_not_to_pass.append(SORT_ID)
|
2396
|
+
|
2397
|
+
system_columns_with_original_index = [ENTITY_SYSTEM_RECORD_ID] + generated_features
|
2398
|
+
if add_fit_system_record_id:
|
2399
|
+
system_columns_with_original_index.append(SORT_ID)
|
2400
|
+
|
2401
|
+
df_before_explode = df[system_columns_with_original_index].copy()
|
2402
|
+
|
2337
2403
|
# Explode multiple search keys
|
2338
2404
|
df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys, columns_renaming)
|
2339
2405
|
|
@@ -2381,25 +2447,13 @@ if response.status_code == 200:
|
|
2381
2447
|
meaning_types.update({col: FileColumnMeaningType.FEATURE for col in features_for_transform})
|
2382
2448
|
meaning_types.update({col: key.value for col, key in search_keys.items()})
|
2383
2449
|
|
2384
|
-
features_not_to_pass = [
|
2450
|
+
features_not_to_pass.extend([
|
2385
2451
|
c
|
2386
2452
|
for c in df.columns
|
2387
2453
|
if c not in search_keys.keys()
|
2388
2454
|
and c not in features_for_transform
|
2389
2455
|
and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
|
2390
|
-
]
|
2391
|
-
|
2392
|
-
if add_fit_system_record_id:
|
2393
|
-
df = self.__add_fit_system_record_id(
|
2394
|
-
df,
|
2395
|
-
search_keys,
|
2396
|
-
SYSTEM_RECORD_ID,
|
2397
|
-
TARGET,
|
2398
|
-
columns_renaming,
|
2399
|
-
silent=True,
|
2400
|
-
)
|
2401
|
-
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
2402
|
-
features_not_to_pass.append(SORT_ID)
|
2456
|
+
])
|
2403
2457
|
|
2404
2458
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
2405
2459
|
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
@@ -2415,10 +2469,6 @@ if response.status_code == 200:
|
|
2415
2469
|
meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
|
2416
2470
|
|
2417
2471
|
df = df.reset_index(drop=True)
|
2418
|
-
system_columns_with_original_index = [SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID] + generated_features
|
2419
|
-
if add_fit_system_record_id:
|
2420
|
-
system_columns_with_original_index.append(SORT_ID)
|
2421
|
-
df_with_original_index = df[system_columns_with_original_index].copy()
|
2422
2472
|
|
2423
2473
|
combined_search_keys = combine_search_keys(search_keys.keys())
|
2424
2474
|
|
@@ -2526,7 +2576,7 @@ if response.status_code == 200:
|
|
2526
2576
|
combined_df = pd.concat(
|
2527
2577
|
[
|
2528
2578
|
validated_Xy.reset_index(drop=True),
|
2529
|
-
|
2579
|
+
df_before_explode.reset_index(drop=True),
|
2530
2580
|
],
|
2531
2581
|
axis=1,
|
2532
2582
|
).set_index(validated_Xy.index)
|
@@ -2683,7 +2733,7 @@ if response.status_code == 200:
|
|
2683
2733
|
importance_threshold: Optional[float],
|
2684
2734
|
max_features: Optional[int],
|
2685
2735
|
remove_outliers_calc_metrics: Optional[bool],
|
2686
|
-
auto_fe_parameters:
|
2736
|
+
auto_fe_parameters: AutoFEParameters,
|
2687
2737
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
2688
2738
|
search_id_callback: Optional[Callable[[str], Any]] = None,
|
2689
2739
|
):
|
@@ -2729,7 +2779,9 @@ if response.status_code == 200:
|
|
2729
2779
|
if self.id_columns is not None:
|
2730
2780
|
for id_column in self.id_columns:
|
2731
2781
|
if id_column not in validated_X.columns:
|
2732
|
-
raise ValidationError(
|
2782
|
+
raise ValidationError(
|
2783
|
+
self.bundle.get("missing_id_column").format(id_column, list(validated_X.columns))
|
2784
|
+
)
|
2733
2785
|
|
2734
2786
|
validate_scoring_argument(scoring)
|
2735
2787
|
|
@@ -3071,7 +3123,7 @@ if response.status_code == 200:
|
|
3071
3123
|
self.__show_selected_features(self.fit_search_keys)
|
3072
3124
|
|
3073
3125
|
autofe_description = self.get_autofe_features_description()
|
3074
|
-
if autofe_description is not None:
|
3126
|
+
if autofe_description is not None and len(autofe_description) > 0:
|
3075
3127
|
self.logger.info(f"AutoFE descriptions: {autofe_description}")
|
3076
3128
|
self.autofe_features_display_handle = display_html_dataframe(
|
3077
3129
|
df=autofe_description,
|
@@ -3855,6 +3907,13 @@ if response.status_code == 200:
|
|
3855
3907
|
|
3856
3908
|
return importances
|
3857
3909
|
|
3910
|
+
def __get_categorical_features(self) -> List[str]:
|
3911
|
+
features_meta = self._search_task.get_all_features_metadata_v2()
|
3912
|
+
if features_meta is None:
|
3913
|
+
raise Exception(self.bundle.get("missing_features_meta"))
|
3914
|
+
|
3915
|
+
return [f.name for f in features_meta if f.type == "categorical"]
|
3916
|
+
|
3858
3917
|
def __prepare_feature_importances(
|
3859
3918
|
self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
|
3860
3919
|
):
|
@@ -3883,7 +3942,10 @@ if response.status_code == 200:
|
|
3883
3942
|
if feature_meta.name in original_names_dict.keys():
|
3884
3943
|
feature_meta.name = original_names_dict[feature_meta.name]
|
3885
3944
|
|
3886
|
-
|
3945
|
+
is_client_feature = feature_meta.name in df.columns
|
3946
|
+
|
3947
|
+
# Show and update shap values for client features only if select_features is True
|
3948
|
+
if updated_shaps is not None and (not is_client_feature or self.fit_select_features):
|
3887
3949
|
updating_shap = updated_shaps.get(feature_meta.name)
|
3888
3950
|
if updating_shap is None:
|
3889
3951
|
if feature_meta.shap_value != 0.0:
|
@@ -3906,6 +3968,7 @@ if response.status_code == 200:
|
|
3906
3968
|
continue
|
3907
3969
|
|
3908
3970
|
# Use only important features
|
3971
|
+
# If select_features is False, we don't show etalon features in the report
|
3909
3972
|
if (
|
3910
3973
|
# feature_meta.name in self.fit_generated_features or
|
3911
3974
|
feature_meta.name == COUNTRY # constant synthetic column
|
@@ -4216,7 +4279,7 @@ if response.status_code == 200:
|
|
4216
4279
|
def __show_selected_features(self, search_keys: Dict[str, SearchKey]):
|
4217
4280
|
search_key_names = [col for col, tpe in search_keys.items() if tpe != SearchKey.CUSTOM_KEY]
|
4218
4281
|
if self.fit_columns_renaming:
|
4219
|
-
search_key_names = [self.fit_columns_renaming.get(col, col) for col in search_key_names]
|
4282
|
+
search_key_names = sorted(set([self.fit_columns_renaming.get(col, col) for col in search_key_names]))
|
4220
4283
|
msg = self.bundle.get("features_info_header").format(len(self.feature_names_), search_key_names)
|
4221
4284
|
|
4222
4285
|
try:
|
@@ -4232,12 +4295,13 @@ if response.status_code == 200:
|
|
4232
4295
|
display_id=f"features_info_{uuid.uuid4()}",
|
4233
4296
|
)
|
4234
4297
|
|
4235
|
-
self.
|
4236
|
-
self.
|
4237
|
-
|
4238
|
-
|
4239
|
-
|
4240
|
-
|
4298
|
+
if len(self.relevant_data_sources) > 0:
|
4299
|
+
self.data_sources_display_handle = display_html_dataframe(
|
4300
|
+
self.relevant_data_sources,
|
4301
|
+
self._relevant_data_sources_wo_links,
|
4302
|
+
self.bundle.get("relevant_data_sources_header"),
|
4303
|
+
display_id=f"data_sources_{uuid.uuid4()}",
|
4304
|
+
)
|
4241
4305
|
else:
|
4242
4306
|
msg = self.bundle.get("features_info_zero_important_features")
|
4243
4307
|
self.__log_warning(msg, show_support_link=True)
|