upgini 1.1.222a1__py3-none-any.whl → 1.1.224__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/dataset.py +5 -1
- upgini/features_enricher.py +416 -220
- upgini/metadata.py +14 -0
- upgini/metrics.py +45 -23
- upgini/resource_bundle/strings.properties +4 -2
- upgini/search_task.py +9 -0
- upgini/utils/cv_utils.py +9 -11
- upgini/utils/display_utils.py +21 -2
- {upgini-1.1.222a1.dist-info → upgini-1.1.224.dist-info}/METADATA +18 -16
- {upgini-1.1.222a1.dist-info → upgini-1.1.224.dist-info}/RECORD +13 -14
- {upgini-1.1.222a1.dist-info → upgini-1.1.224.dist-info}/WHEEL +1 -1
- upgini/fingerprint.js +0 -8
- {upgini-1.1.222a1.dist-info → upgini-1.1.224.dist-info}/LICENSE +0 -0
- {upgini-1.1.222a1.dist-info → upgini-1.1.224.dist-info}/top_level.txt +0 -0
upgini/features_enricher.py
CHANGED
@@ -5,10 +5,13 @@ import logging
 import numbers
 import os
 import pickle
+import re
 import sys
 import tempfile
 import time
 import uuid
+from collections import namedtuple
+from functools import reduce
 from threading import Thread
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
 
@@ -175,6 +178,7 @@ class FeaturesEnricher(TransformerMixin):
         logs_enabled: bool = True,
         raise_validation_error: bool = True,
         exclude_columns: Optional[List[str]] = None,
+        baseline_score_column: Optional[Any] = None,
         client_ip: Optional[str] = None,
         **kwargs,
     ):
@@ -279,6 +283,7 @@ class FeaturesEnricher(TransformerMixin):
 
         self.raise_validation_error = raise_validation_error
         self.exclude_columns = exclude_columns
+        self.baseline_score_column = baseline_score_column
 
     def _get_api_key(self):
         return self._api_key
@@ -290,6 +295,18 @@ class FeaturesEnricher(TransformerMixin):
 
     api_key = property(_get_api_key, _set_api_key)
 
+    @staticmethod
+    def _check_eval_set(eval_set, X):
+        checked_eval_set = []
+        if eval_set is not None and not isinstance(eval_set, list):
+            raise ValidationError(bundle.get("unsupported_type_eval_set").format(type(eval_set)))
+        for eval_pair in eval_set or []:
+            if not isinstance(eval_pair, tuple) or len(eval_pair) != 2:
+                raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
+            if not is_frames_equal(X, eval_pair[0]):
+                checked_eval_set.append(eval_pair)
+        return checked_eval_set
+
     def fit(
         self,
         X: Union[pd.DataFrame, pd.Series, np.ndarray],
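The eval_set checks that were previously duplicated across fit, fit_transform, and calculate_metrics are now centralized in this helper, which also gains a type check on eval_set itself. A standalone sketch of the behavior it enforces (simplified: ValidationError and the message bundle are stubbed, and pandas' equals stands in for upgini's is_frames_equal):

from typing import List, Tuple
import pandas as pd

class ValidationError(ValueError):
    pass

def check_eval_set(eval_set, X: pd.DataFrame) -> List[Tuple[pd.DataFrame, pd.Series]]:
    checked = []
    # New in 1.1.224: eval_set must be a list, not a bare tuple
    if eval_set is not None and not isinstance(eval_set, list):
        raise ValidationError(f"Unsupported type of eval_set: {type(eval_set)}")
    for pair in eval_set or []:
        if not isinstance(pair, tuple) or len(pair) != 2:
            raise ValidationError(f"eval_set entries must be (X, y) tuples, got: {pair!r}")
        if not X.equals(pair[0]):  # drop eval pairs that duplicate the train frame
            checked.append(pair)
    return checked

train = pd.DataFrame({"f": [1, 2]})
holdout = (pd.DataFrame({"f": [3]}), pd.Series([0]))
print(len(check_eval_set([holdout], train)))  # 1: kept, since it differs from train
# check_eval_set(holdout, train) would raise: eval_set is not a list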
@@ -366,19 +383,13 @@ class FeaturesEnricher(TransformerMixin):
         try:
             self.X = X
             self.y = y
-
-            for eval_pair in eval_set or []:
-                if len(eval_pair) != 2:
-                    raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
-                if not is_frames_equal(X, eval_pair[0]):
-                    checked_eval_set.append(eval_pair)
-            self.eval_set = checked_eval_set
+            self.eval_set = self._check_eval_set(eval_set, X)
             self.dump_input(trace_id, X, y, eval_set)
             self.__inner_fit(
                 trace_id,
                 X,
                 y,
-                checked_eval_set,
+                self.eval_set,
                 progress_bar,
                 start_time=start_time,
                 exclude_features_sources=exclude_features_sources,
@@ -508,13 +519,7 @@ class FeaturesEnricher(TransformerMixin):
         try:
             self.X = X
             self.y = y
-
-            for eval_pair in eval_set or []:
-                if len(eval_pair) != 2:
-                    raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
-                if not is_frames_equal(X, eval_pair[0]):
-                    checked_eval_set.append(eval_pair)
-            self.eval_set = checked_eval_set
+            self.eval_set = self._check_eval_set(eval_set, X)
             self.dump_input(trace_id, X, y, eval_set)
 
             if _num_samples(drop_duplicates(X)) > Dataset.MAX_ROWS:
@@ -524,7 +529,7 @@ class FeaturesEnricher(TransformerMixin):
                 trace_id,
                 X,
                 y,
-                checked_eval_set,
+                self.eval_set,
                 progress_bar,
                 start_time=start_time,
                 exclude_features_sources=exclude_features_sources,
@@ -882,6 +887,7 @@ class FeaturesEnricher(TransformerMixin):
             enriched_y_sorted,
             fitting_eval_set_dict,
             search_keys,
+            groups,
         ) = prepared_data
 
         gc.collect()
@@ -906,7 +912,13 @@ class FeaturesEnricher(TransformerMixin):
         ).get_cv()
 
         wrapper = EstimatorWrapper.create(
-            estimator,
+            estimator,
+            self.logger,
+            model_task_type,
+            _cv,
+            fitting_enriched_X,
+            scoring,
+            groups=groups,
         )
         metric = wrapper.metric_name
         multiplier = wrapper.multiplier
@@ -931,8 +943,11 @@ class FeaturesEnricher(TransformerMixin):
             scoring,
             cat_features,
             add_params=custom_loss_add_params,
+            groups=groups,
+        )
+        etalon_metric = baseline_estimator.cross_val_predict(
+            fitting_X, y_sorted, self.baseline_score_column
         )
-        etalon_metric = baseline_estimator.cross_val_predict(fitting_X, y_sorted)
         self.logger.info(f"Baseline {metric} on train client features: {etalon_metric}")
 
         # 2 Fit and predict with KFold Catboost model on enriched tds
@@ -952,6 +967,7 @@ class FeaturesEnricher(TransformerMixin):
             scoring,
             cat_features,
             add_params=custom_loss_add_params,
+            groups=groups,
         )
         enriched_metric = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
         self.logger.info(f"Enriched {metric} on train combined features: {enriched_metric}")
@@ -999,7 +1015,9 @@ class FeaturesEnricher(TransformerMixin):
                     f"Calculate baseline {metric} on eval set {idx + 1} "
                     f"on client features: {eval_X_sorted.columns.to_list()}"
                 )
-                etalon_eval_metric = baseline_estimator.calculate_metric(eval_X_sorted, eval_y_sorted)
+                etalon_eval_metric = baseline_estimator.calculate_metric(
+                    eval_X_sorted, eval_y_sorted, self.baseline_score_column
+                )
                 self.logger.info(
                     f"Baseline {metric} on eval set {idx + 1} client features: {etalon_eval_metric}"
                 )
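These metric-calculation changes thread the new baseline_score_column constructor argument through to the baseline estimator, so the "baseline" row of the metrics report can, judging from this diff, be computed from a precomputed score column instead of refitting a model on client features. A hedged usage sketch (the column and key names are illustrative, and running it requires an Upgini API key):

import pandas as pd
from upgini import FeaturesEnricher, SearchKey

X = pd.DataFrame({
    "phone": ["+15551234567", "+15557654321"],
    "baseline_pred": [0.31, 0.72],  # scores from an existing in-house model
})
y = pd.Series([0, 1])

enricher = FeaturesEnricher(
    search_keys={"phone": SearchKey.PHONE},
    baseline_score_column="baseline_pred",  # new parameter in this release
)
enricher.fit(X, y, calculate_metrics=True)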
@@ -1172,12 +1190,7 @@ class FeaturesEnricher(TransformerMixin):
         if X is None:
             return True, self.X, self.y, self.eval_set
 
-        checked_eval_set = []
-        for eval_pair in eval_set or []:
-            if len(eval_pair) != 2:
-                raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
-            if not is_frames_equal(X, eval_pair[0]):
-                checked_eval_set.append(eval_pair)
+        checked_eval_set = self._check_eval_set(eval_set, X)
 
         if (
             X is self.X
@@ -1214,181 +1227,28 @@ class FeaturesEnricher(TransformerMixin):
         progress_bar: Optional[ProgressBar] = None,
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
     ):
-        is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
         is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
+        is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
         validated_X = self._validate_X(X)
         validated_y = self._validate_y(validated_X, y)
+        validated_eval_set = (
+            [self._validate_eval_set_pair(validated_X, eval_set_pair) for eval_set_pair in eval_set]
+            if eval_set
+            else None
+        )
 
-        eval_set_sampled_dict = dict()
-
-
-
-
-
-
-
-
-
-
-
-            if eval_set is not None:
-                for idx in range(len(eval_set)):
-                    eval_X_sampled, _ = self._extend_x(eval_set[idx][0], is_demo_dataset)
-                    eval_y_sampled = eval_set[idx][1]
-                    enriched_eval_X = eval_X_sampled
-                    eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
-            self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
-        elif not self.imbalanced and not exclude_features_sources and is_input_same_as_fit:
-            self.logger.info("Dataset is not imbalanced, so use enriched_X from fit")
-            search_keys = self.fit_search_keys
-
-            rows_to_drop = None
-            task_type = self.model_task_type or define_task(validated_y, self.logger, silent=True)
-            if task_type == ModelTaskType.REGRESSION:
-                target_outliers_df = self._search_task.get_target_outliers(trace_id)
-                if target_outliers_df is not None and len(target_outliers_df) > 0:
-                    outliers = pd.merge(
-                        self.df_with_original_index,
-                        target_outliers_df,
-                        left_on=SYSTEM_RECORD_ID,
-                        right_on=SYSTEM_RECORD_ID,
-                        how="inner",
-                    )
-                    top_outliers = outliers.sort_values(by=TARGET, ascending=False)[TARGET].head(3)
-                    if remove_outliers_calc_metrics is None or remove_outliers_calc_metrics is True:
-                        rows_to_drop = outliers
-                        not_msg = ""
-                    else:
-                        not_msg = "not "
-                    msg = bundle.get("target_outliers_warning").format(len(target_outliers_df), top_outliers, not_msg)
-                    print(msg)
-                    self.logger.warning(msg)
-
-            enriched_Xy, enriched_eval_sets = self.__enrich(
-                self.df_with_original_index,
-                self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True),
-                rows_to_drop=rows_to_drop,
-            )
-
-            enriched_X = drop_existing_columns(enriched_Xy, TARGET)
-            x_columns = [
-                c for c in validated_X.columns.to_list() + self.fit_generated_features if c in enriched_X.columns
-            ]
-            X_sampled = enriched_Xy[x_columns].copy()
-            y_sampled = enriched_Xy[TARGET].copy()
-
-            self.logger.info(f"Shape of enriched_X: {enriched_X.shape}")
-            self.logger.info(f"Shape of X after sampling: {X_sampled.shape}")
-            self.logger.info(f"Shape of y after sampling: {len(y_sampled)}")
-
-            if eval_set is not None:
-                if len(enriched_eval_sets) != len(eval_set):
-                    raise ValidationError(
-                        bundle.get("metrics_eval_set_count_diff").format(len(enriched_eval_sets), len(eval_set))
-                    )
-
-                for idx in range(len(eval_set)):
-                    enriched_eval_X = drop_existing_columns(enriched_eval_sets[idx + 1], TARGET)
-                    eval_X_sampled = enriched_eval_sets[idx + 1][x_columns].copy()
-                    eval_y_sampled = enriched_eval_sets[idx + 1][TARGET].copy()
-                    eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
-
-            self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
-        else:
-            self.logger.info("Dataset is imbalanced or exclude_features_sources or X was passed. Run transform")
-            print(bundle.get("prepare_data_for_metrics"))
-            if eval_set is not None:
-                self.logger.info("Transform with eval_set")
-                # concatenate X and eval_set with eval_set_index
-                df_with_eval_set_index = validated_X.copy()
-                df_with_eval_set_index[TARGET] = validated_y
-                df_with_eval_set_index[EVAL_SET_INDEX] = 0
-                for idx, eval_pair in enumerate(eval_set):
-                    eval_x, eval_y = self._validate_eval_set_pair(validated_X, eval_pair)
-                    eval_df_with_index = eval_x.copy()
-                    eval_df_with_index[TARGET] = eval_y
-                    eval_df_with_index[EVAL_SET_INDEX] = idx + 1
-                    df_with_eval_set_index = pd.concat([df_with_eval_set_index, eval_df_with_index])
-
-                # downsample if need to eval_set threshold
-                num_samples = _num_samples(df_with_eval_set_index)
-                if num_samples > Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD:
-                    self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS}")
-                    df_with_eval_set_index = df_with_eval_set_index.sample(
-                        n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state
-                    )
-
-                X_sampled = (
-                    df_with_eval_set_index[df_with_eval_set_index[EVAL_SET_INDEX] == 0]
-                    .copy()
-                    .drop(columns=[EVAL_SET_INDEX, TARGET])
-                )
-                X_sampled, search_keys = self._extend_x(X_sampled, is_demo_dataset)
-                y_sampled = df_with_eval_set_index[df_with_eval_set_index[EVAL_SET_INDEX] == 0].copy()[TARGET]
-                eval_set_sampled_dict = dict()
-                for idx in range(len(eval_set)):
-                    eval_x_sampled = (
-                        df_with_eval_set_index[df_with_eval_set_index[EVAL_SET_INDEX] == (idx + 1)]
-                        .copy()
-                        .drop(columns=[EVAL_SET_INDEX, TARGET])
-                    )
-                    eval_x_sampled, _ = self._extend_x(eval_x_sampled, is_demo_dataset)
-                    eval_y_sampled = df_with_eval_set_index[df_with_eval_set_index[EVAL_SET_INDEX] == (idx + 1)].copy()[
-                        TARGET
-                    ]
-                    eval_set_sampled_dict[idx] = (eval_x_sampled, eval_y_sampled)
-
-                df_with_eval_set_index.drop(columns=TARGET, inplace=True)
-
-                enriched = self.transform(
-                    df_with_eval_set_index,
-                    exclude_features_sources=exclude_features_sources,
-                    silent_mode=True,
-                    trace_id=trace_id,
-                    metrics_calculation=True,
-                    progress_bar=progress_bar,
-                    progress_callback=progress_callback,
-                )
-                if enriched is None:
-                    return None
-
-                enriched_X = enriched[enriched[EVAL_SET_INDEX] == 0].copy()
-                enriched_X.drop(columns=EVAL_SET_INDEX, inplace=True)
-
-                for idx in range(len(eval_set)):
-                    enriched_eval_x = enriched[enriched[EVAL_SET_INDEX] == (idx + 1)].copy()
-                    enriched_eval_x.drop(columns=EVAL_SET_INDEX, inplace=True)
-                    eval_x_sampled, eval_y_sampled = eval_set_sampled_dict[idx]
-                    eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
-            else:
-                self.logger.info("Transform without eval_set")
-                df = self.X.copy()
-
-                df[TARGET] = validated_y
-                num_samples = _num_samples(df)
-                if num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
-                    self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
-                    df = df.sample(n=Dataset.FIT_SAMPLE_ROWS, random_state=self.random_state)
-
-                X_sampled = df.copy().drop(columns=TARGET)
-                X_sampled, search_keys = self._extend_x(X_sampled, is_demo_dataset)
-                y_sampled = df.copy()[TARGET]
-
-                df.drop(columns=TARGET, inplace=True)
-
-                enriched_X = self.transform(
-                    df,
-                    exclude_features_sources=exclude_features_sources,
-                    silent_mode=True,
-                    trace_id=trace_id,
-                    metrics_calculation=True,
-                    progress_bar=progress_bar,
-                    progress_callback=progress_callback,
-                )
-                if enriched_X is None:
-                    return None
-
-            self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
+        X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = self._sample_data_for_metrics(
+            trace_id,
+            validated_X,
+            validated_y,
+            validated_eval_set,
+            exclude_features_sources,
+            is_input_same_as_fit,
+            is_demo_dataset,
+            remove_outliers_calc_metrics,
+            progress_bar,
+            progress_callback,
+        )
 
         excluding_search_keys = list(search_keys.keys())
         if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
@@ -1408,6 +1268,15 @@ class FeaturesEnricher(TransformerMixin):
         X_sorted, y_sorted = self._sort_by_keys(X_sampled, y_sampled, search_keys, self.cv)
         enriched_X_sorted, enriched_y_sorted = self._sort_by_keys(enriched_X, y_sampled, search_keys, self.cv)
 
+        group_columns = sorted(self._get_group_columns(search_keys))
+        groups = (
+            None
+            if not group_columns or self.cv != CVType.group_k_fold
+            else reduce(
+                lambda left, right: left + "_" + right, [enriched_X_sorted[c].astype(str) for c in group_columns]
+            ).factorize()[0]
+        )
+
         existing_filtered_enriched_features = [c for c in filtered_enriched_features if c in enriched_X_sorted.columns]
 
         fitting_X = X_sorted[client_features].copy()
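The groups expression above concatenates every non-date search-key column into one string key and integer-codes it with pandas' factorize, which is exactly the shape scikit-learn's GroupKFold expects. A standalone illustration with toy data (column names are made up):

from functools import reduce

import pandas as pd
from sklearn.model_selection import GroupKFold

df = pd.DataFrame({
    "phone": ["a", "a", "b", "b", "c"],
    "country": ["US", "US", "US", "DE", "DE"],
    "f1": [1, 2, 3, 4, 5],
})
group_columns = ["country", "phone"]

# Join the key columns into one string per row, then factorize to get
# one integer label per distinct key combination.
groups = reduce(
    lambda left, right: left + "_" + right,
    [df[c].astype(str) for c in group_columns],
).factorize()[0]
print(groups)  # [0 0 1 2 3]

# Rows sharing a key combination always land in the same CV fold.
for train_idx, test_idx in GroupKFold(n_splits=2).split(df[["f1"]], groups=groups):
    print(train_idx, test_idx)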
@@ -1449,6 +1318,259 @@ class FeaturesEnricher(TransformerMixin):
             enriched_y_sorted,
             fitting_eval_set_dict,
             search_keys,
+            groups,
+        )
+
+    _SampledDataForMetrics = namedtuple(
+        "_SampledDataForMetrics", "X_sampled y_sampled enriched_X eval_set_sampled_dict search_keys"
+    )
+
+    def _sample_data_for_metrics(
+        self,
+        trace_id: str,
+        validated_X: Union[pd.DataFrame, pd.Series, np.ndarray, None],
+        validated_y: Union[pd.DataFrame, pd.Series, np.ndarray, List, None],
+        eval_set: Optional[List[tuple]],
+        exclude_features_sources: Optional[List[str]],
+        is_input_same_as_fit: bool,
+        is_demo_dataset: bool,
+        remove_outliers_calc_metrics: Optional[bool],
+        progress_bar: Optional[ProgressBar],
+        progress_callback: Optional[Callable[[SearchProgress], Any]],
+    ) -> _SampledDataForMetrics:
+        if self.__cached_sampled_datasets is not None and is_input_same_as_fit and remove_outliers_calc_metrics is None:
+            self.logger.info("Cached enriched dataset found - use it")
+            return self.__get_sampled_cached_enriched(exclude_features_sources)
+        elif len(self.feature_importances_) == 0:
+            self.logger.info("No external features selected. So use only input datasets for metrics calculation")
+            return self.__sample_only_input(validated_X, validated_y, eval_set, is_demo_dataset)
+        elif not self.imbalanced and not exclude_features_sources and is_input_same_as_fit:
+            self.logger.info("Dataset is not imbalanced, so use enriched_X from fit")
+            return self.__sample_balanced(
+                validated_X, validated_y, eval_set, trace_id, remove_outliers_calc_metrics, is_demo_dataset
+            )
+        else:
+            self.logger.info("Dataset is imbalanced or exclude_features_sources or X was passed. Run transform")
+            print(bundle.get("prepare_data_for_metrics"))
+            return self.__sample_imbalanced(
+                validated_X,
+                validated_y,
+                eval_set,
+                is_demo_dataset,
+                exclude_features_sources,
+                trace_id,
+                progress_bar,
+                progress_callback,
+            )
+
+    def __get_sampled_cached_enriched(self, exclude_features_sources: Optional[List[str]]) -> _SampledDataForMetrics:
+        X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = self.__cached_sampled_datasets
+        if exclude_features_sources:
+            enriched_X = drop_existing_columns(enriched_X, exclude_features_sources)
+
+        return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
+
+    def __sample_only_input(
+        self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
+    ) -> _SampledDataForMetrics:
+        eval_set_sampled_dict = dict()
+        X_sampled, search_keys = self._extend_x(validated_X, is_demo_dataset)
+        y_sampled = validated_y
+        enriched_X = X_sampled
+        if eval_set is not None:
+            for idx in range(len(eval_set)):
+                eval_X_sampled, _ = self._extend_x(eval_set[idx][0], is_demo_dataset)
+                eval_y_sampled = eval_set[idx][1]
+                enriched_eval_X = eval_X_sampled
+                eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
+        self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
+
+        return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
+
+    def __sample_balanced(
+        self,
+        validated_X: pd.DataFrame,
+        validated_y: pd.Series,
+        eval_set: Optional[List[tuple]],
+        trace_id: str,
+        remove_outliers_calc_metrics: Optional[bool],
+        is_demo_dataset: bool,
+    ) -> _SampledDataForMetrics:
+        eval_set_sampled_dict = dict()
+        search_keys = self.fit_search_keys
+
+        rows_to_drop = None
+        task_type = self.model_task_type or define_task(validated_y, self.logger, silent=True)
+        if task_type == ModelTaskType.REGRESSION:
+            target_outliers_df = self._search_task.get_target_outliers(trace_id)
+            if target_outliers_df is not None and len(target_outliers_df) > 0:
+                outliers = pd.merge(
+                    self.df_with_original_index,
+                    target_outliers_df,
+                    left_on=SYSTEM_RECORD_ID,
+                    right_on=SYSTEM_RECORD_ID,
+                    how="inner",
+                )
+                top_outliers = outliers.sort_values(by=TARGET, ascending=False)[TARGET].head(3)
+                if remove_outliers_calc_metrics is None or remove_outliers_calc_metrics is True:
+                    rows_to_drop = outliers
+                    not_msg = ""
+                else:
+                    not_msg = "not "
+                msg = bundle.get("target_outliers_warning").format(len(target_outliers_df), top_outliers, not_msg)
+                print(msg)
+                self.logger.warning(msg)
+
+        # index in each dataset (X, eval set) may be reordered and non unique, but index in validated datasets
+        # can differs from it
+        enriched_Xy, enriched_eval_sets = self.__enrich(
+            self.df_with_original_index,
+            self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True),
+            rows_to_drop=rows_to_drop,
+        )
+
+        enriched_X = drop_existing_columns(enriched_Xy, TARGET)
+        X_sampled, search_keys = self._extend_x(validated_X, is_demo_dataset)
+        y_sampled = enriched_Xy[TARGET].copy()
+
+        self.logger.info(f"Shape of enriched_X: {enriched_X.shape}")
+        self.logger.info(f"Shape of X after sampling: {X_sampled.shape}")
+        self.logger.info(f"Shape of y after sampling: {len(y_sampled)}")
+
+        if eval_set is not None:
+            if len(enriched_eval_sets) != len(eval_set):
+                raise ValidationError(
+                    bundle.get("metrics_eval_set_count_diff").format(len(enriched_eval_sets), len(eval_set))
+                )
+
+            for idx in range(len(eval_set)):
+                enriched_eval_X = drop_existing_columns(enriched_eval_sets[idx + 1], TARGET)
+                eval_X_sampled, _ = self._extend_x(eval_set[idx][0], is_demo_dataset)
+                eval_y_sampled = eval_set[idx][1].copy()
+                eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
+
+        self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
+
+        return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
+
+    def __sample_imbalanced(
+        self,
+        validated_X: pd.DataFrame,
+        validated_y: pd.Series,
+        eval_set: Optional[List[tuple]],
+        is_demo_dataset: bool,
+        exclude_features_sources: Optional[List[str]],
+        trace_id: str,
+        progress_bar: Optional[ProgressBar],
+        progress_callback: Optional[Callable[[SearchProgress], Any]],
+    ) -> _SampledDataForMetrics:
+        eval_set_sampled_dict = dict()
+        if eval_set is not None:
+            self.logger.info("Transform with eval_set")
+            # concatenate X and eval_set with eval_set_index
+            df_with_eval_set_index = validated_X.copy()
+            df_with_eval_set_index[TARGET] = validated_y
+            df_with_eval_set_index[EVAL_SET_INDEX] = 0
+            for idx, eval_pair in enumerate(eval_set):
+                eval_x, eval_y = self._validate_eval_set_pair(validated_X, eval_pair)
+                eval_df_with_index = eval_x.copy()
+                eval_df_with_index[TARGET] = eval_y
+                eval_df_with_index[EVAL_SET_INDEX] = idx + 1
+                df_with_eval_set_index = pd.concat([df_with_eval_set_index, eval_df_with_index])
+
+            # downsample if need to eval_set threshold
+            num_samples = _num_samples(df_with_eval_set_index)
+            if num_samples > Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD:
+                self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS}")
+                df_with_eval_set_index = df_with_eval_set_index.sample(
+                    n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state
+                )
+
+            X_sampled = (
+                df_with_eval_set_index[df_with_eval_set_index[EVAL_SET_INDEX] == 0]
+                .copy()
+                .drop(columns=[EVAL_SET_INDEX, TARGET])
+            )
+            X_sampled, search_keys = self._extend_x(X_sampled, is_demo_dataset)
+            y_sampled = df_with_eval_set_index[df_with_eval_set_index[EVAL_SET_INDEX] == 0].copy()[TARGET]
+            eval_set_sampled_dict = dict()
+            for idx in range(len(eval_set)):
+                eval_x_sampled = (
+                    df_with_eval_set_index[df_with_eval_set_index[EVAL_SET_INDEX] == (idx + 1)]
+                    .copy()
+                    .drop(columns=[EVAL_SET_INDEX, TARGET])
+                )
+                eval_x_sampled, _ = self._extend_x(eval_x_sampled, is_demo_dataset)
+                eval_y_sampled = df_with_eval_set_index[df_with_eval_set_index[EVAL_SET_INDEX] == (idx + 1)].copy()[
+                    TARGET
+                ]
+                eval_set_sampled_dict[idx] = (eval_x_sampled, eval_y_sampled)
+
+            df_with_eval_set_index.drop(columns=TARGET, inplace=True)
+
+            enriched = self.transform(
+                df_with_eval_set_index,
+                exclude_features_sources=exclude_features_sources,
+                silent_mode=True,
+                trace_id=trace_id,
+                metrics_calculation=True,
+                progress_bar=progress_bar,
+                progress_callback=progress_callback,
+            )
+            if enriched is None:
+                return None
+
+            enriched_X = enriched[enriched[EVAL_SET_INDEX] == 0].copy()
+            enriched_X.drop(columns=EVAL_SET_INDEX, inplace=True)
+
+            for idx in range(len(eval_set)):
+                enriched_eval_x = enriched[enriched[EVAL_SET_INDEX] == (idx + 1)].copy()
+                enriched_eval_x.drop(columns=EVAL_SET_INDEX, inplace=True)
+                eval_x_sampled, eval_y_sampled = eval_set_sampled_dict[idx]
+                eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
+        else:
+            self.logger.info("Transform without eval_set")
+            df = self.X.copy()
+
+            df[TARGET] = validated_y
+            num_samples = _num_samples(df)
+            if num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
+                self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
+                df = df.sample(n=Dataset.FIT_SAMPLE_ROWS, random_state=self.random_state)
+
+            X_sampled = df.copy().drop(columns=TARGET)
+            X_sampled, search_keys = self._extend_x(X_sampled, is_demo_dataset)
+            y_sampled = df.copy()[TARGET]
+
+            df.drop(columns=TARGET, inplace=True)
+
+            enriched_X = self.transform(
+                df,
+                exclude_features_sources=exclude_features_sources,
+                silent_mode=True,
+                trace_id=trace_id,
+                metrics_calculation=True,
+                progress_bar=progress_bar,
+                progress_callback=progress_callback,
+            )
+            if enriched_X is None:
+                return None
+
+        self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
+
+        return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
+
+    def __mk_sampled_data_tuple(
+        self,
+        X_sampled: pd.DataFrame,
+        y_sampled: pd.Series,
+        enriched_X: pd.DataFrame,
+        eval_set_sampled_dict: Dict,
+        search_keys: Dict,
+    ):
+        search_keys = {k: v for k, v in search_keys.items() if k in X_sampled.columns.to_list()}
+        return FeaturesEnricher._SampledDataForMetrics(
+            X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys
         )
 
     def get_search_id(self) -> Optional[str]:
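The net effect of this hunk: the ~180-line inline branching removed above is split into four strategies (__get_sampled_cached_enriched, __sample_only_input, __sample_balanced, __sample_imbalanced) behind one dispatcher, with results carried in a namedtuple that still unpacks like the old plain tuple. A toy equivalent of that last point:

from collections import namedtuple

_SampledDataForMetrics = namedtuple(
    "_SampledDataForMetrics", "X_sampled y_sampled enriched_X eval_set_sampled_dict search_keys"
)

# Callers can keep tuple-unpacking, but fields are also addressable by name.
data = _SampledDataForMetrics(None, None, None, {}, {})
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = data
assert data.eval_set_sampled_dict == {}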
@@ -1868,20 +1990,8 @@ class FeaturesEnricher(TransformerMixin):
 
         df = self.__add_country_code(df, self.fit_search_keys)
 
-        # Check Multivariate time series
         date_column = self._get_date_column(self.fit_search_keys)
-        if (
-            self.cv is None
-            and date_column
-            and model_task_type == ModelTaskType.REGRESSION
-            and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys())) == 0
-            and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
-        ):
-            msg = bundle.get("multivariate_timeseries_detected")
-            print(msg)
-            self.logger.warning(msg)
-            self.cv = CVType.blocked_time_series
-            self.runtime_parameters.properties["cv_type"] = self.cv.name
+        self.__adjust_cv(df, date_column, model_task_type)
 
         self.fit_generated_features = []
 
@@ -2046,8 +2156,9 @@ class FeaturesEnricher(TransformerMixin):
 
         self.__show_selected_features(self.fit_search_keys)
 
-
-
+        autofe_description = self.get_autofe_features_description()
+        if autofe_description is not None:
+            display_html_dataframe(autofe_description, autofe_description, "*Description of AutoFE feature names")
 
         if self._has_paid_features(exclude_features_sources):
             if calculate_metrics is not None and calculate_metrics:
@@ -2089,6 +2200,35 @@ class FeaturesEnricher(TransformerMixin):
 
         self.__show_report_button()
 
+        if not self.warning_counter.has_warnings():
+            self.__display_support_link(bundle.get("all_ok_community_invite"))
+
+    def __adjust_cv(self, df: pd.DataFrame, date_column: pd.Series, model_task_type: ModelTaskType):
+        # Check Multivariate time series
+        if (
+            self.cv is None
+            and date_column
+            and model_task_type == ModelTaskType.REGRESSION
+            and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys())) == 0
+            and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
+        ):
+            msg = bundle.get("multivariate_timeseries_detected")
+            self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
+        elif (
+            (self.cv is None or self.cv == CVType.k_fold)
+            and model_task_type != ModelTaskType.REGRESSION
+            and self._get_group_columns(self.fit_search_keys)
+        ):
+            msg = bundle.get("group_k_fold_in_classification")
+            self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
+
+    def __override_cv(self, cv: CVType, msg: str, print_warning: bool = True):
+        if print_warning:
+            print(msg)
+            self.logger.warning(msg)
+        self.cv = cv
+        self.runtime_parameters.properties["cv_type"] = self.cv.name
+
     def get_columns_by_search_keys(self, keys: List[str]):
         if "HEM" in keys:
             keys.append("EMAIL")
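Condensed, the auto-selection logic added above: blocked time-series CV for regression data that looks like a multivariate time series, and group k-fold for classification tasks that have non-date search keys. A standalone restatement under those assumptions (the enums mirror upgini's CVType/ModelTaskType; the time-series detector is reduced to a boolean flag):

from enum import Enum

class CVType(Enum):
    k_fold = "k_fold"
    group_k_fold = "group_k_fold"
    blocked_time_series = "blocked_time_series"

class ModelTaskType(Enum):
    REGRESSION = "REGRESSION"
    BINARY = "BINARY"
    MULTICLASS = "MULTICLASS"

def adjust_cv(cv, task, has_date, has_person_keys, looks_like_time_series, group_columns):
    # Multivariate time series: force blocked time-series CV
    if (cv is None and has_date and task == ModelTaskType.REGRESSION
            and not has_person_keys and looks_like_time_series):
        return CVType.blocked_time_series
    # Classification with grouping keys: switch (plain) k-fold to group k-fold
    if cv in (None, CVType.k_fold) and task != ModelTaskType.REGRESSION and group_columns:
        return CVType.group_k_fold
    return cv

print(adjust_cv(None, ModelTaskType.BINARY, True, False, False, ["phone"]))
# -> CVType.group_k_fold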
@@ -2384,6 +2524,10 @@ class FeaturesEnricher(TransformerMixin):
             if t in [SearchKey.DATE, SearchKey.DATETIME]:
                 return col
 
+    @staticmethod
+    def _get_group_columns(search_keys: Dict[str, SearchKey]) -> List[str]:
+        return [col for col, t in search_keys.items() if t not in [SearchKey.DATE, SearchKey.DATETIME]]
+
     @staticmethod
     def __get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
         for col, t in search_keys.items():
@@ -2400,8 +2544,10 @@ class FeaturesEnricher(TransformerMixin):
         self, df: pd.DataFrame, meaning_types: Dict[str, FileColumnMeaningType], search_keys: Dict[str, SearchKey]
     ) -> pd.DataFrame:
         # save original order or rows
-
-
+        original_index_name = df.index.name
+        index_name = df.index.name or DEFAULT_INDEX
+        df = df.reset_index().reset_index(drop=True)
+        df = df.rename(columns={index_name: ORIGINAL_INDEX})
 
         # order by date and idempotent order by other keys
         if self.cv not in [CVType.time_series, CVType.blocked_time_series]:
@@ -2432,7 +2578,8 @@ class FeaturesEnricher(TransformerMixin):
 
         # return original order
         df = df.set_index(ORIGINAL_INDEX)
-        df = df.sort_index()
+        df.index.name = original_index_name
+        # df = df.sort_index()
 
         meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
         return df
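This hunk and the @@ -2400 one above fix index handling with one pattern: stash the index name, park the index in an ORIGINAL_INDEX column while rows are reordered, then restore it afterwards. The pattern in isolation (the constant names follow the diff; the data is made up):

import pandas as pd

ORIGINAL_INDEX = "original_index"
DEFAULT_INDEX = "index"  # pandas' default column name from reset_index() on an unnamed index

df = pd.DataFrame({"f": [10, 20, 30]}, index=pd.Index([7, 3, 5], name="row_id"))

# Stash the index as a column so sorting/merging cannot lose it.
original_index_name = df.index.name
index_name = df.index.name or DEFAULT_INDEX
df = df.reset_index().rename(columns={index_name: ORIGINAL_INDEX})

df = df.sort_values(by="f", ascending=False)  # any reordering in between

# Restore the original index (and its name) afterwards.
df = df.set_index(ORIGINAL_INDEX)
df.index.name = original_index_name
print(df)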
@@ -2493,6 +2640,9 @@ class FeaturesEnricher(TransformerMixin):
             self.logger.warning(f"X contain columns with same name as returned from backend: {dup_features}")
             raise ValidationError(bundle.get("returned_features_same_as_passed").format(dup_features))
 
+        # index overrites from result_features
+        original_index_name = df_with_original_index.index.name
+        df_with_original_index = df_with_original_index.reset_index()
         result_features = pd.merge(
             df_with_original_index,
             result_features,
@@ -2500,6 +2650,8 @@ class FeaturesEnricher(TransformerMixin):
             right_on=SYSTEM_RECORD_ID,
             how="left" if is_transform else "inner",
         )
+        result_features = result_features.set_index(original_index_name or DEFAULT_INDEX)
+        result_features.index.name = original_index_name
 
         if rows_to_drop is not None:
             print(f"Before dropping target outliers size: {len(result_features)}")
@@ -2687,6 +2839,52 @@ class FeaturesEnricher(TransformerMixin):
         else:
             self.logger.warning("Empty features info")
 
+    def get_autofe_features_description(self):
+        try:
+            autofe_meta = self._search_task.get_autofe_metadata()
+            if autofe_meta is None:
+                return None
+            features_meta = self._search_task.get_all_features_metadata_v2()
+
+            def get_feature_by_display_index(idx):
+                for m in features_meta:
+                    if m.name.endswith(str(idx)):
+                        return m
+
+            descriptions = []
+            for m in autofe_meta:
+                description = dict()
+
+                feature_meta = get_feature_by_display_index(m.display_index)
+                if feature_meta is None:
+                    self.logger.warning(f"Feature meta for display index {m.display_index} not found")
+                    continue
+                description["Sources"] = feature_meta.data_source.replace("AutoFE: features from ", "")
+                description["Feature name"] = feature_meta.name
+
+                feature_idx = 1
+                for bc in m.base_columns:
+                    description[f"Feature {feature_idx}"] = bc.hashed_name
+                    feature_idx += 1
+
+                match = re.match(f"f_autofe_(.+)_{m.display_index}", feature_meta.name)
+                if match is None:
+                    self.logger.warning(f"Failed to infer autofe function from name {feature_meta.name}")
+                else:
+                    description["Function"] = match.group(1)
+
+                descriptions.append(description)
+
+            if len(descriptions) == 0:
+                return None
+
+            descriptions_df = pd.DataFrame(descriptions)
+            descriptions_df.fillna("", inplace=True)
+            return descriptions_df
+        except Exception:
+            self.logger.exception("Failed to generate AutoFE features description")
+            return None
+
     @staticmethod
     def _group_relevant_data_sources(df: pd.DataFrame) -> pd.DataFrame:
         return (
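The AutoFE function is recovered from the generated feature name via a regex over the f_autofe_<function>_<display_index> naming pattern used above; for example (the feature name is hypothetical):

import re

feature_name = "f_autofe_diff_12"  # hypothetical generated feature name
display_index = 12
match = re.match(f"f_autofe_(.+)_{display_index}", feature_name)
print(match.group(1))  # -> "diff"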
@@ -2889,8 +3087,10 @@ class FeaturesEnricher(TransformerMixin):
             relevant_features_df=self._features_info_without_links,
             relevant_datasources_df=self.relevant_data_sources,
             metrics_df=self.metrics,
+            autofe_descriptions_df=self.get_autofe_features_description(),
             search_id=self._search_task.search_task_id,
             email=get_rest_client(self.endpoint, self.api_key).get_current_email(),
+            search_keys=[str(sk) for sk in self.search_keys.values()],
         )
     except Exception:
         pass
@@ -3014,19 +3214,15 @@ class FeaturesEnricher(TransformerMixin):
     def __display_support_link(self, link_text: Optional[str] = None):
         support_link = bundle.get("support_link")
         link_text = link_text or bundle.get("support_text")
-        # badge = bundle.get("slack_community_bage")
-        # alt = bundle.get("slack_community_alt")
         try:
             from IPython.display import HTML, display
 
             _ = get_ipython()  # type: ignore
             self.logger.warning(link_text)
-            print(link_text)
             display(
                 HTML(
-                    f"""<a href='{support_link}' target='_blank' rel='noopener noreferrer'>
-
-                    # <img alt='{alt}' src='{badge}'></a>
+                    f"""{link_text} <a href='{support_link}' target='_blank' rel='noopener noreferrer'>
+                    here</a>"""
                 )
             )
         except (ImportError, NameError):