upgini-1.1.222a1-py3-none-any.whl → upgini-1.1.224-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. See the release notes for this version for more details.

@@ -5,10 +5,13 @@ import logging
5
5
  import numbers
6
6
  import os
7
7
  import pickle
8
+ import re
8
9
  import sys
9
10
  import tempfile
10
11
  import time
11
12
  import uuid
13
+ from collections import namedtuple
14
+ from functools import reduce
12
15
  from threading import Thread
13
16
  from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
14
17
 
@@ -175,6 +178,7 @@ class FeaturesEnricher(TransformerMixin):
175
178
  logs_enabled: bool = True,
176
179
  raise_validation_error: bool = True,
177
180
  exclude_columns: Optional[List[str]] = None,
181
+ baseline_score_column: Optional[Any] = None,
178
182
  client_ip: Optional[str] = None,
179
183
  **kwargs,
180
184
  ):
@@ -279,6 +283,7 @@ class FeaturesEnricher(TransformerMixin):
279
283
 
280
284
  self.raise_validation_error = raise_validation_error
281
285
  self.exclude_columns = exclude_columns
286
+ self.baseline_score_column = baseline_score_column
282
287
 
283
288
  def _get_api_key(self):
284
289
  return self._api_key
@@ -290,6 +295,18 @@ class FeaturesEnricher(TransformerMixin):
290
295
 
291
296
  api_key = property(_get_api_key, _set_api_key)
292
297
 
298
+ @staticmethod
299
+ def _check_eval_set(eval_set, X):
300
+ checked_eval_set = []
301
+ if eval_set is not None and not isinstance(eval_set, list):
302
+ raise ValidationError(bundle.get("unsupported_type_eval_set").format(type(eval_set)))
303
+ for eval_pair in eval_set or []:
304
+ if not isinstance(eval_pair, tuple) or len(eval_pair) != 2:
305
+ raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
306
+ if not is_frames_equal(X, eval_pair[0]):
307
+ checked_eval_set.append(eval_pair)
308
+ return checked_eval_set
309
+
293
310
  def fit(
294
311
  self,
295
312
  X: Union[pd.DataFrame, pd.Series, np.ndarray],
@@ -366,19 +383,13 @@ class FeaturesEnricher(TransformerMixin):
366
383
  try:
367
384
  self.X = X
368
385
  self.y = y
369
- checked_eval_set = []
370
- for eval_pair in eval_set or []:
371
- if len(eval_pair) != 2:
372
- raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
373
- if not is_frames_equal(X, eval_pair[0]):
374
- checked_eval_set.append(eval_pair)
375
- self.eval_set = checked_eval_set
386
+ self.eval_set = self._check_eval_set(eval_set, X)
376
387
  self.dump_input(trace_id, X, y, eval_set)
377
388
  self.__inner_fit(
378
389
  trace_id,
379
390
  X,
380
391
  y,
381
- checked_eval_set,
392
+ self.eval_set,
382
393
  progress_bar,
383
394
  start_time=start_time,
384
395
  exclude_features_sources=exclude_features_sources,
@@ -508,13 +519,7 @@ class FeaturesEnricher(TransformerMixin):
508
519
  try:
509
520
  self.X = X
510
521
  self.y = y
511
- checked_eval_set = []
512
- for eval_pair in eval_set or []:
513
- if len(eval_pair) != 2:
514
- raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
515
- if not is_frames_equal(X, eval_pair[0]):
516
- checked_eval_set.append(eval_pair)
517
- self.eval_set = checked_eval_set
522
+ self.eval_set = self._check_eval_set(eval_set, X)
518
523
  self.dump_input(trace_id, X, y, eval_set)
519
524
 
520
525
  if _num_samples(drop_duplicates(X)) > Dataset.MAX_ROWS:
@@ -524,7 +529,7 @@ class FeaturesEnricher(TransformerMixin):
524
529
  trace_id,
525
530
  X,
526
531
  y,
527
- checked_eval_set,
532
+ self.eval_set,
528
533
  progress_bar,
529
534
  start_time=start_time,
530
535
  exclude_features_sources=exclude_features_sources,
@@ -882,6 +887,7 @@ class FeaturesEnricher(TransformerMixin):
882
887
  enriched_y_sorted,
883
888
  fitting_eval_set_dict,
884
889
  search_keys,
890
+ groups,
885
891
  ) = prepared_data
886
892
 
887
893
  gc.collect()
@@ -906,7 +912,13 @@ class FeaturesEnricher(TransformerMixin):
906
912
  ).get_cv()
907
913
 
908
914
  wrapper = EstimatorWrapper.create(
909
- estimator, self.logger, model_task_type, _cv, fitting_enriched_X, scoring
915
+ estimator,
916
+ self.logger,
917
+ model_task_type,
918
+ _cv,
919
+ fitting_enriched_X,
920
+ scoring,
921
+ groups=groups,
910
922
  )
911
923
  metric = wrapper.metric_name
912
924
  multiplier = wrapper.multiplier
@@ -931,8 +943,11 @@ class FeaturesEnricher(TransformerMixin):
931
943
  scoring,
932
944
  cat_features,
933
945
  add_params=custom_loss_add_params,
946
+ groups=groups,
947
+ )
948
+ etalon_metric = baseline_estimator.cross_val_predict(
949
+ fitting_X, y_sorted, self.baseline_score_column
934
950
  )
935
- etalon_metric = baseline_estimator.cross_val_predict(fitting_X, y_sorted)
936
951
  self.logger.info(f"Baseline {metric} on train client features: {etalon_metric}")
937
952
 
938
953
  # 2 Fit and predict with KFold Catboost model on enriched tds
@@ -952,6 +967,7 @@ class FeaturesEnricher(TransformerMixin):
952
967
  scoring,
953
968
  cat_features,
954
969
  add_params=custom_loss_add_params,
970
+ groups=groups,
955
971
  )
956
972
  enriched_metric = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
957
973
  self.logger.info(f"Enriched {metric} on train combined features: {enriched_metric}")
@@ -999,7 +1015,9 @@ class FeaturesEnricher(TransformerMixin):
999
1015
  f"Calculate baseline {metric} on eval set {idx + 1} "
1000
1016
  f"on client features: {eval_X_sorted.columns.to_list()}"
1001
1017
  )
1002
- etalon_eval_metric = baseline_estimator.calculate_metric(eval_X_sorted, eval_y_sorted)
1018
+ etalon_eval_metric = baseline_estimator.calculate_metric(
1019
+ eval_X_sorted, eval_y_sorted, self.baseline_score_column
1020
+ )
1003
1021
  self.logger.info(
1004
1022
  f"Baseline {metric} on eval set {idx + 1} client features: {etalon_eval_metric}"
1005
1023
  )
@@ -1172,12 +1190,7 @@ class FeaturesEnricher(TransformerMixin):
1172
1190
  if X is None:
1173
1191
  return True, self.X, self.y, self.eval_set
1174
1192
 
1175
- checked_eval_set = []
1176
- for eval_pair in eval_set or []:
1177
- if len(eval_pair) != 2:
1178
- raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
1179
- if not is_frames_equal(X, eval_pair[0]):
1180
- checked_eval_set.append(eval_pair)
1193
+ checked_eval_set = self._check_eval_set(eval_set, X)
1181
1194
 
1182
1195
  if (
1183
1196
  X is self.X
@@ -1214,181 +1227,28 @@ class FeaturesEnricher(TransformerMixin):
1214
1227
  progress_bar: Optional[ProgressBar] = None,
1215
1228
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
1216
1229
  ):
1217
- is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
1218
1230
  is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
1231
+ is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
1219
1232
  validated_X = self._validate_X(X)
1220
1233
  validated_y = self._validate_y(validated_X, y)
1234
+ validated_eval_set = (
1235
+ [self._validate_eval_set_pair(validated_X, eval_set_pair) for eval_set_pair in eval_set]
1236
+ if eval_set
1237
+ else None
1238
+ )
1221
1239
 
1222
- eval_set_sampled_dict = dict()
1223
-
1224
- if self.__cached_sampled_datasets is not None and is_input_same_as_fit and remove_outliers_calc_metrics is None:
1225
- self.logger.info("Cached enriched dataset found - use it")
1226
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = self.__cached_sampled_datasets
1227
- if exclude_features_sources:
1228
- enriched_X = drop_existing_columns(enriched_X, exclude_features_sources)
1229
- elif len(self.feature_importances_) == 0:
1230
- self.logger.info("No external features selected. So use only input datasets for metrics calculation")
1231
- X_sampled, search_keys = self._extend_x(validated_X, is_demo_dataset)
1232
- y_sampled = validated_y
1233
- enriched_X = X_sampled
1234
- if eval_set is not None:
1235
- for idx in range(len(eval_set)):
1236
- eval_X_sampled, _ = self._extend_x(eval_set[idx][0], is_demo_dataset)
1237
- eval_y_sampled = eval_set[idx][1]
1238
- enriched_eval_X = eval_X_sampled
1239
- eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
1240
- self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1241
- elif not self.imbalanced and not exclude_features_sources and is_input_same_as_fit:
1242
- self.logger.info("Dataset is not imbalanced, so use enriched_X from fit")
1243
- search_keys = self.fit_search_keys
1244
-
1245
- rows_to_drop = None
1246
- task_type = self.model_task_type or define_task(validated_y, self.logger, silent=True)
1247
- if task_type == ModelTaskType.REGRESSION:
1248
- target_outliers_df = self._search_task.get_target_outliers(trace_id)
1249
- if target_outliers_df is not None and len(target_outliers_df) > 0:
1250
- outliers = pd.merge(
1251
- self.df_with_original_index,
1252
- target_outliers_df,
1253
- left_on=SYSTEM_RECORD_ID,
1254
- right_on=SYSTEM_RECORD_ID,
1255
- how="inner",
1256
- )
1257
- top_outliers = outliers.sort_values(by=TARGET, ascending=False)[TARGET].head(3)
1258
- if remove_outliers_calc_metrics is None or remove_outliers_calc_metrics is True:
1259
- rows_to_drop = outliers
1260
- not_msg = ""
1261
- else:
1262
- not_msg = "not "
1263
- msg = bundle.get("target_outliers_warning").format(len(target_outliers_df), top_outliers, not_msg)
1264
- print(msg)
1265
- self.logger.warning(msg)
1266
-
1267
- enriched_Xy, enriched_eval_sets = self.__enrich(
1268
- self.df_with_original_index,
1269
- self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True),
1270
- rows_to_drop=rows_to_drop,
1271
- )
1272
-
1273
- enriched_X = drop_existing_columns(enriched_Xy, TARGET)
1274
- x_columns = [
1275
- c for c in validated_X.columns.to_list() + self.fit_generated_features if c in enriched_X.columns
1276
- ]
1277
- X_sampled = enriched_Xy[x_columns].copy()
1278
- y_sampled = enriched_Xy[TARGET].copy()
1279
-
1280
- self.logger.info(f"Shape of enriched_X: {enriched_X.shape}")
1281
- self.logger.info(f"Shape of X after sampling: {X_sampled.shape}")
1282
- self.logger.info(f"Shape of y after sampling: {len(y_sampled)}")
1283
-
1284
- if eval_set is not None:
1285
- if len(enriched_eval_sets) != len(eval_set):
1286
- raise ValidationError(
1287
- bundle.get("metrics_eval_set_count_diff").format(len(enriched_eval_sets), len(eval_set))
1288
- )
1289
-
1290
- for idx in range(len(eval_set)):
1291
- enriched_eval_X = drop_existing_columns(enriched_eval_sets[idx + 1], TARGET)
1292
- eval_X_sampled = enriched_eval_sets[idx + 1][x_columns].copy()
1293
- eval_y_sampled = enriched_eval_sets[idx + 1][TARGET].copy()
1294
- eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
1295
-
1296
- self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1297
- else:
1298
- self.logger.info("Dataset is imbalanced or exclude_features_sources or X was passed. Run transform")
1299
- print(bundle.get("prepare_data_for_metrics"))
1300
- if eval_set is not None:
1301
- self.logger.info("Transform with eval_set")
1302
- # concatenate X and eval_set with eval_set_index
1303
- df_with_eval_set_index = validated_X.copy()
1304
- df_with_eval_set_index[TARGET] = validated_y
1305
- df_with_eval_set_index[EVAL_SET_INDEX] = 0
1306
- for idx, eval_pair in enumerate(eval_set):
1307
- eval_x, eval_y = self._validate_eval_set_pair(validated_X, eval_pair)
1308
- eval_df_with_index = eval_x.copy()
1309
- eval_df_with_index[TARGET] = eval_y
1310
- eval_df_with_index[EVAL_SET_INDEX] = idx + 1
1311
- df_with_eval_set_index = pd.concat([df_with_eval_set_index, eval_df_with_index])
1312
-
1313
- # downsample if need to eval_set threshold
1314
- num_samples = _num_samples(df_with_eval_set_index)
1315
- if num_samples > Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD:
1316
- self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS}")
1317
- df_with_eval_set_index = df_with_eval_set_index.sample(
1318
- n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state
1319
- )
1320
-
1321
- X_sampled = (
1322
- df_with_eval_set_index[df_with_eval_set_index[EVAL_SET_INDEX] == 0]
1323
- .copy()
1324
- .drop(columns=[EVAL_SET_INDEX, TARGET])
1325
- )
1326
- X_sampled, search_keys = self._extend_x(X_sampled, is_demo_dataset)
1327
- y_sampled = df_with_eval_set_index[df_with_eval_set_index[EVAL_SET_INDEX] == 0].copy()[TARGET]
1328
- eval_set_sampled_dict = dict()
1329
- for idx in range(len(eval_set)):
1330
- eval_x_sampled = (
1331
- df_with_eval_set_index[df_with_eval_set_index[EVAL_SET_INDEX] == (idx + 1)]
1332
- .copy()
1333
- .drop(columns=[EVAL_SET_INDEX, TARGET])
1334
- )
1335
- eval_x_sampled, _ = self._extend_x(eval_x_sampled, is_demo_dataset)
1336
- eval_y_sampled = df_with_eval_set_index[df_with_eval_set_index[EVAL_SET_INDEX] == (idx + 1)].copy()[
1337
- TARGET
1338
- ]
1339
- eval_set_sampled_dict[idx] = (eval_x_sampled, eval_y_sampled)
1340
-
1341
- df_with_eval_set_index.drop(columns=TARGET, inplace=True)
1342
-
1343
- enriched = self.transform(
1344
- df_with_eval_set_index,
1345
- exclude_features_sources=exclude_features_sources,
1346
- silent_mode=True,
1347
- trace_id=trace_id,
1348
- metrics_calculation=True,
1349
- progress_bar=progress_bar,
1350
- progress_callback=progress_callback,
1351
- )
1352
- if enriched is None:
1353
- return None
1354
-
1355
- enriched_X = enriched[enriched[EVAL_SET_INDEX] == 0].copy()
1356
- enriched_X.drop(columns=EVAL_SET_INDEX, inplace=True)
1357
-
1358
- for idx in range(len(eval_set)):
1359
- enriched_eval_x = enriched[enriched[EVAL_SET_INDEX] == (idx + 1)].copy()
1360
- enriched_eval_x.drop(columns=EVAL_SET_INDEX, inplace=True)
1361
- eval_x_sampled, eval_y_sampled = eval_set_sampled_dict[idx]
1362
- eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
1363
- else:
1364
- self.logger.info("Transform without eval_set")
1365
- df = self.X.copy()
1366
-
1367
- df[TARGET] = validated_y
1368
- num_samples = _num_samples(df)
1369
- if num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
1370
- self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
1371
- df = df.sample(n=Dataset.FIT_SAMPLE_ROWS, random_state=self.random_state)
1372
-
1373
- X_sampled = df.copy().drop(columns=TARGET)
1374
- X_sampled, search_keys = self._extend_x(X_sampled, is_demo_dataset)
1375
- y_sampled = df.copy()[TARGET]
1376
-
1377
- df.drop(columns=TARGET, inplace=True)
1378
-
1379
- enriched_X = self.transform(
1380
- df,
1381
- exclude_features_sources=exclude_features_sources,
1382
- silent_mode=True,
1383
- trace_id=trace_id,
1384
- metrics_calculation=True,
1385
- progress_bar=progress_bar,
1386
- progress_callback=progress_callback,
1387
- )
1388
- if enriched_X is None:
1389
- return None
1390
-
1391
- self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1240
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = self._sample_data_for_metrics(
1241
+ trace_id,
1242
+ validated_X,
1243
+ validated_y,
1244
+ validated_eval_set,
1245
+ exclude_features_sources,
1246
+ is_input_same_as_fit,
1247
+ is_demo_dataset,
1248
+ remove_outliers_calc_metrics,
1249
+ progress_bar,
1250
+ progress_callback,
1251
+ )
1392
1252
 
1393
1253
  excluding_search_keys = list(search_keys.keys())
1394
1254
  if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
@@ -1408,6 +1268,15 @@ class FeaturesEnricher(TransformerMixin):
1408
1268
  X_sorted, y_sorted = self._sort_by_keys(X_sampled, y_sampled, search_keys, self.cv)
1409
1269
  enriched_X_sorted, enriched_y_sorted = self._sort_by_keys(enriched_X, y_sampled, search_keys, self.cv)
1410
1270
 
1271
+ group_columns = sorted(self._get_group_columns(search_keys))
1272
+ groups = (
1273
+ None
1274
+ if not group_columns or self.cv != CVType.group_k_fold
1275
+ else reduce(
1276
+ lambda left, right: left + "_" + right, [enriched_X_sorted[c].astype(str) for c in group_columns]
1277
+ ).factorize()[0]
1278
+ )
1279
+
1411
1280
  existing_filtered_enriched_features = [c for c in filtered_enriched_features if c in enriched_X_sorted.columns]
1412
1281
 
1413
1282
  fitting_X = X_sorted[client_features].copy()
@@ -1449,6 +1318,259 @@ class FeaturesEnricher(TransformerMixin):
1449
1318
  enriched_y_sorted,
1450
1319
  fitting_eval_set_dict,
1451
1320
  search_keys,
1321
+ groups,
1322
+ )
1323
+
1324
+ _SampledDataForMetrics = namedtuple(
1325
+ "_SampledDataForMetrics", "X_sampled y_sampled enriched_X eval_set_sampled_dict search_keys"
1326
+ )
1327
+
1328
+ def _sample_data_for_metrics(
1329
+ self,
1330
+ trace_id: str,
1331
+ validated_X: Union[pd.DataFrame, pd.Series, np.ndarray, None],
1332
+ validated_y: Union[pd.DataFrame, pd.Series, np.ndarray, List, None],
1333
+ eval_set: Optional[List[tuple]],
1334
+ exclude_features_sources: Optional[List[str]],
1335
+ is_input_same_as_fit: bool,
1336
+ is_demo_dataset: bool,
1337
+ remove_outliers_calc_metrics: Optional[bool],
1338
+ progress_bar: Optional[ProgressBar],
1339
+ progress_callback: Optional[Callable[[SearchProgress], Any]],
1340
+ ) -> _SampledDataForMetrics:
1341
+ if self.__cached_sampled_datasets is not None and is_input_same_as_fit and remove_outliers_calc_metrics is None:
1342
+ self.logger.info("Cached enriched dataset found - use it")
1343
+ return self.__get_sampled_cached_enriched(exclude_features_sources)
1344
+ elif len(self.feature_importances_) == 0:
1345
+ self.logger.info("No external features selected. So use only input datasets for metrics calculation")
1346
+ return self.__sample_only_input(validated_X, validated_y, eval_set, is_demo_dataset)
1347
+ elif not self.imbalanced and not exclude_features_sources and is_input_same_as_fit:
1348
+ self.logger.info("Dataset is not imbalanced, so use enriched_X from fit")
1349
+ return self.__sample_balanced(
1350
+ validated_X, validated_y, eval_set, trace_id, remove_outliers_calc_metrics, is_demo_dataset
1351
+ )
1352
+ else:
1353
+ self.logger.info("Dataset is imbalanced or exclude_features_sources or X was passed. Run transform")
1354
+ print(bundle.get("prepare_data_for_metrics"))
1355
+ return self.__sample_imbalanced(
1356
+ validated_X,
1357
+ validated_y,
1358
+ eval_set,
1359
+ is_demo_dataset,
1360
+ exclude_features_sources,
1361
+ trace_id,
1362
+ progress_bar,
1363
+ progress_callback,
1364
+ )
1365
+
1366
+ def __get_sampled_cached_enriched(self, exclude_features_sources: Optional[List[str]]) -> _SampledDataForMetrics:
1367
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = self.__cached_sampled_datasets
1368
+ if exclude_features_sources:
1369
+ enriched_X = drop_existing_columns(enriched_X, exclude_features_sources)
1370
+
1371
+ return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1372
+
1373
+ def __sample_only_input(
1374
+ self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
1375
+ ) -> _SampledDataForMetrics:
1376
+ eval_set_sampled_dict = dict()
1377
+ X_sampled, search_keys = self._extend_x(validated_X, is_demo_dataset)
1378
+ y_sampled = validated_y
1379
+ enriched_X = X_sampled
1380
+ if eval_set is not None:
1381
+ for idx in range(len(eval_set)):
1382
+ eval_X_sampled, _ = self._extend_x(eval_set[idx][0], is_demo_dataset)
1383
+ eval_y_sampled = eval_set[idx][1]
1384
+ enriched_eval_X = eval_X_sampled
1385
+ eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
1386
+ self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1387
+
1388
+ return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1389
+
1390
+ def __sample_balanced(
1391
+ self,
1392
+ validated_X: pd.DataFrame,
1393
+ validated_y: pd.Series,
1394
+ eval_set: Optional[List[tuple]],
1395
+ trace_id: str,
1396
+ remove_outliers_calc_metrics: Optional[bool],
1397
+ is_demo_dataset: bool,
1398
+ ) -> _SampledDataForMetrics:
1399
+ eval_set_sampled_dict = dict()
1400
+ search_keys = self.fit_search_keys
1401
+
1402
+ rows_to_drop = None
1403
+ task_type = self.model_task_type or define_task(validated_y, self.logger, silent=True)
1404
+ if task_type == ModelTaskType.REGRESSION:
1405
+ target_outliers_df = self._search_task.get_target_outliers(trace_id)
1406
+ if target_outliers_df is not None and len(target_outliers_df) > 0:
1407
+ outliers = pd.merge(
1408
+ self.df_with_original_index,
1409
+ target_outliers_df,
1410
+ left_on=SYSTEM_RECORD_ID,
1411
+ right_on=SYSTEM_RECORD_ID,
1412
+ how="inner",
1413
+ )
1414
+ top_outliers = outliers.sort_values(by=TARGET, ascending=False)[TARGET].head(3)
1415
+ if remove_outliers_calc_metrics is None or remove_outliers_calc_metrics is True:
1416
+ rows_to_drop = outliers
1417
+ not_msg = ""
1418
+ else:
1419
+ not_msg = "not "
1420
+ msg = bundle.get("target_outliers_warning").format(len(target_outliers_df), top_outliers, not_msg)
1421
+ print(msg)
1422
+ self.logger.warning(msg)
1423
+
1424
+ # index in each dataset (X, eval set) may be reordered and non unique, but index in validated datasets
1425
+ # can differs from it
1426
+ enriched_Xy, enriched_eval_sets = self.__enrich(
1427
+ self.df_with_original_index,
1428
+ self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True),
1429
+ rows_to_drop=rows_to_drop,
1430
+ )
1431
+
1432
+ enriched_X = drop_existing_columns(enriched_Xy, TARGET)
1433
+ X_sampled, search_keys = self._extend_x(validated_X, is_demo_dataset)
1434
+ y_sampled = enriched_Xy[TARGET].copy()
1435
+
1436
+ self.logger.info(f"Shape of enriched_X: {enriched_X.shape}")
1437
+ self.logger.info(f"Shape of X after sampling: {X_sampled.shape}")
1438
+ self.logger.info(f"Shape of y after sampling: {len(y_sampled)}")
1439
+
1440
+ if eval_set is not None:
1441
+ if len(enriched_eval_sets) != len(eval_set):
1442
+ raise ValidationError(
1443
+ bundle.get("metrics_eval_set_count_diff").format(len(enriched_eval_sets), len(eval_set))
1444
+ )
1445
+
1446
+ for idx in range(len(eval_set)):
1447
+ enriched_eval_X = drop_existing_columns(enriched_eval_sets[idx + 1], TARGET)
1448
+ eval_X_sampled, _ = self._extend_x(eval_set[idx][0], is_demo_dataset)
1449
+ eval_y_sampled = eval_set[idx][1].copy()
1450
+ eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
1451
+
1452
+ self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1453
+
1454
+ return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1455
+
1456
+ def __sample_imbalanced(
1457
+ self,
1458
+ validated_X: pd.DataFrame,
1459
+ validated_y: pd.Series,
1460
+ eval_set: Optional[List[tuple]],
1461
+ is_demo_dataset: bool,
1462
+ exclude_features_sources: Optional[List[str]],
1463
+ trace_id: str,
1464
+ progress_bar: Optional[ProgressBar],
1465
+ progress_callback: Optional[Callable[[SearchProgress], Any]],
1466
+ ) -> _SampledDataForMetrics:
1467
+ eval_set_sampled_dict = dict()
1468
+ if eval_set is not None:
1469
+ self.logger.info("Transform with eval_set")
1470
+ # concatenate X and eval_set with eval_set_index
1471
+ df_with_eval_set_index = validated_X.copy()
1472
+ df_with_eval_set_index[TARGET] = validated_y
1473
+ df_with_eval_set_index[EVAL_SET_INDEX] = 0
1474
+ for idx, eval_pair in enumerate(eval_set):
1475
+ eval_x, eval_y = self._validate_eval_set_pair(validated_X, eval_pair)
1476
+ eval_df_with_index = eval_x.copy()
1477
+ eval_df_with_index[TARGET] = eval_y
1478
+ eval_df_with_index[EVAL_SET_INDEX] = idx + 1
1479
+ df_with_eval_set_index = pd.concat([df_with_eval_set_index, eval_df_with_index])
1480
+
1481
+ # downsample if need to eval_set threshold
1482
+ num_samples = _num_samples(df_with_eval_set_index)
1483
+ if num_samples > Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD:
1484
+ self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS}")
1485
+ df_with_eval_set_index = df_with_eval_set_index.sample(
1486
+ n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state
1487
+ )
1488
+
1489
+ X_sampled = (
1490
+ df_with_eval_set_index[df_with_eval_set_index[EVAL_SET_INDEX] == 0]
1491
+ .copy()
1492
+ .drop(columns=[EVAL_SET_INDEX, TARGET])
1493
+ )
1494
+ X_sampled, search_keys = self._extend_x(X_sampled, is_demo_dataset)
1495
+ y_sampled = df_with_eval_set_index[df_with_eval_set_index[EVAL_SET_INDEX] == 0].copy()[TARGET]
1496
+ eval_set_sampled_dict = dict()
1497
+ for idx in range(len(eval_set)):
1498
+ eval_x_sampled = (
1499
+ df_with_eval_set_index[df_with_eval_set_index[EVAL_SET_INDEX] == (idx + 1)]
1500
+ .copy()
1501
+ .drop(columns=[EVAL_SET_INDEX, TARGET])
1502
+ )
1503
+ eval_x_sampled, _ = self._extend_x(eval_x_sampled, is_demo_dataset)
1504
+ eval_y_sampled = df_with_eval_set_index[df_with_eval_set_index[EVAL_SET_INDEX] == (idx + 1)].copy()[
1505
+ TARGET
1506
+ ]
1507
+ eval_set_sampled_dict[idx] = (eval_x_sampled, eval_y_sampled)
1508
+
1509
+ df_with_eval_set_index.drop(columns=TARGET, inplace=True)
1510
+
1511
+ enriched = self.transform(
1512
+ df_with_eval_set_index,
1513
+ exclude_features_sources=exclude_features_sources,
1514
+ silent_mode=True,
1515
+ trace_id=trace_id,
1516
+ metrics_calculation=True,
1517
+ progress_bar=progress_bar,
1518
+ progress_callback=progress_callback,
1519
+ )
1520
+ if enriched is None:
1521
+ return None
1522
+
1523
+ enriched_X = enriched[enriched[EVAL_SET_INDEX] == 0].copy()
1524
+ enriched_X.drop(columns=EVAL_SET_INDEX, inplace=True)
1525
+
1526
+ for idx in range(len(eval_set)):
1527
+ enriched_eval_x = enriched[enriched[EVAL_SET_INDEX] == (idx + 1)].copy()
1528
+ enriched_eval_x.drop(columns=EVAL_SET_INDEX, inplace=True)
1529
+ eval_x_sampled, eval_y_sampled = eval_set_sampled_dict[idx]
1530
+ eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
1531
+ else:
1532
+ self.logger.info("Transform without eval_set")
1533
+ df = self.X.copy()
1534
+
1535
+ df[TARGET] = validated_y
1536
+ num_samples = _num_samples(df)
1537
+ if num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
1538
+ self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
1539
+ df = df.sample(n=Dataset.FIT_SAMPLE_ROWS, random_state=self.random_state)
1540
+
1541
+ X_sampled = df.copy().drop(columns=TARGET)
1542
+ X_sampled, search_keys = self._extend_x(X_sampled, is_demo_dataset)
1543
+ y_sampled = df.copy()[TARGET]
1544
+
1545
+ df.drop(columns=TARGET, inplace=True)
1546
+
1547
+ enriched_X = self.transform(
1548
+ df,
1549
+ exclude_features_sources=exclude_features_sources,
1550
+ silent_mode=True,
1551
+ trace_id=trace_id,
1552
+ metrics_calculation=True,
1553
+ progress_bar=progress_bar,
1554
+ progress_callback=progress_callback,
1555
+ )
1556
+ if enriched_X is None:
1557
+ return None
1558
+
1559
+ self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1560
+
1561
+ return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
1562
+
1563
+ def __mk_sampled_data_tuple(
1564
+ self,
1565
+ X_sampled: pd.DataFrame,
1566
+ y_sampled: pd.Series,
1567
+ enriched_X: pd.DataFrame,
1568
+ eval_set_sampled_dict: Dict,
1569
+ search_keys: Dict,
1570
+ ):
1571
+ search_keys = {k: v for k, v in search_keys.items() if k in X_sampled.columns.to_list()}
1572
+ return FeaturesEnricher._SampledDataForMetrics(
1573
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys
1452
1574
  )
1453
1575
 
1454
1576
  def get_search_id(self) -> Optional[str]:
@@ -1868,20 +1990,8 @@ class FeaturesEnricher(TransformerMixin):
1868
1990
 
1869
1991
  df = self.__add_country_code(df, self.fit_search_keys)
1870
1992
 
1871
- # Check Multivariate time series
1872
1993
  date_column = self._get_date_column(self.fit_search_keys)
1873
- if (
1874
- self.cv is None
1875
- and date_column
1876
- and model_task_type == ModelTaskType.REGRESSION
1877
- and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys())) == 0
1878
- and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
1879
- ):
1880
- msg = bundle.get("multivariate_timeseries_detected")
1881
- print(msg)
1882
- self.logger.warning(msg)
1883
- self.cv = CVType.blocked_time_series
1884
- self.runtime_parameters.properties["cv_type"] = self.cv.name
1994
+ self.__adjust_cv(df, date_column, model_task_type)
1885
1995
 
1886
1996
  self.fit_generated_features = []
1887
1997
 
@@ -2046,8 +2156,9 @@ class FeaturesEnricher(TransformerMixin):
2046
2156
 
2047
2157
  self.__show_selected_features(self.fit_search_keys)
2048
2158
 
2049
- if not self.warning_counter.has_warnings():
2050
- self.__display_support_link(bundle.get("all_ok_community_invite"))
2159
+ autofe_description = self.get_autofe_features_description()
2160
+ if autofe_description is not None:
2161
+ display_html_dataframe(autofe_description, autofe_description, "*Description of AutoFE feature names")
2051
2162
 
2052
2163
  if self._has_paid_features(exclude_features_sources):
2053
2164
  if calculate_metrics is not None and calculate_metrics:
@@ -2089,6 +2200,35 @@ class FeaturesEnricher(TransformerMixin):
2089
2200
 
2090
2201
  self.__show_report_button()
2091
2202
 
2203
+ if not self.warning_counter.has_warnings():
2204
+ self.__display_support_link(bundle.get("all_ok_community_invite"))
2205
+
2206
+ def __adjust_cv(self, df: pd.DataFrame, date_column: pd.Series, model_task_type: ModelTaskType):
2207
+ # Check Multivariate time series
2208
+ if (
2209
+ self.cv is None
2210
+ and date_column
2211
+ and model_task_type == ModelTaskType.REGRESSION
2212
+ and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys())) == 0
2213
+ and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
2214
+ ):
2215
+ msg = bundle.get("multivariate_timeseries_detected")
2216
+ self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
2217
+ elif (
2218
+ (self.cv is None or self.cv == CVType.k_fold)
2219
+ and model_task_type != ModelTaskType.REGRESSION
2220
+ and self._get_group_columns(self.fit_search_keys)
2221
+ ):
2222
+ msg = bundle.get("group_k_fold_in_classification")
2223
+ self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
2224
+
2225
+ def __override_cv(self, cv: CVType, msg: str, print_warning: bool = True):
2226
+ if print_warning:
2227
+ print(msg)
2228
+ self.logger.warning(msg)
2229
+ self.cv = cv
2230
+ self.runtime_parameters.properties["cv_type"] = self.cv.name
2231
+
2092
2232
  def get_columns_by_search_keys(self, keys: List[str]):
2093
2233
  if "HEM" in keys:
2094
2234
  keys.append("EMAIL")
@@ -2384,6 +2524,10 @@ class FeaturesEnricher(TransformerMixin):
2384
2524
  if t in [SearchKey.DATE, SearchKey.DATETIME]:
2385
2525
  return col
2386
2526
 
2527
+ @staticmethod
2528
+ def _get_group_columns(search_keys: Dict[str, SearchKey]) -> List[str]:
2529
+ return [col for col, t in search_keys.items() if t not in [SearchKey.DATE, SearchKey.DATETIME]]
2530
+
2387
2531
  @staticmethod
2388
2532
  def __get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2389
2533
  for col, t in search_keys.items():
@@ -2400,8 +2544,10 @@ class FeaturesEnricher(TransformerMixin):
2400
2544
  self, df: pd.DataFrame, meaning_types: Dict[str, FileColumnMeaningType], search_keys: Dict[str, SearchKey]
2401
2545
  ) -> pd.DataFrame:
2402
2546
  # save original order or rows
2403
- df = df.reset_index(drop=True).reset_index()
2404
- df = df.rename(columns={DEFAULT_INDEX: ORIGINAL_INDEX})
2547
+ original_index_name = df.index.name
2548
+ index_name = df.index.name or DEFAULT_INDEX
2549
+ df = df.reset_index().reset_index(drop=True)
2550
+ df = df.rename(columns={index_name: ORIGINAL_INDEX})
2405
2551
 
2406
2552
  # order by date and idempotent order by other keys
2407
2553
  if self.cv not in [CVType.time_series, CVType.blocked_time_series]:
@@ -2432,7 +2578,8 @@ class FeaturesEnricher(TransformerMixin):
2432
2578
 
2433
2579
  # return original order
2434
2580
  df = df.set_index(ORIGINAL_INDEX)
2435
- df = df.sort_index()
2581
+ df.index.name = original_index_name
2582
+ # df = df.sort_index()
2436
2583
 
2437
2584
  meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
2438
2585
  return df
@@ -2493,6 +2640,9 @@ class FeaturesEnricher(TransformerMixin):
2493
2640
  self.logger.warning(f"X contain columns with same name as returned from backend: {dup_features}")
2494
2641
  raise ValidationError(bundle.get("returned_features_same_as_passed").format(dup_features))
2495
2642
 
2643
+ # index gets overwritten by result_features
2644
+ original_index_name = df_with_original_index.index.name
2645
+ df_with_original_index = df_with_original_index.reset_index()
2496
2646
  result_features = pd.merge(
2497
2647
  df_with_original_index,
2498
2648
  result_features,
@@ -2500,6 +2650,8 @@ class FeaturesEnricher(TransformerMixin):
2500
2650
  right_on=SYSTEM_RECORD_ID,
2501
2651
  how="left" if is_transform else "inner",
2502
2652
  )
2653
+ result_features = result_features.set_index(original_index_name or DEFAULT_INDEX)
2654
+ result_features.index.name = original_index_name
2503
2655
 
2504
2656
  if rows_to_drop is not None:
2505
2657
  print(f"Before dropping target outliers size: {len(result_features)}")
@@ -2687,6 +2839,52 @@ class FeaturesEnricher(TransformerMixin):
2687
2839
  else:
2688
2840
  self.logger.warning("Empty features info")
2689
2841
 
2842
def get_autofe_features_description(self):
    """Build a DataFrame describing each AutoFE-generated feature.

    Each row contains the data sources, the generated feature name, the
    base (input) features, and the AutoFE function parsed from the
    feature name. Returns None when there is no AutoFE metadata, when no
    description could be built, or on any failure (best-effort helper).
    """
    try:
        autofe_meta = self._search_task.get_autofe_metadata()
        if autofe_meta is None:
            return None
        features_meta = self._search_task.get_all_features_metadata_v2()

        def get_feature_by_display_index(idx):
            # Match on the "_{idx}" suffix: a bare endswith(str(idx))
            # would also match longer indices (e.g. index 2 matching
            # a feature named "..._12").
            suffix = f"_{idx}"
            for m in features_meta:
                if m.name.endswith(suffix):
                    return m

        descriptions = []
        for m in autofe_meta:
            description = {}

            feature_meta = get_feature_by_display_index(m.display_index)
            if feature_meta is None:
                self.logger.warning(f"Feature meta for display index {m.display_index} not found")
                continue
            description["Sources"] = feature_meta.data_source.replace("AutoFE: features from ", "")
            description["Feature name"] = feature_meta.name

            # One "Feature N" column per base (input) column, 1-based.
            for feature_idx, bc in enumerate(m.base_columns, start=1):
                description[f"Feature {feature_idx}"] = bc.hashed_name

            # Feature names look like f_autofe_<function>_<display_index>.
            # Escape the index and anchor the pattern so a shorter index
            # cannot match a longer trailing one (same collision as above).
            match = re.match(f"f_autofe_(.+)_{re.escape(str(m.display_index))}$", feature_meta.name)
            if match is None:
                self.logger.warning(f"Failed to infer autofe function from name {feature_meta.name}")
            else:
                description["Function"] = match.group(1)

            descriptions.append(description)

        if len(descriptions) == 0:
            return None

        descriptions_df = pd.DataFrame(descriptions)
        # Rows may define different "Feature N" columns; blank the holes.
        descriptions_df.fillna("", inplace=True)
        return descriptions_df
    except Exception:
        # Best-effort: a broken description must never break the caller.
        self.logger.exception("Failed to generate AutoFE features description")
        return None
2887
+
2690
2888
  @staticmethod
2691
2889
  def _group_relevant_data_sources(df: pd.DataFrame) -> pd.DataFrame:
2692
2890
  return (
@@ -2889,8 +3087,10 @@ class FeaturesEnricher(TransformerMixin):
2889
3087
  relevant_features_df=self._features_info_without_links,
2890
3088
  relevant_datasources_df=self.relevant_data_sources,
2891
3089
  metrics_df=self.metrics,
3090
+ autofe_descriptions_df=self.get_autofe_features_description(),
2892
3091
  search_id=self._search_task.search_task_id,
2893
3092
  email=get_rest_client(self.endpoint, self.api_key).get_current_email(),
3093
+ search_keys=[str(sk) for sk in self.search_keys.values()],
2894
3094
  )
2895
3095
  except Exception:
2896
3096
  pass
@@ -3014,19 +3214,15 @@ class FeaturesEnricher(TransformerMixin):
3014
3214
  def __display_support_link(self, link_text: Optional[str] = None):
3015
3215
  support_link = bundle.get("support_link")
3016
3216
  link_text = link_text or bundle.get("support_text")
3017
- # badge = bundle.get("slack_community_bage")
3018
- # alt = bundle.get("slack_community_alt")
3019
3217
  try:
3020
3218
  from IPython.display import HTML, display
3021
3219
 
3022
3220
  _ = get_ipython() # type: ignore
3023
3221
  self.logger.warning(link_text)
3024
- print(link_text)
3025
3222
  display(
3026
3223
  HTML(
3027
- f"""<a href='{support_link}' target='_blank' rel='noopener noreferrer'>
3028
- Support</a>"""
3029
- # <img alt='{alt}' src='{badge}'></a>
3224
+ f"""{link_text} <a href='{support_link}' target='_blank' rel='noopener noreferrer'>
3225
+ here</a>"""
3030
3226
  )
3031
3227
  )
3032
3228
  except (ImportError, NameError):