upgini 1.2.16a3654.dev2__py3-none-any.whl → 1.2.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.16a3654.dev2"
1
+ __version__ = "1.2.17"
@@ -165,7 +165,6 @@ class FeaturesEnricher(TransformerMixin):
165
165
  RANDOM_STATE = 42
166
166
  CALCULATE_METRICS_THRESHOLD = 50_000_000
167
167
  CALCULATE_METRICS_MIN_THRESHOLD = 500
168
- TEXT_FEATURES_THRESHOLD = 5_000
169
168
  GENERATE_FEATURES_LIMIT = 10
170
169
  EMPTY_FEATURES_INFO = pd.DataFrame(
171
170
  columns=[
@@ -337,6 +336,9 @@ class FeaturesEnricher(TransformerMixin):
337
336
  self.exclude_columns = exclude_columns
338
337
  self.baseline_score_column = baseline_score_column
339
338
  self.add_date_if_missing = add_date_if_missing
339
+ self.features_info_display_handle = None
340
+ self.data_sources_display_handle = None
341
+ self.report_button_handle = None
340
342
 
341
343
  def _get_api_key(self):
342
344
  return self._api_key
@@ -872,6 +874,13 @@ class FeaturesEnricher(TransformerMixin):
872
874
  else None
873
875
  )
874
876
 
877
+ if self.X is None:
878
+ self.X = X
879
+ if self.y is None:
880
+ self.y = y
881
+ if self.eval_set is None:
882
+ self.eval_set = effective_eval_set
883
+
875
884
  try:
876
885
  self.__log_debug_information(
877
886
  validated_X,
@@ -939,16 +948,14 @@ class FeaturesEnricher(TransformerMixin):
939
948
 
940
949
  gc.collect()
941
950
 
942
- text_features = self.generate_features if fitting_X.shape[0] >= self.TEXT_FEATURES_THRESHOLD else []
951
+ if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
952
+ print(self.bundle.get("metrics_no_important_free_features"))
953
+ self.logger.warning("No client or free relevant ADS features found to calculate metrics")
954
+ self.warning_counter.increment()
955
+ return None
943
956
 
944
957
  print(self.bundle.get("metrics_start"))
945
958
  with Spinner():
946
- if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
947
- print(self.bundle.get("metrics_no_important_free_features"))
948
- self.logger.warning("No client or free relevant ADS features found to calculate metrics")
949
- self.warning_counter.increment()
950
- return None
951
-
952
959
  self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
953
960
 
954
961
  has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
@@ -962,7 +969,7 @@ class FeaturesEnricher(TransformerMixin):
962
969
  fitting_enriched_X,
963
970
  scoring,
964
971
  groups=groups,
965
- text_features=text_features,
972
+ text_features=self.generate_features,
966
973
  has_date=has_date,
967
974
  )
968
975
  metric = wrapper.metric_name
@@ -989,10 +996,10 @@ class FeaturesEnricher(TransformerMixin):
989
996
  cat_features,
990
997
  add_params=custom_loss_add_params,
991
998
  groups=groups,
992
- text_features=text_features,
999
+ text_features=self.generate_features,
993
1000
  has_date=has_date,
994
1001
  )
995
- etalon_metric = baseline_estimator.cross_val_predict(
1002
+ etalon_metric, _ = baseline_estimator.cross_val_predict(
996
1003
  fitting_X, y_sorted, self.baseline_score_column
997
1004
  )
998
1005
  if etalon_metric is None:
@@ -1023,10 +1030,16 @@ class FeaturesEnricher(TransformerMixin):
1023
1030
  cat_features,
1024
1031
  add_params=custom_loss_add_params,
1025
1032
  groups=groups,
1026
- text_features=text_features,
1033
+ text_features=self.generate_features,
1027
1034
  has_date=has_date,
1028
1035
  )
1029
- enriched_metric = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
1036
+ enriched_metric, enriched_shaps = enriched_estimator.cross_val_predict(
1037
+ fitting_enriched_X, enriched_y_sorted
1038
+ )
1039
+
1040
+ if enriched_shaps is not None:
1041
+ self._update_shap_values(enriched_shaps)
1042
+
1030
1043
  if enriched_metric is None:
1031
1044
  self.logger.warning(
1032
1045
  f"Enriched {metric} on train combined features is None (maybe all features was removed)"
@@ -1159,13 +1172,6 @@ class FeaturesEnricher(TransformerMixin):
1159
1172
  elif uplift_col in metrics_df.columns and (metrics_df[uplift_col] < 0).any():
1160
1173
  self.logger.warning("Uplift is negative")
1161
1174
 
1162
- if self.X is None:
1163
- self.X = X
1164
- if self.y is None:
1165
- self.y = y
1166
- if self.eval_set is None:
1167
- self.eval_set = effective_eval_set
1168
-
1169
1175
  return metrics_df
1170
1176
  except Exception as e:
1171
1177
  error_message = "Failed to calculate metrics" + (
@@ -1190,6 +1196,72 @@ class FeaturesEnricher(TransformerMixin):
1190
1196
  finally:
1191
1197
  self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
1192
1198
 
1199
+ def _update_shap_values(self, new_shaps: Dict[str, float]):
1200
+ new_shaps = {
1201
+ feature: self._round_shap_value(shap)
1202
+ for feature, shap in new_shaps.items()
1203
+ if feature in self.feature_names_
1204
+ }
1205
+ features_importances = list(new_shaps.items())
1206
+ features_importances.sort(key=lambda m: (-m[1], m[0]))
1207
+ self.feature_names_, self.feature_importances_ = zip(*features_importances)
1208
+ self.feature_names_ = list(self.feature_names_)
1209
+ self.feature_importances_ = list(self.feature_importances_)
1210
+
1211
+ feature_name_header = self.bundle.get("features_info_name")
1212
+ shap_value_header = self.bundle.get("features_info_shap")
1213
+
1214
+ def update_shap(row):
1215
+ return new_shaps.get(row[feature_name_header], row[shap_value_header])
1216
+
1217
+ self.features_info[shap_value_header] = self.features_info.apply(update_shap, axis=1)
1218
+ self._internal_features_info[shap_value_header] = self._internal_features_info.apply(update_shap, axis=1)
1219
+ self._features_info_without_links[shap_value_header] = self._features_info_without_links.apply(
1220
+ update_shap, axis=1
1221
+ )
1222
+ self.logger.info(f"Recalculated SHAP values:\n{self._features_info_without_links}")
1223
+
1224
+ self.features_info.sort_values(by=shap_value_header, ascending=False, inplace=True)
1225
+ self._internal_features_info.sort_values(by=shap_value_header, ascending=False, inplace=True)
1226
+ self._features_info_without_links.sort_values(by=shap_value_header, ascending=False, inplace=True)
1227
+
1228
+ self.relevant_data_sources = self._group_relevant_data_sources(self.features_info, self.bundle)
1229
+ self._relevant_data_sources_wo_links = self._group_relevant_data_sources(
1230
+ self._features_info_without_links, self.bundle
1231
+ )
1232
+
1233
+ if self.features_info_display_handle is not None:
1234
+ try:
1235
+ _ = get_ipython() # type: ignore
1236
+
1237
+ display_html_dataframe(
1238
+ self.features_info,
1239
+ self._features_info_without_links,
1240
+ self.bundle.get("relevant_features_header"),
1241
+ display_handle=self.features_info_display_handle,
1242
+ )
1243
+ except (ImportError, NameError):
1244
+ print(self._internal_features_info)
1245
+ if self.data_sources_display_handle is not None:
1246
+ try:
1247
+ _ = get_ipython() # type: ignore
1248
+
1249
+ display_html_dataframe(
1250
+ self.relevant_data_sources,
1251
+ self._relevant_data_sources_wo_links,
1252
+ self.bundle.get("relevant_features_header"),
1253
+ display_handle=self.data_sources_display_handle,
1254
+ )
1255
+ except (ImportError, NameError):
1256
+ print(self._relevant_data_sources_wo_links)
1257
+ if self.report_button_handle is not None:
1258
+ try:
1259
+ _ = get_ipython() # type: ignore
1260
+
1261
+ self.__show_report_button(display_handle=self.report_button_handle)
1262
+ except (ImportError, NameError):
1263
+ pass
1264
+
1193
1265
  def _check_train_and_eval_target_distribution(self, y, eval_set_dict):
1194
1266
  uneven_distribution = False
1195
1267
  for eval_set in eval_set_dict.values():
@@ -1518,11 +1590,19 @@ class FeaturesEnricher(TransformerMixin):
1518
1590
  self.logger.info("No external features selected. So use only input datasets for metrics calculation")
1519
1591
  return self.__sample_only_input(validated_X, validated_y, eval_set, is_demo_dataset)
1520
1592
  # TODO save and check if dataset was deduplicated - use imbalance branch for such case
1521
- elif not self.imbalanced and not exclude_features_sources and is_input_same_as_fit:
1593
+ elif (
1594
+ not self.imbalanced
1595
+ and not exclude_features_sources
1596
+ and is_input_same_as_fit
1597
+ and self.df_with_original_index is not None
1598
+ ):
1522
1599
  self.logger.info("Dataset is not imbalanced, so use enriched_X from fit")
1523
1600
  return self.__sample_balanced(eval_set, trace_id, remove_outliers_calc_metrics)
1524
1601
  else:
1525
- self.logger.info("Dataset is imbalanced or exclude_features_sources or X was passed. Run transform")
1602
+ self.logger.info(
1603
+ "Dataset is imbalanced or exclude_features_sources or X was passed or this is saved search."
1604
+ " Run transform"
1605
+ )
1526
1606
  print(self.bundle.get("prepare_data_for_metrics"))
1527
1607
  return self.__sample_imbalanced(
1528
1608
  validated_X,
@@ -2028,6 +2108,13 @@ class FeaturesEnricher(TransformerMixin):
2028
2108
  runtime_parameters = self._get_copy_of_runtime_parameters()
2029
2109
  features_for_transform = self._search_task.get_features_for_transform() or []
2030
2110
  if len(features_for_transform) > 0:
2111
+ missing_features_for_transform = [
2112
+ columns_renaming.get(f) for f in features_for_transform if f not in df.columns
2113
+ ]
2114
+ if len(missing_features_for_transform) > 0:
2115
+ raise ValidationError(
2116
+ self.bundle.get("missing_features_for_transform").format(missing_features_for_transform)
2117
+ )
2031
2118
  runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
2032
2119
 
2033
2120
  columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
@@ -2702,10 +2789,10 @@ class FeaturesEnricher(TransformerMixin):
2702
2789
  progress_callback,
2703
2790
  )
2704
2791
  except Exception:
2705
- self.__show_report_button()
2792
+ self.report_button_handle = self.__show_report_button(display_id="report_button")
2706
2793
  raise
2707
2794
 
2708
- self.__show_report_button()
2795
+ self.report_button_handle = self.__show_report_button(display_id="report_button")
2709
2796
 
2710
2797
  if not self.warning_counter.has_warnings():
2711
2798
  self.__display_support_link(self.bundle.get("all_ok_community_invite"))
@@ -3377,6 +3464,13 @@ class FeaturesEnricher(TransformerMixin):
3377
3464
 
3378
3465
  return result_train, result_eval_sets
3379
3466
 
3467
+ @staticmethod
3468
+ def _round_shap_value(shap: float) -> float:
3469
+ if shap > 0.0 and shap < 0.0001:
3470
+ return 0.0001
3471
+ else:
3472
+ return round(shap, 4)
3473
+
3380
3474
  def __prepare_feature_importances(self, trace_id: str, x_columns: List[str], silent=False):
3381
3475
  llm_source = "LLM with external data augmentation"
3382
3476
  if self._search_task is None:
@@ -3394,12 +3488,6 @@ class FeaturesEnricher(TransformerMixin):
3394
3488
  features_info_without_links = []
3395
3489
  internal_features_info = []
3396
3490
 
3397
- def round_shap_value(shap: float) -> float:
3398
- if shap > 0.0 and shap < 0.0001:
3399
- return 0.0001
3400
- else:
3401
- return round(shap, 4)
3402
-
3403
3491
  def list_or_single(lst: List[str], single: str):
3404
3492
  return lst or ([single] if single else [])
3405
3493
 
@@ -3432,7 +3520,7 @@ class FeaturesEnricher(TransformerMixin):
3432
3520
 
3433
3521
  feature_sample = []
3434
3522
  self.feature_names_.append(feature_meta.name)
3435
- self.feature_importances_.append(round_shap_value(feature_meta.shap_value))
3523
+ self.feature_importances_.append(self._round_shap_value(feature_meta.shap_value))
3436
3524
  if feature_meta.name in features_df.columns:
3437
3525
  feature_sample = np.random.choice(features_df[feature_meta.name].dropna().unique(), 3).tolist()
3438
3526
  if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
@@ -3471,7 +3559,7 @@ class FeaturesEnricher(TransformerMixin):
3471
3559
  features_info.append(
3472
3560
  {
3473
3561
  self.bundle.get("features_info_name"): feature_name,
3474
- self.bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
3562
+ self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
3475
3563
  self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
3476
3564
  self.bundle.get("features_info_value_preview"): feature_sample,
3477
3565
  self.bundle.get("features_info_provider"): provider,
@@ -3482,7 +3570,7 @@ class FeaturesEnricher(TransformerMixin):
3482
3570
  features_info_without_links.append(
3483
3571
  {
3484
3572
  self.bundle.get("features_info_name"): internal_feature_name,
3485
- self.bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
3573
+ self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
3486
3574
  self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
3487
3575
  self.bundle.get("features_info_value_preview"): feature_sample,
3488
3576
  self.bundle.get("features_info_provider"): internal_provider,
@@ -3494,7 +3582,7 @@ class FeaturesEnricher(TransformerMixin):
3494
3582
  {
3495
3583
  self.bundle.get("features_info_name"): internal_feature_name,
3496
3584
  "feature_link": feature_meta.doc_link,
3497
- self.bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
3585
+ self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
3498
3586
  self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
3499
3587
  self.bundle.get("features_info_value_preview"): feature_sample,
3500
3588
  self.bundle.get("features_info_provider"): internal_provider,
@@ -3774,14 +3862,18 @@ class FeaturesEnricher(TransformerMixin):
3774
3862
  print(Format.GREEN + Format.BOLD + msg + Format.END)
3775
3863
  self.logger.info(msg)
3776
3864
  if len(self.feature_names_) > 0:
3777
- display_html_dataframe(
3778
- self.features_info, self._features_info_without_links, self.bundle.get("relevant_features_header")
3865
+ self.features_info_display_handle = display_html_dataframe(
3866
+ self.features_info,
3867
+ self._features_info_without_links,
3868
+ self.bundle.get("relevant_features_header"),
3869
+ display_id="features_info",
3779
3870
  )
3780
3871
 
3781
- display_html_dataframe(
3872
+ self.data_sources_display_handle = display_html_dataframe(
3782
3873
  self.relevant_data_sources,
3783
3874
  self._relevant_data_sources_wo_links,
3784
3875
  self.bundle.get("relevant_data_sources_header"),
3876
+ display_id="data_sources",
3785
3877
  )
3786
3878
  else:
3787
3879
  msg = self.bundle.get("features_info_zero_important_features")
@@ -3792,9 +3884,9 @@ class FeaturesEnricher(TransformerMixin):
3792
3884
  print(msg)
3793
3885
  print(self._internal_features_info)
3794
3886
 
3795
- def __show_report_button(self):
3887
+ def __show_report_button(self, display_id: Optional[str] = None, display_handle=None):
3796
3888
  try:
3797
- prepare_and_show_report(
3889
+ return prepare_and_show_report(
3798
3890
  relevant_features_df=self._features_info_without_links,
3799
3891
  relevant_datasources_df=self.relevant_data_sources,
3800
3892
  metrics_df=self.metrics,
@@ -3802,6 +3894,8 @@ class FeaturesEnricher(TransformerMixin):
3802
3894
  search_id=self._search_task.search_task_id,
3803
3895
  email=self.rest_client.get_current_email(),
3804
3896
  search_keys=[str(sk) for sk in self.search_keys.values()],
3897
+ display_id=display_id,
3898
+ display_handle=display_handle,
3805
3899
  )
3806
3900
  except Exception:
3807
3901
  pass
upgini/metrics.py CHANGED
@@ -3,13 +3,14 @@ from __future__ import annotations
3
3
  import inspect
4
4
  import logging
5
5
  import re
6
+ from collections import defaultdict
6
7
  from copy import deepcopy
7
8
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union
8
9
 
9
10
  import catboost
10
11
  import numpy as np
11
12
  import pandas as pd
12
- from catboost import CatBoostClassifier, CatBoostRegressor
13
+ from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor, Pool
13
14
  from numpy import log1p
14
15
  from pandas.api.types import is_numeric_dtype
15
16
  from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
@@ -63,7 +64,7 @@ CATBOOST_BINARY_PARAMS = {
63
64
  "verbose": False,
64
65
  "random_state": DEFAULT_RANDOM_STATE,
65
66
  "allow_writing_files": False,
66
- "auto_class_weights": "SqrtBalanced",
67
+ "auto_class_weights": "Balanced",
67
68
  }
68
69
 
69
70
  CATBOOST_MULTICLASS_PARAMS = {
@@ -81,7 +82,7 @@ CATBOOST_MULTICLASS_PARAMS = {
81
82
  "verbose": False,
82
83
  "random_state": DEFAULT_RANDOM_STATE,
83
84
  "allow_writing_files": False,
84
- "auto_class_weights": "SqrtBalanced",
85
+ "auto_class_weights": "Balanced",
85
86
  }
86
87
 
87
88
  LIGHTGBM_PARAMS = {
@@ -288,9 +289,12 @@ class EstimatorWrapper:
288
289
  x, y, _ = self._prepare_data(x, y)
289
290
  return x, y, {}
290
291
 
292
+ def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
293
+ return None
294
+
291
295
  def cross_val_predict(
292
296
  self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
293
- ) -> Optional[float]:
297
+ ) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
294
298
  x, y, groups, fit_params = self._prepare_to_fit(x, y)
295
299
 
296
300
  if x.shape[1] == 0:
@@ -298,6 +302,7 @@ class EstimatorWrapper:
298
302
 
299
303
  scorer = check_scoring(self.estimator, scoring=self.scorer)
300
304
 
305
+ shap_values_all_folds = defaultdict(list)
301
306
  if baseline_score_column is not None and self.metric_name == "GINI":
302
307
  self.logger.info("Calculate baseline GINI on passed baseline_score_column and target")
303
308
  metric = roc_auc_score(y, x[baseline_score_column])
@@ -319,7 +324,29 @@ class EstimatorWrapper:
319
324
  self.check_fold_metrics(metrics_by_fold)
320
325
 
321
326
  metric = np.mean(metrics_by_fold) * self.multiplier
322
- return self.post_process_metric(metric)
327
+
328
+ splits = self.cv.split(x, y, groups)
329
+
330
+ for estimator, split in zip(self.cv_estimators, splits):
331
+ _, validation_idx = split
332
+ cv_x = x.iloc[validation_idx]
333
+ cv_y = y[validation_idx]
334
+ shaps = self.calculate_shap(cv_x, cv_y, estimator)
335
+ if shaps is not None:
336
+ for feature, shap_value in shaps.items():
337
+ # shap_values_all_folds[feature] = shap_values_all_folds.get(feature, []) + shap_value.tolist()
338
+ shap_values_all_folds[feature].extend(shap_value.tolist())
339
+
340
+ if shap_values_all_folds:
341
+ average_shap_values = {
342
+ feature: np.mean(np.array(shaps)) for feature, shaps in shap_values_all_folds.items() if len(shaps) > 0
343
+ }
344
+ if len(average_shap_values) == 0:
345
+ average_shap_values = None
346
+ else:
347
+ average_shap_values = None
348
+
349
+ return self.post_process_metric(metric), average_shap_values
323
350
 
324
351
  def check_fold_metrics(self, metrics_by_fold: List[float]):
325
352
  first_metric_sign = 1 if metrics_by_fold[0] >= 0 else -1
@@ -453,6 +480,7 @@ class CatBoostWrapper(EstimatorWrapper):
453
480
  )
454
481
  self.cat_features = None
455
482
  self.emb_features = None
483
+ self.grouped_embedding_features = None
456
484
  self.exclude_features = []
457
485
 
458
486
  def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
@@ -462,17 +490,16 @@ class CatBoostWrapper(EstimatorWrapper):
462
490
  if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
463
491
  emb_pattern = r"(.+)_emb\d+"
464
492
  self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
465
- embedding_features = []
466
493
  if len(self.emb_features) > 3: # There is no reason to reduce embeddings dimension with less than 4
467
494
  self.logger.info(
468
495
  "Embedding features count more than 3, so group them into one vector for CatBoost: "
469
496
  f"{self.emb_features}"
470
497
  )
471
- x, embedding_features = self.group_embeddings(x)
472
- params["embedding_features"] = embedding_features
498
+ x, self.grouped_embedding_features = self.group_embeddings(x)
499
+ params["embedding_features"] = self.grouped_embedding_features
473
500
  else:
474
501
  self.logger.info(f"Embedding features count less than 3, so use them separately: {self.emb_features}")
475
- self.emb_features = []
502
+ self.grouped_embedding_features = None
476
503
  else:
477
504
  self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")
478
505
 
@@ -488,7 +515,7 @@ class CatBoostWrapper(EstimatorWrapper):
488
515
  self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
489
516
 
490
517
  # Find rest categorical features
491
- self.cat_features = _get_cat_features(x, self.text_features, embedding_features)
518
+ self.cat_features = _get_cat_features(x, self.text_features, self.grouped_embedding_features)
492
519
  # x = fill_na_cat_features(x, self.cat_features)
493
520
  unique_cat_features = []
494
521
  for name in self.cat_features:
@@ -548,7 +575,7 @@ class CatBoostWrapper(EstimatorWrapper):
548
575
 
549
576
  def cross_val_predict(
550
577
  self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
551
- ) -> Optional[float]:
578
+ ) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
552
579
  try:
553
580
  return super().cross_val_predict(x, y, baseline_score_column)
554
581
  except Exception as e:
@@ -573,6 +600,36 @@ class CatBoostWrapper(EstimatorWrapper):
573
600
  else:
574
601
  raise e
575
602
 
603
+ def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator: CatBoost) -> Optional[Dict[str, float]]:
604
+ try:
605
+ # Create Pool for fold data, if need (for example, when categorical features are present)
606
+ fold_pool = Pool(
607
+ x,
608
+ y,
609
+ cat_features=self.cat_features,
610
+ text_features=self.text_features,
611
+ embedding_features=self.grouped_embedding_features,
612
+ )
613
+
614
+ # Get SHAP values of current estimator
615
+ shap_values_fold = estimator.get_feature_importance(data=fold_pool, type="ShapValues")
616
+
617
+ # Remove last columns (base value) and flatten
618
+ if self.target_type == ModelTaskType.MULTICLASS:
619
+ all_shaps = shap_values_fold[:, :, :-1]
620
+ all_shaps = [all_shaps[:, :, k].flatten() for k in range(all_shaps.shape[2])]
621
+ else:
622
+ all_shaps = shap_values_fold[:, :-1]
623
+ all_shaps = [all_shaps[:, k].flatten() for k in range(all_shaps.shape[1])]
624
+
625
+ all_shaps = np.abs(all_shaps)
626
+
627
+ return dict(zip(estimator.feature_names_, all_shaps))
628
+
629
+ except Exception:
630
+ self.logger.exception("Failed to recalculate new SHAP values")
631
+ return None
632
+
576
633
 
577
634
  class LightGBMWrapper(EstimatorWrapper):
578
635
  def __init__(
@@ -136,6 +136,7 @@ eval_y_is_empty=y in eval_set is empty.
136
136
  x_and_eval_x_diff_types=X and eval_set X has different types: {} and {}
137
137
  baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
138
138
  baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and retry
139
+ missing_features_for_transform=Missing some features for transform that were presented on fit: {}
139
140
  # target validation
140
141
  empty_target=Target is empty in all rows
141
142
  # non_numeric_target=Binary target should be numerical type
@@ -9,6 +9,7 @@ from typing import Callable, List, Optional
9
9
 
10
10
  import pandas as pd
11
11
  from xhtml2pdf import pisa
12
+
12
13
  from upgini.__about__ import __version__
13
14
 
14
15
 
@@ -72,7 +73,9 @@ def make_table(df: pd.DataFrame, wrap_long_string=None) -> str:
72
73
  )
73
74
 
74
75
 
75
- def display_html_dataframe(df: pd.DataFrame, internal_df: pd.DataFrame, header: str):
76
+ def display_html_dataframe(
77
+ df: pd.DataFrame, internal_df: pd.DataFrame, header: str, display_id: Optional[str] = None, display_handle=None
78
+ ):
76
79
  if not ipython_available():
77
80
  print(header)
78
81
  print(internal_df)
@@ -133,7 +136,10 @@ def display_html_dataframe(df: pd.DataFrame, internal_df: pd.DataFrame, header:
133
136
  {table_html}
134
137
  </div>
135
138
  """
136
- display(HTML(result_html))
139
+ if display_handle:
140
+ return display_handle.update(HTML(result_html))
141
+ else:
142
+ return display(HTML(result_html), display_id=display_id)
137
143
 
138
144
 
139
145
  def make_html_report(
@@ -279,6 +285,8 @@ def prepare_and_show_report(
279
285
  search_id: str,
280
286
  email: Optional[str],
281
287
  search_keys: Optional[List[str]] = None,
288
+ display_id: Optional[str] = None,
289
+ display_handle=None,
282
290
  ):
283
291
  if not ipython_available():
284
292
  return
@@ -288,10 +296,12 @@ def prepare_and_show_report(
288
296
  )
289
297
 
290
298
  if len(relevant_features_df) > 0:
291
- show_button_download_pdf(report)
299
+ return show_button_download_pdf(report, display_id=display_id, display_handle=display_handle)
292
300
 
293
301
 
294
- def show_button_download_pdf(source: str, title="\U0001F4CA Download PDF report"):
302
+ def show_button_download_pdf(
303
+ source: str, title="\U0001F4CA Download PDF report", display_id: Optional[str] = None, display_handle=None
304
+ ):
295
305
  from IPython.display import HTML, display
296
306
 
297
307
  file_name = f"upgini-report-{uuid.uuid4()}.pdf"
@@ -303,7 +313,10 @@ def show_button_download_pdf(source: str, title="\U0001F4CA Download PDF report"
303
313
  payload = b64.decode()
304
314
  html = f"""<a download="{file_name}" href="data:application/pdf;base64,{payload}" target="_blank">
305
315
  <button>{title}</button></a>"""
306
- display(HTML(html))
316
+ if display_handle is not None:
317
+ display_handle.update(HTML(html))
318
+ else:
319
+ return display(HTML(html), display_id=display_id)
307
320
 
308
321
 
309
322
  def show_request_quote_button():
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.16a3654.dev2
3
+ Version: 1.2.17
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -145,7 +145,7 @@ Description-Content-Type: text/markdown
145
145
 
146
146
  ## 💼 Tutorials
147
147
 
148
- ### [Search of relevant external features & Automated feature generation for Salary predicton task (use as a template)](https://github.com/upgini/upgini/blob/main/notebooks/Upgini_Features_search%26generation.ipynb)
148
+ ### [Search of relevant external features & Automated feature generation for Salary prediction task (use as a template)](https://github.com/upgini/upgini/blob/main/notebooks/Upgini_Features_search%26generation.ipynb)
149
149
 
150
150
  * The goal is to predict salary for data science job posting based on information about employer and job description.
151
151
  * Following this guide, you'll learn how to **search & auto generate new relevant features with Upgini library**
@@ -1,13 +1,13 @@
1
- upgini/__about__.py,sha256=m8lNgbILu3tdB08_eTU_fFkQaqsqffurburflpHTVjU,33
1
+ upgini/__about__.py,sha256=LyCJKEtzC7sS6MlxViknrdz9t79ni5iIOEGUNPPAnwU,23
2
2
  upgini/__init__.py,sha256=M64LwQTBa-5Jz24Zm2h8rWwlKQQ1J8nP7gGgIciS0WU,589
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=4lfofrRPndG_CFMownDHZuXTnfMgDF1a8hW-ShdU8ns,188446
6
+ upgini/features_enricher.py,sha256=3Jx6eoGULag64lN8pnwloI-RKwyLlVONrCADxpehwNo,192789
7
7
  upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
8
8
  upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
9
  upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
10
- upgini/metrics.py,sha256=4iCVwN9dLVXt1907PeSTeWGkBV5BM9LBSdXQpCGQuJA,31262
10
+ upgini/metrics.py,sha256=lhLqFv1tLWNzx3ULELo3MMSqI8eBoHL7P5jKpG8a6PE,33899
11
11
  upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
12
12
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
13
13
  upgini/version_validator.py,sha256=ddSKUK_-eGJB3NgrqOMoWJU-OxQ253WsNLp8aqJkaIM,1389
@@ -30,7 +30,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
30
30
  upgini/normalizer/normalize_utils.py,sha256=Lv75lq7M46z9cAIutwkdKZtPZkWblgoRzToAJ1BwY8A,7709
31
31
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
32
32
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
33
- upgini/resource_bundle/strings.properties,sha256=eqJP6bGu12zFuQJqMY03QbMhppcdwIfL2bsJWaqmuZ4,27221
33
+ upgini/resource_bundle/strings.properties,sha256=bWWznzu43Lwfd-j4XDrpKJCpoxMMThd73awB7ge7wfo,27319
34
34
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
35
35
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
36
  upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -44,7 +44,7 @@ upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDc
44
44
  upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
45
45
  upgini/utils/datetime_utils.py,sha256=4tsGeehU0KS6wqNsc9gEEWZ9s6T9E0UReUIO3rSuXNU,12174
46
46
  upgini/utils/deduplicate_utils.py,sha256=NpaPtBYXwUtfKTRHWrtz2uUq6tZN6C_Nd719ydPRF2Q,8484
47
- upgini/utils/display_utils.py,sha256=A2ouB5eiZ-Kyt9ykYxkLQwyoRPrdYeJymwNTiajtFXs,10990
47
+ upgini/utils/display_utils.py,sha256=NGhki1aGMsS8OeI69eLXEpmS_s41k8ojKHQxacJaXiU,11493
48
48
  upgini/utils/email_utils.py,sha256=j0Ug1R_0AnCg1Y92zIZ4XMwvKo3G5_pcOlBN1OH_gZs,5191
49
49
  upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
50
50
  upgini/utils/features_validator.py,sha256=yiOdzVtpArELMufzAa9mtWq32lETB6sIF-w3Yvl3vV8,3614
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
57
57
  upgini/utils/target_utils.py,sha256=qHzZRmICFbLNCrmVqGkaBcjm91L2ERRZMppci36acV4,10085
58
58
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
59
59
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
60
- upgini-1.2.16a3654.dev2.dist-info/METADATA,sha256=WKmV93EZYYJRG_a47sfqJygnPythxkRW-6RXZcLVkxs,48587
61
- upgini-1.2.16a3654.dev2.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
62
- upgini-1.2.16a3654.dev2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
- upgini-1.2.16a3654.dev2.dist-info/RECORD,,
60
+ upgini-1.2.17.dist-info/METADATA,sha256=g8R9yIZmDZNOFNFMVW-65PTooKnQx6tWMX4Z1Pky-yI,48578
61
+ upgini-1.2.17.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
+ upgini-1.2.17.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
+ upgini-1.2.17.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.24.2
2
+ Generator: hatchling 1.25.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any