upgini 1.2.15__py3-none-any.whl → 1.2.16a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.15"
1
+ __version__ = "1.2.16a1"
upgini/features_enricher.py CHANGED
@@ -871,6 +871,13 @@ class FeaturesEnricher(TransformerMixin):
871
871
  else None
872
872
  )
873
873
 
874
+ if self.X is None:
875
+ self.X = X
876
+ if self.y is None:
877
+ self.y = y
878
+ if self.eval_set is None:
879
+ self.eval_set = effective_eval_set
880
+
874
881
  try:
875
882
  self.__log_debug_information(
876
883
  validated_X,
@@ -938,14 +945,14 @@ class FeaturesEnricher(TransformerMixin):
938
945
 
939
946
  gc.collect()
940
947
 
948
+ if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
949
+ print(self.bundle.get("metrics_no_important_free_features"))
950
+ self.logger.warning("No client or free relevant ADS features found to calculate metrics")
951
+ self.warning_counter.increment()
952
+ return None
953
+
941
954
  print(self.bundle.get("metrics_start"))
942
955
  with Spinner():
943
- if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
944
- print(self.bundle.get("metrics_no_important_free_features"))
945
- self.logger.warning("No client or free relevant ADS features found to calculate metrics")
946
- self.warning_counter.increment()
947
- return None
948
-
949
956
  self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
950
957
 
951
958
  has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
@@ -989,7 +996,7 @@ class FeaturesEnricher(TransformerMixin):
989
996
  text_features=self.generate_features,
990
997
  has_date=has_date,
991
998
  )
992
- etalon_metric = baseline_estimator.cross_val_predict(
999
+ etalon_metric, _ = baseline_estimator.cross_val_predict(
993
1000
  fitting_X, y_sorted, self.baseline_score_column
994
1001
  )
995
1002
  if etalon_metric is None:
@@ -1023,7 +1030,11 @@ class FeaturesEnricher(TransformerMixin):
1023
1030
  text_features=self.generate_features,
1024
1031
  has_date=has_date,
1025
1032
  )
1026
- enriched_metric = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
1033
+ enriched_metric, enriched_shaps = enriched_estimator.cross_val_predict(
1034
+ fitting_enriched_X, enriched_y_sorted
1035
+ )
1036
+
1037
+ print(f"Calculated enriched shaps: {enriched_shaps}")
1027
1038
  if enriched_metric is None:
1028
1039
  self.logger.warning(
1029
1040
  f"Enriched {metric} on train combined features is None (maybe all features was removed)"
@@ -1156,13 +1167,6 @@ class FeaturesEnricher(TransformerMixin):
1156
1167
  elif uplift_col in metrics_df.columns and (metrics_df[uplift_col] < 0).any():
1157
1168
  self.logger.warning("Uplift is negative")
1158
1169
 
1159
- if self.X is None:
1160
- self.X = X
1161
- if self.y is None:
1162
- self.y = y
1163
- if self.eval_set is None:
1164
- self.eval_set = effective_eval_set
1165
-
1166
1170
  return metrics_df
1167
1171
  except Exception as e:
1168
1172
  error_message = "Failed to calculate metrics" + (
@@ -1187,6 +1191,18 @@ class FeaturesEnricher(TransformerMixin):
1187
1191
  finally:
1188
1192
  self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
1189
1193
 
1194
+ def _update_shap_values(self, new_shaps: Dict[str, float]):
1195
+ feature_name_header = self.bundle.get("features_info_name")
1196
+ shap_value_header = self.bundle.get("features_info_shap")
1197
+
1198
+ def update_shap(row):
1199
+ return new_shaps.get(row[feature_name_header], row[shap_value_header])
1200
+
1201
+ self.features_info[shap_value_header] = self.features_info.apply(update_shap, axis=1)
1202
+ self.features_info.sort_values(by=shap_value_header, ascending=False, inplace=True)
1203
+
1204
+ # TODO redraw
1205
+
1190
1206
  def _check_train_and_eval_target_distribution(self, y, eval_set_dict):
1191
1207
  uneven_distribution = False
1192
1208
  for eval_set in eval_set_dict.values():
@@ -1578,7 +1594,9 @@ class FeaturesEnricher(TransformerMixin):
1578
1594
  generated_features.extend(generator.generated_features)
1579
1595
 
1580
1596
  normalizer = Normalizer(self.bundle, self.logger, self.warning_counter)
1581
- df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
1597
+ df, search_keys, generated_features = normalizer.normalize(
1598
+ df, search_keys, generated_features
1599
+ )
1582
1600
  columns_renaming = normalizer.columns_renaming
1583
1601
 
1584
1602
  df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
@@ -2018,7 +2036,9 @@ class FeaturesEnricher(TransformerMixin):
2018
2036
  generated_features.extend(generator.generated_features)
2019
2037
 
2020
2038
  normalizer = Normalizer(self.bundle, self.logger, self.warning_counter, silent_mode)
2021
- df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
2039
+ df, search_keys, generated_features = normalizer.normalize(
2040
+ df, search_keys, generated_features
2041
+ )
2022
2042
  columns_renaming = normalizer.columns_renaming
2023
2043
 
2024
2044
  # Don't pass all features in backend on transform
upgini/metrics.py CHANGED
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ from collections import defaultdict
3
4
  import inspect
4
5
  import logging
5
6
  import re
@@ -9,7 +10,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
9
10
  import catboost
10
11
  import numpy as np
11
12
  import pandas as pd
12
- from catboost import CatBoostClassifier, CatBoostRegressor
13
+ from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor, Pool
13
14
  from numpy import log1p
14
15
  from pandas.api.types import is_numeric_dtype
15
16
  from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
@@ -288,9 +289,12 @@ class EstimatorWrapper:
288
289
  x, y, _ = self._prepare_data(x, y)
289
290
  return x, y, {}
290
291
 
292
+ def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
293
+ return None
294
+
291
295
  def cross_val_predict(
292
296
  self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
293
- ) -> Optional[float]:
297
+ ) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
294
298
  x, y, groups, fit_params = self._prepare_to_fit(x, y)
295
299
 
296
300
  if x.shape[1] == 0:
@@ -298,6 +302,7 @@ class EstimatorWrapper:
298
302
 
299
303
  scorer = check_scoring(self.estimator, scoring=self.scorer)
300
304
 
305
+ shap_values_all_folds = defaultdict(list)
301
306
  if baseline_score_column is not None and self.metric_name == "GINI":
302
307
  self.logger.info("Calculate baseline GINI on passed baseline_score_column and target")
303
308
  metric = roc_auc_score(y, x[baseline_score_column])
@@ -319,7 +324,29 @@ class EstimatorWrapper:
319
324
  self.check_fold_metrics(metrics_by_fold)
320
325
 
321
326
  metric = np.mean(metrics_by_fold) * self.multiplier
322
- return self.post_process_metric(metric)
327
+
328
+ splits = self.cv.split(x, y, groups)
329
+
330
+ for estimator, split in zip(self.cv_estimators, splits):
331
+ _, validation_idx = split
332
+ cv_x = x.iloc[validation_idx]
333
+ cv_y = y[validation_idx]
334
+ shaps = self.calculate_shap(cv_x, cv_y, estimator)
335
+ if shaps is not None:
336
+ for feature, shap_value in shaps.items():
337
+ # shap_values_all_folds[feature] = shap_values_all_folds.get(feature, []) + shap_value.tolist()
338
+ shap_values_all_folds[feature].extend(shap_value.tolist())
339
+
340
+ if shap_values_all_folds:
341
+ average_shap_values = {
342
+ feature: np.mean(shaps)
343
+ for feature, shaps
344
+ in shap_values_all_folds.items()
345
+ }
346
+ else:
347
+ average_shap_values = None
348
+
349
+ return self.post_process_metric(metric), average_shap_values
323
350
 
324
351
  def check_fold_metrics(self, metrics_by_fold: List[float]):
325
352
  first_metric_sign = 1 if metrics_by_fold[0] >= 0 else -1
@@ -548,7 +575,7 @@ class CatBoostWrapper(EstimatorWrapper):
548
575
 
549
576
  def cross_val_predict(
550
577
  self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
551
- ) -> Optional[float]:
578
+ ) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
552
579
  try:
553
580
  return super().cross_val_predict(x, y, baseline_score_column)
554
581
  except Exception as e:
@@ -573,6 +600,29 @@ class CatBoostWrapper(EstimatorWrapper):
573
600
  else:
574
601
  raise e
575
602
 
603
+ def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator: CatBoost) -> Optional[Dict[str, float]]:
604
+ try:
605
+ # Create Pool for fold data, if need (for example, when categorical features are present)
606
+ fold_pool = Pool(x, y, cat_features=self.cat_features)
607
+
608
+ # Get SHAP values of current estimator
609
+ shap_values_fold = estimator.get_feature_importance(data=fold_pool, type="ShapValues")
610
+
611
+ # Remove last columns (base value) and flatten
612
+ if self.target_type == ModelTaskType.MULTICLASS:
613
+ all_shaps = shap_values_fold[:, :, :-1]
614
+ all_shaps = [all_shaps[:, :, k].flatten() for k in range(all_shaps.shape[2])]
615
+ else:
616
+ all_shaps = shap_values_fold[:, :-1]
617
+ all_shaps = [all_shaps[:, k].flatten() for k in range(all_shaps.shape[1])]
618
+
619
+ all_shaps = np.abs(all_shaps)
620
+
621
+ return dict(zip(estimator.feature_names_, all_shaps))
622
+
623
+ except Exception:
624
+ return None
625
+
576
626
 
577
627
  class LightGBMWrapper(EstimatorWrapper):
578
628
  def __init__(
upgini/normalizer/normalize_utils.py CHANGED
@@ -49,7 +49,10 @@ class Normalizer:
49
49
  self.generated_features = []
50
50
 
51
51
  def normalize(
52
- self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], generated_features: List[str]
52
+ self,
53
+ df: pd.DataFrame,
54
+ search_keys: Dict[str, SearchKey],
55
+ generated_features: List[str],
53
56
  ) -> Tuple[pd.DataFrame, Dict[str, SearchKey], List[str]]:
54
57
  self.search_keys = search_keys.copy()
55
58
  self.generated_features = generated_features.copy()
upgini/utils/display_utils.py CHANGED
@@ -72,7 +72,7 @@ def make_table(df: pd.DataFrame, wrap_long_string=None) -> str:
72
72
  )
73
73
 
74
74
 
75
- def display_html_dataframe(df: pd.DataFrame, internal_df: pd.DataFrame, header: str):
75
+ def display_html_dataframe(df: pd.DataFrame, internal_df: pd.DataFrame, header: str, display_id: str):
76
76
  if not ipython_available():
77
77
  print(header)
78
78
  print(internal_df)
@@ -133,7 +133,7 @@ def display_html_dataframe(df: pd.DataFrame, internal_df: pd.DataFrame, header:
133
133
  {table_html}
134
134
  </div>
135
135
  """
136
- display(HTML(result_html))
136
+ return display(HTML(result_html))
137
137
 
138
138
 
139
139
  def make_html_report(
upgini/utils/features_validator.py CHANGED
@@ -58,6 +58,10 @@ class FeaturesValidator:
58
58
 
59
59
  columns_renaming = columns_renaming or {}
60
60
 
61
+ if features_for_generate:
62
+ empty_or_constant_features = [
63
+ f for f in empty_or_constant_features if columns_renaming.get(f, f) not in features_for_generate
64
+ ]
61
65
  if empty_or_constant_features:
62
66
  msg = bundle.get("empty_or_contant_features").format(
63
67
  [columns_renaming.get(f, f) for f in empty_or_constant_features]
upgini-1.2.16a1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.15
3
+ Version: 1.2.16a1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
upgini-1.2.16a1.dist-info/RECORD CHANGED
@@ -1,13 +1,13 @@
1
- upgini/__about__.py,sha256=Q6rDLuL8XHKQggYBtRCtxzpPQJgFYWn4x0gcVlH7H4g,23
1
+ upgini/__about__.py,sha256=vMDC8s3UWLhN6avUSjtfizIVhxWIHW-WKTw04ha19HE,25
2
2
  upgini/__init__.py,sha256=M64LwQTBa-5Jz24Zm2h8rWwlKQQ1J8nP7gGgIciS0WU,589
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=vRC7g6n6XQxSrvzXk6NJjP0ZytDQhWR4sTAo4Hp7gmA,188319
6
+ upgini/features_enricher.py,sha256=oEWJjD3v4v_0fZr8ZWSzqFCs08yJrjVTDMNPEFsFL_E,188978
7
7
  upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
8
8
  upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
9
  upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
10
- upgini/metrics.py,sha256=bgi1rc3vCCeCuwRX1doQSQCzaV5OEiYHv_6XIvapnaw,31254
10
+ upgini/metrics.py,sha256=zs_gnjZCdk8AUYOj-mD7V1k-8Gn2EfHcXvK7J6RWOxA,33492
11
11
  upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
12
12
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
13
13
  upgini/version_validator.py,sha256=ddSKUK_-eGJB3NgrqOMoWJU-OxQ253WsNLp8aqJkaIM,1389
@@ -27,7 +27,7 @@ upgini/data_source/data_source_publisher.py,sha256=X-8aGtVgzGmxyXkMVBoBLIGDMb4lY
27
27
  upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
28
28
  upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
29
29
  upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
- upgini/normalizer/normalize_utils.py,sha256=Lv75lq7M46z9cAIutwkdKZtPZkWblgoRzToAJ1BwY8A,7709
30
+ upgini/normalizer/normalize_utils.py,sha256=w7S4yQZkdlBptC7peqmrn8zqs-Z0RPq2rp78IZuoE7M,7734
31
31
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
32
32
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
33
33
  upgini/resource_bundle/strings.properties,sha256=eqJP6bGu12zFuQJqMY03QbMhppcdwIfL2bsJWaqmuZ4,27221
@@ -44,10 +44,10 @@ upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDc
44
44
  upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
45
45
  upgini/utils/datetime_utils.py,sha256=4tsGeehU0KS6wqNsc9gEEWZ9s6T9E0UReUIO3rSuXNU,12174
46
46
  upgini/utils/deduplicate_utils.py,sha256=NpaPtBYXwUtfKTRHWrtz2uUq6tZN6C_Nd719ydPRF2Q,8484
47
- upgini/utils/display_utils.py,sha256=A2ouB5eiZ-Kyt9ykYxkLQwyoRPrdYeJymwNTiajtFXs,10990
47
+ upgini/utils/display_utils.py,sha256=kOY3lKKbJDIb424TFAF0wQiFUhcARTy2Flz0bQ2M8NY,11014
48
48
  upgini/utils/email_utils.py,sha256=j0Ug1R_0AnCg1Y92zIZ4XMwvKo3G5_pcOlBN1OH_gZs,5191
49
49
  upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
50
- upgini/utils/features_validator.py,sha256=yiOdzVtpArELMufzAa9mtWq32lETB6sIF-w3Yvl3vV8,3614
50
+ upgini/utils/features_validator.py,sha256=URNywJnfPVpRKGAK9drJIdyHarGczB298y9QGQwOVGE,3818
51
51
  upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
52
52
  upgini/utils/ip_utils.py,sha256=Q6vb7Sr5Khx3Sq3eENjW2qCXKej_S5jZbneH6zEOkzQ,5171
53
53
  upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
57
57
  upgini/utils/target_utils.py,sha256=qHzZRmICFbLNCrmVqGkaBcjm91L2ERRZMppci36acV4,10085
58
58
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
59
59
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
60
- upgini-1.2.15.dist-info/METADATA,sha256=Hua2FUNftyzzpi9eR090MFJ-5F8S_KS_5SrZhwOUgco,48577
61
- upgini-1.2.15.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
- upgini-1.2.15.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
- upgini-1.2.15.dist-info/RECORD,,
60
+ upgini-1.2.16a1.dist-info/METADATA,sha256=0w4SeT93Uz51cWP6Y0uHw0Eh2iMVqkUIOjlaaD_Jduw,48579
61
+ upgini-1.2.16a1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
+ upgini-1.2.16a1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
+ upgini-1.2.16a1.dist-info/RECORD,,