upgini 1.2.15__py3-none-any.whl → 1.2.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +90 -29
- upgini/metrics.py +66 -9
- upgini/utils/display_utils.py +8 -2
- {upgini-1.2.15.dist-info → upgini-1.2.16.dist-info}/METADATA +2 -2
- {upgini-1.2.15.dist-info → upgini-1.2.16.dist-info}/RECORD +8 -8
- {upgini-1.2.15.dist-info → upgini-1.2.16.dist-info}/WHEEL +0 -0
- {upgini-1.2.15.dist-info → upgini-1.2.16.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.15"
|
|
1
|
+
__version__ = "1.2.16"
|
upgini/features_enricher.py
CHANGED
|
@@ -336,6 +336,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
336
336
|
self.exclude_columns = exclude_columns
|
|
337
337
|
self.baseline_score_column = baseline_score_column
|
|
338
338
|
self.add_date_if_missing = add_date_if_missing
|
|
339
|
+
self.features_info_display_handle = None
|
|
339
340
|
|
|
340
341
|
def _get_api_key(self):
|
|
341
342
|
return self._api_key
|
|
@@ -871,6 +872,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
871
872
|
else None
|
|
872
873
|
)
|
|
873
874
|
|
|
875
|
+
if self.X is None:
|
|
876
|
+
self.X = X
|
|
877
|
+
if self.y is None:
|
|
878
|
+
self.y = y
|
|
879
|
+
if self.eval_set is None:
|
|
880
|
+
self.eval_set = effective_eval_set
|
|
881
|
+
|
|
874
882
|
try:
|
|
875
883
|
self.__log_debug_information(
|
|
876
884
|
validated_X,
|
|
@@ -938,14 +946,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
938
946
|
|
|
939
947
|
gc.collect()
|
|
940
948
|
|
|
949
|
+
if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
|
|
950
|
+
print(self.bundle.get("metrics_no_important_free_features"))
|
|
951
|
+
self.logger.warning("No client or free relevant ADS features found to calculate metrics")
|
|
952
|
+
self.warning_counter.increment()
|
|
953
|
+
return None
|
|
954
|
+
|
|
941
955
|
print(self.bundle.get("metrics_start"))
|
|
942
956
|
with Spinner():
|
|
943
|
-
if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
|
|
944
|
-
print(self.bundle.get("metrics_no_important_free_features"))
|
|
945
|
-
self.logger.warning("No client or free relevant ADS features found to calculate metrics")
|
|
946
|
-
self.warning_counter.increment()
|
|
947
|
-
return None
|
|
948
|
-
|
|
949
957
|
self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
|
|
950
958
|
|
|
951
959
|
has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
|
|
@@ -989,7 +997,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
989
997
|
text_features=self.generate_features,
|
|
990
998
|
has_date=has_date,
|
|
991
999
|
)
|
|
992
|
-
etalon_metric = baseline_estimator.cross_val_predict(
|
|
1000
|
+
etalon_metric, _ = baseline_estimator.cross_val_predict(
|
|
993
1001
|
fitting_X, y_sorted, self.baseline_score_column
|
|
994
1002
|
)
|
|
995
1003
|
if etalon_metric is None:
|
|
@@ -1023,7 +1031,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1023
1031
|
text_features=self.generate_features,
|
|
1024
1032
|
has_date=has_date,
|
|
1025
1033
|
)
|
|
1026
|
-
enriched_metric = enriched_estimator.cross_val_predict(
|
|
1034
|
+
enriched_metric, enriched_shaps = enriched_estimator.cross_val_predict(
|
|
1035
|
+
fitting_enriched_X, enriched_y_sorted
|
|
1036
|
+
)
|
|
1037
|
+
|
|
1038
|
+
if enriched_shaps is not None:
|
|
1039
|
+
self._update_shap_values(enriched_shaps)
|
|
1040
|
+
|
|
1027
1041
|
if enriched_metric is None:
|
|
1028
1042
|
self.logger.warning(
|
|
1029
1043
|
f"Enriched {metric} on train combined features is None (maybe all features was removed)"
|
|
@@ -1156,13 +1170,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1156
1170
|
elif uplift_col in metrics_df.columns and (metrics_df[uplift_col] < 0).any():
|
|
1157
1171
|
self.logger.warning("Uplift is negative")
|
|
1158
1172
|
|
|
1159
|
-
if self.X is None:
|
|
1160
|
-
self.X = X
|
|
1161
|
-
if self.y is None:
|
|
1162
|
-
self.y = y
|
|
1163
|
-
if self.eval_set is None:
|
|
1164
|
-
self.eval_set = effective_eval_set
|
|
1165
|
-
|
|
1166
1173
|
return metrics_df
|
|
1167
1174
|
except Exception as e:
|
|
1168
1175
|
error_message = "Failed to calculate metrics" + (
|
|
@@ -1187,6 +1194,48 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1187
1194
|
finally:
|
|
1188
1195
|
self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
|
|
1189
1196
|
|
|
1197
|
+
def _update_shap_values(self, new_shaps: Dict[str, float]):
|
|
1198
|
+
new_shaps = {
|
|
1199
|
+
feature: self._round_shap_value(shap)
|
|
1200
|
+
for feature, shap in new_shaps.items()
|
|
1201
|
+
if feature in self.feature_names_
|
|
1202
|
+
}
|
|
1203
|
+
features_importances = list(new_shaps.items())
|
|
1204
|
+
features_importances.sort(key=lambda m: (-m[1], m[0]))
|
|
1205
|
+
self.feature_names_, self.feature_importances_ = zip(*features_importances)
|
|
1206
|
+
self.feature_names_ = list(self.feature_names_)
|
|
1207
|
+
self.feature_importances_ = list(self.feature_importances_)
|
|
1208
|
+
|
|
1209
|
+
feature_name_header = self.bundle.get("features_info_name")
|
|
1210
|
+
shap_value_header = self.bundle.get("features_info_shap")
|
|
1211
|
+
|
|
1212
|
+
def update_shap(row):
|
|
1213
|
+
return new_shaps.get(row[feature_name_header], row[shap_value_header])
|
|
1214
|
+
|
|
1215
|
+
self.features_info[shap_value_header] = self.features_info.apply(update_shap, axis=1)
|
|
1216
|
+
self._internal_features_info[shap_value_header] = self._internal_features_info.apply(update_shap, axis=1)
|
|
1217
|
+
self._features_info_without_links[shap_value_header] = self._features_info_without_links.apply(
|
|
1218
|
+
update_shap, axis=1
|
|
1219
|
+
)
|
|
1220
|
+
self.logger.info(f"Recalculated SHAP values:\n{self._features_info_without_links}")
|
|
1221
|
+
|
|
1222
|
+
self.features_info.sort_values(by=shap_value_header, ascending=False, inplace=True)
|
|
1223
|
+
self._internal_features_info.sort_values(by=shap_value_header, ascending=False, inplace=True)
|
|
1224
|
+
self._features_info_without_links.sort_values(by=shap_value_header, ascending=False, inplace=True)
|
|
1225
|
+
|
|
1226
|
+
if self.features_info_display_handle:
|
|
1227
|
+
try:
|
|
1228
|
+
_ = get_ipython() # type: ignore
|
|
1229
|
+
|
|
1230
|
+
display_html_dataframe(
|
|
1231
|
+
self.features_info,
|
|
1232
|
+
self._features_info_without_links,
|
|
1233
|
+
self.bundle.get("relevant_features_header"),
|
|
1234
|
+
display_handle=self.features_info_display_handle,
|
|
1235
|
+
)
|
|
1236
|
+
except (ImportError, NameError):
|
|
1237
|
+
print(self._internal_features_info)
|
|
1238
|
+
|
|
1190
1239
|
def _check_train_and_eval_target_distribution(self, y, eval_set_dict):
|
|
1191
1240
|
uneven_distribution = False
|
|
1192
1241
|
for eval_set in eval_set_dict.values():
|
|
@@ -1515,11 +1564,19 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1515
1564
|
self.logger.info("No external features selected. So use only input datasets for metrics calculation")
|
|
1516
1565
|
return self.__sample_only_input(validated_X, validated_y, eval_set, is_demo_dataset)
|
|
1517
1566
|
# TODO save and check if dataset was deduplicated - use imbalance branch for such case
|
|
1518
|
-
elif
|
|
1567
|
+
elif (
|
|
1568
|
+
not self.imbalanced
|
|
1569
|
+
and not exclude_features_sources
|
|
1570
|
+
and is_input_same_as_fit
|
|
1571
|
+
and self.df_with_original_index is not None
|
|
1572
|
+
):
|
|
1519
1573
|
self.logger.info("Dataset is not imbalanced, so use enriched_X from fit")
|
|
1520
1574
|
return self.__sample_balanced(eval_set, trace_id, remove_outliers_calc_metrics)
|
|
1521
1575
|
else:
|
|
1522
|
-
self.logger.info(
|
|
1576
|
+
self.logger.info(
|
|
1577
|
+
"Dataset is imbalanced or exclude_features_sources or X was passed or this is saved search."
|
|
1578
|
+
" Run transform"
|
|
1579
|
+
)
|
|
1523
1580
|
print(self.bundle.get("prepare_data_for_metrics"))
|
|
1524
1581
|
return self.__sample_imbalanced(
|
|
1525
1582
|
validated_X,
|
|
@@ -3374,6 +3431,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3374
3431
|
|
|
3375
3432
|
return result_train, result_eval_sets
|
|
3376
3433
|
|
|
3434
|
+
@staticmethod
|
|
3435
|
+
def _round_shap_value(shap: float) -> float:
|
|
3436
|
+
if shap > 0.0 and shap < 0.0001:
|
|
3437
|
+
return 0.0001
|
|
3438
|
+
else:
|
|
3439
|
+
return round(shap, 4)
|
|
3440
|
+
|
|
3377
3441
|
def __prepare_feature_importances(self, trace_id: str, x_columns: List[str], silent=False):
|
|
3378
3442
|
llm_source = "LLM with external data augmentation"
|
|
3379
3443
|
if self._search_task is None:
|
|
@@ -3391,12 +3455,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3391
3455
|
features_info_without_links = []
|
|
3392
3456
|
internal_features_info = []
|
|
3393
3457
|
|
|
3394
|
-
def round_shap_value(shap: float) -> float:
|
|
3395
|
-
if shap > 0.0 and shap < 0.0001:
|
|
3396
|
-
return 0.0001
|
|
3397
|
-
else:
|
|
3398
|
-
return round(shap, 4)
|
|
3399
|
-
|
|
3400
3458
|
def list_or_single(lst: List[str], single: str):
|
|
3401
3459
|
return lst or ([single] if single else [])
|
|
3402
3460
|
|
|
@@ -3429,7 +3487,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3429
3487
|
|
|
3430
3488
|
feature_sample = []
|
|
3431
3489
|
self.feature_names_.append(feature_meta.name)
|
|
3432
|
-
self.feature_importances_.append(
|
|
3490
|
+
self.feature_importances_.append(self._round_shap_value(feature_meta.shap_value))
|
|
3433
3491
|
if feature_meta.name in features_df.columns:
|
|
3434
3492
|
feature_sample = np.random.choice(features_df[feature_meta.name].dropna().unique(), 3).tolist()
|
|
3435
3493
|
if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
|
|
@@ -3468,7 +3526,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3468
3526
|
features_info.append(
|
|
3469
3527
|
{
|
|
3470
3528
|
self.bundle.get("features_info_name"): feature_name,
|
|
3471
|
-
self.bundle.get("features_info_shap"):
|
|
3529
|
+
self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
|
|
3472
3530
|
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3473
3531
|
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3474
3532
|
self.bundle.get("features_info_provider"): provider,
|
|
@@ -3479,7 +3537,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3479
3537
|
features_info_without_links.append(
|
|
3480
3538
|
{
|
|
3481
3539
|
self.bundle.get("features_info_name"): internal_feature_name,
|
|
3482
|
-
self.bundle.get("features_info_shap"):
|
|
3540
|
+
self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
|
|
3483
3541
|
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3484
3542
|
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3485
3543
|
self.bundle.get("features_info_provider"): internal_provider,
|
|
@@ -3491,7 +3549,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3491
3549
|
{
|
|
3492
3550
|
self.bundle.get("features_info_name"): internal_feature_name,
|
|
3493
3551
|
"feature_link": feature_meta.doc_link,
|
|
3494
|
-
self.bundle.get("features_info_shap"):
|
|
3552
|
+
self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
|
|
3495
3553
|
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3496
3554
|
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3497
3555
|
self.bundle.get("features_info_provider"): internal_provider,
|
|
@@ -3771,8 +3829,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3771
3829
|
print(Format.GREEN + Format.BOLD + msg + Format.END)
|
|
3772
3830
|
self.logger.info(msg)
|
|
3773
3831
|
if len(self.feature_names_) > 0:
|
|
3774
|
-
display_html_dataframe(
|
|
3775
|
-
self.features_info,
|
|
3832
|
+
self.features_info_display_handle = display_html_dataframe(
|
|
3833
|
+
self.features_info,
|
|
3834
|
+
self._features_info_without_links,
|
|
3835
|
+
self.bundle.get("relevant_features_header"),
|
|
3836
|
+
display_id="features_info",
|
|
3776
3837
|
)
|
|
3777
3838
|
|
|
3778
3839
|
display_html_dataframe(
|
upgini/metrics.py
CHANGED
|
@@ -3,13 +3,14 @@ from __future__ import annotations
|
|
|
3
3
|
import inspect
|
|
4
4
|
import logging
|
|
5
5
|
import re
|
|
6
|
+
from collections import defaultdict
|
|
6
7
|
from copy import deepcopy
|
|
7
8
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
8
9
|
|
|
9
10
|
import catboost
|
|
10
11
|
import numpy as np
|
|
11
12
|
import pandas as pd
|
|
12
|
-
from catboost import CatBoostClassifier, CatBoostRegressor
|
|
13
|
+
from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor, Pool
|
|
13
14
|
from numpy import log1p
|
|
14
15
|
from pandas.api.types import is_numeric_dtype
|
|
15
16
|
from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
|
|
@@ -288,9 +289,12 @@ class EstimatorWrapper:
|
|
|
288
289
|
x, y, _ = self._prepare_data(x, y)
|
|
289
290
|
return x, y, {}
|
|
290
291
|
|
|
292
|
+
def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
|
|
293
|
+
return None
|
|
294
|
+
|
|
291
295
|
def cross_val_predict(
|
|
292
296
|
self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
|
|
293
|
-
) -> Optional[float]:
|
|
297
|
+
) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
|
|
294
298
|
x, y, groups, fit_params = self._prepare_to_fit(x, y)
|
|
295
299
|
|
|
296
300
|
if x.shape[1] == 0:
|
|
@@ -298,6 +302,7 @@ class EstimatorWrapper:
|
|
|
298
302
|
|
|
299
303
|
scorer = check_scoring(self.estimator, scoring=self.scorer)
|
|
300
304
|
|
|
305
|
+
shap_values_all_folds = defaultdict(list)
|
|
301
306
|
if baseline_score_column is not None and self.metric_name == "GINI":
|
|
302
307
|
self.logger.info("Calculate baseline GINI on passed baseline_score_column and target")
|
|
303
308
|
metric = roc_auc_score(y, x[baseline_score_column])
|
|
@@ -319,7 +324,29 @@ class EstimatorWrapper:
|
|
|
319
324
|
self.check_fold_metrics(metrics_by_fold)
|
|
320
325
|
|
|
321
326
|
metric = np.mean(metrics_by_fold) * self.multiplier
|
|
322
|
-
|
|
327
|
+
|
|
328
|
+
splits = self.cv.split(x, y, groups)
|
|
329
|
+
|
|
330
|
+
for estimator, split in zip(self.cv_estimators, splits):
|
|
331
|
+
_, validation_idx = split
|
|
332
|
+
cv_x = x.iloc[validation_idx]
|
|
333
|
+
cv_y = y[validation_idx]
|
|
334
|
+
shaps = self.calculate_shap(cv_x, cv_y, estimator)
|
|
335
|
+
if shaps is not None:
|
|
336
|
+
for feature, shap_value in shaps.items():
|
|
337
|
+
# shap_values_all_folds[feature] = shap_values_all_folds.get(feature, []) + shap_value.tolist()
|
|
338
|
+
shap_values_all_folds[feature].extend(shap_value.tolist())
|
|
339
|
+
|
|
340
|
+
if shap_values_all_folds:
|
|
341
|
+
average_shap_values = {
|
|
342
|
+
feature: np.mean(np.array(shaps)) for feature, shaps in shap_values_all_folds.items() if len(shaps) > 0
|
|
343
|
+
}
|
|
344
|
+
if len(average_shap_values) == 0:
|
|
345
|
+
average_shap_values = None
|
|
346
|
+
else:
|
|
347
|
+
average_shap_values = None
|
|
348
|
+
|
|
349
|
+
return self.post_process_metric(metric), average_shap_values
|
|
323
350
|
|
|
324
351
|
def check_fold_metrics(self, metrics_by_fold: List[float]):
|
|
325
352
|
first_metric_sign = 1 if metrics_by_fold[0] >= 0 else -1
|
|
@@ -453,6 +480,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
453
480
|
)
|
|
454
481
|
self.cat_features = None
|
|
455
482
|
self.emb_features = None
|
|
483
|
+
self.grouped_embedding_features = None
|
|
456
484
|
self.exclude_features = []
|
|
457
485
|
|
|
458
486
|
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
|
@@ -462,17 +490,16 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
462
490
|
if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
|
|
463
491
|
emb_pattern = r"(.+)_emb\d+"
|
|
464
492
|
self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
|
|
465
|
-
embedding_features = []
|
|
466
493
|
if len(self.emb_features) > 3: # There is no reason to reduce embeddings dimension with less than 4
|
|
467
494
|
self.logger.info(
|
|
468
495
|
"Embedding features count more than 3, so group them into one vector for CatBoost: "
|
|
469
496
|
f"{self.emb_features}"
|
|
470
497
|
)
|
|
471
|
-
x,
|
|
472
|
-
params["embedding_features"] =
|
|
498
|
+
x, self.grouped_embedding_features = self.group_embeddings(x)
|
|
499
|
+
params["embedding_features"] = self.grouped_embedding_features
|
|
473
500
|
else:
|
|
474
501
|
self.logger.info(f"Embedding features count less than 3, so use them separately: {self.emb_features}")
|
|
475
|
-
self.
|
|
502
|
+
self.grouped_embedding_features = None
|
|
476
503
|
else:
|
|
477
504
|
self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")
|
|
478
505
|
|
|
@@ -488,7 +515,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
488
515
|
self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
|
|
489
516
|
|
|
490
517
|
# Find rest categorical features
|
|
491
|
-
self.cat_features = _get_cat_features(x, self.text_features,
|
|
518
|
+
self.cat_features = _get_cat_features(x, self.text_features, self.grouped_embedding_features)
|
|
492
519
|
# x = fill_na_cat_features(x, self.cat_features)
|
|
493
520
|
unique_cat_features = []
|
|
494
521
|
for name in self.cat_features:
|
|
@@ -548,7 +575,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
548
575
|
|
|
549
576
|
def cross_val_predict(
|
|
550
577
|
self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
|
|
551
|
-
) -> Optional[float]:
|
|
578
|
+
) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
|
|
552
579
|
try:
|
|
553
580
|
return super().cross_val_predict(x, y, baseline_score_column)
|
|
554
581
|
except Exception as e:
|
|
@@ -573,6 +600,36 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
573
600
|
else:
|
|
574
601
|
raise e
|
|
575
602
|
|
|
603
|
+
def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator: CatBoost) -> Optional[Dict[str, float]]:
|
|
604
|
+
try:
|
|
605
|
+
# Create Pool for fold data, if need (for example, when categorical features are present)
|
|
606
|
+
fold_pool = Pool(
|
|
607
|
+
x,
|
|
608
|
+
y,
|
|
609
|
+
cat_features=self.cat_features,
|
|
610
|
+
text_features=self.text_features,
|
|
611
|
+
embedding_features=self.grouped_embedding_features,
|
|
612
|
+
)
|
|
613
|
+
|
|
614
|
+
# Get SHAP values of current estimator
|
|
615
|
+
shap_values_fold = estimator.get_feature_importance(data=fold_pool, type="ShapValues")
|
|
616
|
+
|
|
617
|
+
# Remove last columns (base value) and flatten
|
|
618
|
+
if self.target_type == ModelTaskType.MULTICLASS:
|
|
619
|
+
all_shaps = shap_values_fold[:, :, :-1]
|
|
620
|
+
all_shaps = [all_shaps[:, :, k].flatten() for k in range(all_shaps.shape[2])]
|
|
621
|
+
else:
|
|
622
|
+
all_shaps = shap_values_fold[:, :-1]
|
|
623
|
+
all_shaps = [all_shaps[:, k].flatten() for k in range(all_shaps.shape[1])]
|
|
624
|
+
|
|
625
|
+
all_shaps = np.abs(all_shaps)
|
|
626
|
+
|
|
627
|
+
return dict(zip(estimator.feature_names_, all_shaps))
|
|
628
|
+
|
|
629
|
+
except Exception:
|
|
630
|
+
self.logger.exception("Failed to recalculate new SHAP values")
|
|
631
|
+
return None
|
|
632
|
+
|
|
576
633
|
|
|
577
634
|
class LightGBMWrapper(EstimatorWrapper):
|
|
578
635
|
def __init__(
|
upgini/utils/display_utils.py
CHANGED
|
@@ -9,6 +9,7 @@ from typing import Callable, List, Optional
|
|
|
9
9
|
|
|
10
10
|
import pandas as pd
|
|
11
11
|
from xhtml2pdf import pisa
|
|
12
|
+
|
|
12
13
|
from upgini.__about__ import __version__
|
|
13
14
|
|
|
14
15
|
|
|
@@ -72,7 +73,9 @@ def make_table(df: pd.DataFrame, wrap_long_string=None) -> str:
|
|
|
72
73
|
)
|
|
73
74
|
|
|
74
75
|
|
|
75
|
-
def display_html_dataframe(
|
|
76
|
+
def display_html_dataframe(
|
|
77
|
+
df: pd.DataFrame, internal_df: pd.DataFrame, header: str, display_id: Optional[str] = None, display_handle=None
|
|
78
|
+
):
|
|
76
79
|
if not ipython_available():
|
|
77
80
|
print(header)
|
|
78
81
|
print(internal_df)
|
|
@@ -133,7 +136,10 @@ def display_html_dataframe(df: pd.DataFrame, internal_df: pd.DataFrame, header:
|
|
|
133
136
|
{table_html}
|
|
134
137
|
</div>
|
|
135
138
|
"""
|
|
136
|
-
|
|
139
|
+
if display_handle:
|
|
140
|
+
return display_handle.update(HTML(result_html))
|
|
141
|
+
else:
|
|
142
|
+
return display(HTML(result_html), display_id=display_id)
|
|
137
143
|
|
|
138
144
|
|
|
139
145
|
def make_html_report(
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.2.15
|
|
3
|
+
Version: 1.2.16
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -145,7 +145,7 @@ Description-Content-Type: text/markdown
|
|
|
145
145
|
|
|
146
146
|
## 💼 Tutorials
|
|
147
147
|
|
|
148
|
-
### [Search of relevant external features & Automated feature generation for Salary
|
|
148
|
+
### [Search of relevant external features & Automated feature generation for Salary prediction task (use as a template)](https://github.com/upgini/upgini/blob/main/notebooks/Upgini_Features_search%26generation.ipynb)
|
|
149
149
|
|
|
150
150
|
* The goal is to predict salary for data science job posting based on information about employer and job description.
|
|
151
151
|
* Following this guide, you'll learn how to **search & auto generate new relevant features with Upgini library**
|
|
@@ -1,13 +1,13 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=7RIJ-Nh9kHJf5O5NvahUeP8DNXq6oIzbYcIt_yKv0lQ,23
|
|
2
2
|
upgini/__init__.py,sha256=M64LwQTBa-5Jz24Zm2h8rWwlKQQ1J8nP7gGgIciS0WU,589
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=d3-QHyb8vYuWBImHumeA-BBrgT--5V-L91WgU1vduR8,190892
|
|
7
7
|
upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
9
|
upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
|
|
10
|
-
upgini/metrics.py,sha256=
|
|
10
|
+
upgini/metrics.py,sha256=lhLqFv1tLWNzx3ULELo3MMSqI8eBoHL7P5jKpG8a6PE,33899
|
|
11
11
|
upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
|
|
12
12
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
13
13
|
upgini/version_validator.py,sha256=ddSKUK_-eGJB3NgrqOMoWJU-OxQ253WsNLp8aqJkaIM,1389
|
|
@@ -44,7 +44,7 @@ upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDc
|
|
|
44
44
|
upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
|
45
45
|
upgini/utils/datetime_utils.py,sha256=4tsGeehU0KS6wqNsc9gEEWZ9s6T9E0UReUIO3rSuXNU,12174
|
|
46
46
|
upgini/utils/deduplicate_utils.py,sha256=NpaPtBYXwUtfKTRHWrtz2uUq6tZN6C_Nd719ydPRF2Q,8484
|
|
47
|
-
upgini/utils/display_utils.py,sha256=
|
|
47
|
+
upgini/utils/display_utils.py,sha256=ntmOs8VchrkPFVNp4iEhdXRRtYx823vJ51saECoY6Bk,11175
|
|
48
48
|
upgini/utils/email_utils.py,sha256=j0Ug1R_0AnCg1Y92zIZ4XMwvKo3G5_pcOlBN1OH_gZs,5191
|
|
49
49
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
|
50
50
|
upgini/utils/features_validator.py,sha256=yiOdzVtpArELMufzAa9mtWq32lETB6sIF-w3Yvl3vV8,3614
|
|
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
|
|
|
57
57
|
upgini/utils/target_utils.py,sha256=qHzZRmICFbLNCrmVqGkaBcjm91L2ERRZMppci36acV4,10085
|
|
58
58
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
59
59
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
60
|
-
upgini-1.2.
|
|
61
|
-
upgini-1.2.
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.
|
|
60
|
+
upgini-1.2.16.dist-info/METADATA,sha256=hmLjva5M3m8LpawvBgEjKCrilENMBJAfax6mDnygVyw,48578
|
|
61
|
+
upgini-1.2.16.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
62
|
+
upgini-1.2.16.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
63
|
+
upgini-1.2.16.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|