upgini 1.2.16__py3-none-any.whl → 1.2.16a3654.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +34 -92
- upgini/metrics.py +9 -66
- upgini/utils/display_utils.py +2 -8
- {upgini-1.2.16.dist-info → upgini-1.2.16a3654.dev1.dist-info}/METADATA +2 -2
- {upgini-1.2.16.dist-info → upgini-1.2.16a3654.dev1.dist-info}/RECORD +8 -8
- {upgini-1.2.16.dist-info → upgini-1.2.16a3654.dev1.dist-info}/WHEEL +1 -1
- {upgini-1.2.16.dist-info → upgini-1.2.16a3654.dev1.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.
|
|
1
|
+
__version__ = "1.2.16a3654.dev1"
|
upgini/features_enricher.py
CHANGED
|
@@ -165,6 +165,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
165
165
|
RANDOM_STATE = 42
|
|
166
166
|
CALCULATE_METRICS_THRESHOLD = 50_000_000
|
|
167
167
|
CALCULATE_METRICS_MIN_THRESHOLD = 500
|
|
168
|
+
TEXT_FEATURES_THRESHOLD = 5_000
|
|
168
169
|
GENERATE_FEATURES_LIMIT = 10
|
|
169
170
|
EMPTY_FEATURES_INFO = pd.DataFrame(
|
|
170
171
|
columns=[
|
|
@@ -336,7 +337,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
336
337
|
self.exclude_columns = exclude_columns
|
|
337
338
|
self.baseline_score_column = baseline_score_column
|
|
338
339
|
self.add_date_if_missing = add_date_if_missing
|
|
339
|
-
self.features_info_display_handle = None
|
|
340
340
|
|
|
341
341
|
def _get_api_key(self):
|
|
342
342
|
return self._api_key
|
|
@@ -872,13 +872,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
872
872
|
else None
|
|
873
873
|
)
|
|
874
874
|
|
|
875
|
-
if self.X is None:
|
|
876
|
-
self.X = X
|
|
877
|
-
if self.y is None:
|
|
878
|
-
self.y = y
|
|
879
|
-
if self.eval_set is None:
|
|
880
|
-
self.eval_set = effective_eval_set
|
|
881
|
-
|
|
882
875
|
try:
|
|
883
876
|
self.__log_debug_information(
|
|
884
877
|
validated_X,
|
|
@@ -946,14 +939,16 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
946
939
|
|
|
947
940
|
gc.collect()
|
|
948
941
|
|
|
949
|
-
if fitting_X.shape[
|
|
950
|
-
print(self.bundle.get("metrics_no_important_free_features"))
|
|
951
|
-
self.logger.warning("No client or free relevant ADS features found to calculate metrics")
|
|
952
|
-
self.warning_counter.increment()
|
|
953
|
-
return None
|
|
942
|
+
text_features = self.generate_features if fitting_X.shape[0] >= self.TEXT_FEATURES_THRESHOLD else []
|
|
954
943
|
|
|
955
944
|
print(self.bundle.get("metrics_start"))
|
|
956
945
|
with Spinner():
|
|
946
|
+
if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
|
|
947
|
+
print(self.bundle.get("metrics_no_important_free_features"))
|
|
948
|
+
self.logger.warning("No client or free relevant ADS features found to calculate metrics")
|
|
949
|
+
self.warning_counter.increment()
|
|
950
|
+
return None
|
|
951
|
+
|
|
957
952
|
self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
|
|
958
953
|
|
|
959
954
|
has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
|
|
@@ -967,7 +962,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
967
962
|
fitting_enriched_X,
|
|
968
963
|
scoring,
|
|
969
964
|
groups=groups,
|
|
970
|
-
text_features=
|
|
965
|
+
text_features=text_features,
|
|
971
966
|
has_date=has_date,
|
|
972
967
|
)
|
|
973
968
|
metric = wrapper.metric_name
|
|
@@ -994,10 +989,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
994
989
|
cat_features,
|
|
995
990
|
add_params=custom_loss_add_params,
|
|
996
991
|
groups=groups,
|
|
997
|
-
text_features=
|
|
992
|
+
text_features=text_features,
|
|
998
993
|
has_date=has_date,
|
|
999
994
|
)
|
|
1000
|
-
etalon_metric
|
|
995
|
+
etalon_metric = baseline_estimator.cross_val_predict(
|
|
1001
996
|
fitting_X, y_sorted, self.baseline_score_column
|
|
1002
997
|
)
|
|
1003
998
|
if etalon_metric is None:
|
|
@@ -1028,16 +1023,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1028
1023
|
cat_features,
|
|
1029
1024
|
add_params=custom_loss_add_params,
|
|
1030
1025
|
groups=groups,
|
|
1031
|
-
text_features=
|
|
1026
|
+
text_features=text_features,
|
|
1032
1027
|
has_date=has_date,
|
|
1033
1028
|
)
|
|
1034
|
-
enriched_metric
|
|
1035
|
-
fitting_enriched_X, enriched_y_sorted
|
|
1036
|
-
)
|
|
1037
|
-
|
|
1038
|
-
if enriched_shaps is not None:
|
|
1039
|
-
self._update_shap_values(enriched_shaps)
|
|
1040
|
-
|
|
1029
|
+
enriched_metric = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
|
|
1041
1030
|
if enriched_metric is None:
|
|
1042
1031
|
self.logger.warning(
|
|
1043
1032
|
f"Enriched {metric} on train combined features is None (maybe all features was removed)"
|
|
@@ -1170,6 +1159,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1170
1159
|
elif uplift_col in metrics_df.columns and (metrics_df[uplift_col] < 0).any():
|
|
1171
1160
|
self.logger.warning("Uplift is negative")
|
|
1172
1161
|
|
|
1162
|
+
if self.X is None:
|
|
1163
|
+
self.X = X
|
|
1164
|
+
if self.y is None:
|
|
1165
|
+
self.y = y
|
|
1166
|
+
if self.eval_set is None:
|
|
1167
|
+
self.eval_set = effective_eval_set
|
|
1168
|
+
|
|
1173
1169
|
return metrics_df
|
|
1174
1170
|
except Exception as e:
|
|
1175
1171
|
error_message = "Failed to calculate metrics" + (
|
|
@@ -1194,48 +1190,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1194
1190
|
finally:
|
|
1195
1191
|
self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
|
|
1196
1192
|
|
|
1197
|
-
def _update_shap_values(self, new_shaps: Dict[str, float]):
|
|
1198
|
-
new_shaps = {
|
|
1199
|
-
feature: self._round_shap_value(shap)
|
|
1200
|
-
for feature, shap in new_shaps.items()
|
|
1201
|
-
if feature in self.feature_names_
|
|
1202
|
-
}
|
|
1203
|
-
features_importances = list(new_shaps.items())
|
|
1204
|
-
features_importances.sort(key=lambda m: (-m[1], m[0]))
|
|
1205
|
-
self.feature_names_, self.feature_importances_ = zip(*features_importances)
|
|
1206
|
-
self.feature_names_ = list(self.feature_names_)
|
|
1207
|
-
self.feature_importances_ = list(self.feature_importances_)
|
|
1208
|
-
|
|
1209
|
-
feature_name_header = self.bundle.get("features_info_name")
|
|
1210
|
-
shap_value_header = self.bundle.get("features_info_shap")
|
|
1211
|
-
|
|
1212
|
-
def update_shap(row):
|
|
1213
|
-
return new_shaps.get(row[feature_name_header], row[shap_value_header])
|
|
1214
|
-
|
|
1215
|
-
self.features_info[shap_value_header] = self.features_info.apply(update_shap, axis=1)
|
|
1216
|
-
self._internal_features_info[shap_value_header] = self._internal_features_info.apply(update_shap, axis=1)
|
|
1217
|
-
self._features_info_without_links[shap_value_header] = self._features_info_without_links.apply(
|
|
1218
|
-
update_shap, axis=1
|
|
1219
|
-
)
|
|
1220
|
-
self.logger.info(f"Recalculated SHAP values:\n{self._features_info_without_links}")
|
|
1221
|
-
|
|
1222
|
-
self.features_info.sort_values(by=shap_value_header, ascending=False, inplace=True)
|
|
1223
|
-
self._internal_features_info.sort_values(by=shap_value_header, ascending=False, inplace=True)
|
|
1224
|
-
self._features_info_without_links.sort_values(by=shap_value_header, ascending=False, inplace=True)
|
|
1225
|
-
|
|
1226
|
-
if self.features_info_display_handle:
|
|
1227
|
-
try:
|
|
1228
|
-
_ = get_ipython() # type: ignore
|
|
1229
|
-
|
|
1230
|
-
display_html_dataframe(
|
|
1231
|
-
self.features_info,
|
|
1232
|
-
self._features_info_without_links,
|
|
1233
|
-
self.bundle.get("relevant_features_header"),
|
|
1234
|
-
display_handle=self.features_info_display_handle,
|
|
1235
|
-
)
|
|
1236
|
-
except (ImportError, NameError):
|
|
1237
|
-
print(self._internal_features_info)
|
|
1238
|
-
|
|
1239
1193
|
def _check_train_and_eval_target_distribution(self, y, eval_set_dict):
|
|
1240
1194
|
uneven_distribution = False
|
|
1241
1195
|
for eval_set in eval_set_dict.values():
|
|
@@ -1564,19 +1518,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1564
1518
|
self.logger.info("No external features selected. So use only input datasets for metrics calculation")
|
|
1565
1519
|
return self.__sample_only_input(validated_X, validated_y, eval_set, is_demo_dataset)
|
|
1566
1520
|
# TODO save and check if dataset was deduplicated - use imbalance branch for such case
|
|
1567
|
-
elif
|
|
1568
|
-
not self.imbalanced
|
|
1569
|
-
and not exclude_features_sources
|
|
1570
|
-
and is_input_same_as_fit
|
|
1571
|
-
and self.df_with_original_index is not None
|
|
1572
|
-
):
|
|
1521
|
+
elif not self.imbalanced and not exclude_features_sources and is_input_same_as_fit:
|
|
1573
1522
|
self.logger.info("Dataset is not imbalanced, so use enriched_X from fit")
|
|
1574
1523
|
return self.__sample_balanced(eval_set, trace_id, remove_outliers_calc_metrics)
|
|
1575
1524
|
else:
|
|
1576
|
-
self.logger.info(
|
|
1577
|
-
"Dataset is imbalanced or exclude_features_sources or X was passed or this is saved search."
|
|
1578
|
-
" Run transform"
|
|
1579
|
-
)
|
|
1525
|
+
self.logger.info("Dataset is imbalanced or exclude_features_sources or X was passed. Run transform")
|
|
1580
1526
|
print(self.bundle.get("prepare_data_for_metrics"))
|
|
1581
1527
|
return self.__sample_imbalanced(
|
|
1582
1528
|
validated_X,
|
|
@@ -3431,13 +3377,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3431
3377
|
|
|
3432
3378
|
return result_train, result_eval_sets
|
|
3433
3379
|
|
|
3434
|
-
@staticmethod
|
|
3435
|
-
def _round_shap_value(shap: float) -> float:
|
|
3436
|
-
if shap > 0.0 and shap < 0.0001:
|
|
3437
|
-
return 0.0001
|
|
3438
|
-
else:
|
|
3439
|
-
return round(shap, 4)
|
|
3440
|
-
|
|
3441
3380
|
def __prepare_feature_importances(self, trace_id: str, x_columns: List[str], silent=False):
|
|
3442
3381
|
llm_source = "LLM with external data augmentation"
|
|
3443
3382
|
if self._search_task is None:
|
|
@@ -3455,6 +3394,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3455
3394
|
features_info_without_links = []
|
|
3456
3395
|
internal_features_info = []
|
|
3457
3396
|
|
|
3397
|
+
def round_shap_value(shap: float) -> float:
|
|
3398
|
+
if shap > 0.0 and shap < 0.0001:
|
|
3399
|
+
return 0.0001
|
|
3400
|
+
else:
|
|
3401
|
+
return round(shap, 4)
|
|
3402
|
+
|
|
3458
3403
|
def list_or_single(lst: List[str], single: str):
|
|
3459
3404
|
return lst or ([single] if single else [])
|
|
3460
3405
|
|
|
@@ -3487,7 +3432,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3487
3432
|
|
|
3488
3433
|
feature_sample = []
|
|
3489
3434
|
self.feature_names_.append(feature_meta.name)
|
|
3490
|
-
self.feature_importances_.append(
|
|
3435
|
+
self.feature_importances_.append(round_shap_value(feature_meta.shap_value))
|
|
3491
3436
|
if feature_meta.name in features_df.columns:
|
|
3492
3437
|
feature_sample = np.random.choice(features_df[feature_meta.name].dropna().unique(), 3).tolist()
|
|
3493
3438
|
if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
|
|
@@ -3526,7 +3471,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3526
3471
|
features_info.append(
|
|
3527
3472
|
{
|
|
3528
3473
|
self.bundle.get("features_info_name"): feature_name,
|
|
3529
|
-
self.bundle.get("features_info_shap"):
|
|
3474
|
+
self.bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
|
|
3530
3475
|
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3531
3476
|
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3532
3477
|
self.bundle.get("features_info_provider"): provider,
|
|
@@ -3537,7 +3482,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3537
3482
|
features_info_without_links.append(
|
|
3538
3483
|
{
|
|
3539
3484
|
self.bundle.get("features_info_name"): internal_feature_name,
|
|
3540
|
-
self.bundle.get("features_info_shap"):
|
|
3485
|
+
self.bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
|
|
3541
3486
|
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3542
3487
|
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3543
3488
|
self.bundle.get("features_info_provider"): internal_provider,
|
|
@@ -3549,7 +3494,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3549
3494
|
{
|
|
3550
3495
|
self.bundle.get("features_info_name"): internal_feature_name,
|
|
3551
3496
|
"feature_link": feature_meta.doc_link,
|
|
3552
|
-
self.bundle.get("features_info_shap"):
|
|
3497
|
+
self.bundle.get("features_info_shap"): round_shap_value(feature_meta.shap_value),
|
|
3553
3498
|
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3554
3499
|
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3555
3500
|
self.bundle.get("features_info_provider"): internal_provider,
|
|
@@ -3829,11 +3774,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3829
3774
|
print(Format.GREEN + Format.BOLD + msg + Format.END)
|
|
3830
3775
|
self.logger.info(msg)
|
|
3831
3776
|
if len(self.feature_names_) > 0:
|
|
3832
|
-
|
|
3833
|
-
self.features_info,
|
|
3834
|
-
self._features_info_without_links,
|
|
3835
|
-
self.bundle.get("relevant_features_header"),
|
|
3836
|
-
display_id="features_info",
|
|
3777
|
+
display_html_dataframe(
|
|
3778
|
+
self.features_info, self._features_info_without_links, self.bundle.get("relevant_features_header")
|
|
3837
3779
|
)
|
|
3838
3780
|
|
|
3839
3781
|
display_html_dataframe(
|
upgini/metrics.py
CHANGED
|
@@ -3,14 +3,13 @@ from __future__ import annotations
|
|
|
3
3
|
import inspect
|
|
4
4
|
import logging
|
|
5
5
|
import re
|
|
6
|
-
from collections import defaultdict
|
|
7
6
|
from copy import deepcopy
|
|
8
7
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
9
8
|
|
|
10
9
|
import catboost
|
|
11
10
|
import numpy as np
|
|
12
11
|
import pandas as pd
|
|
13
|
-
from catboost import
|
|
12
|
+
from catboost import CatBoostClassifier, CatBoostRegressor
|
|
14
13
|
from numpy import log1p
|
|
15
14
|
from pandas.api.types import is_numeric_dtype
|
|
16
15
|
from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
|
|
@@ -289,12 +288,9 @@ class EstimatorWrapper:
|
|
|
289
288
|
x, y, _ = self._prepare_data(x, y)
|
|
290
289
|
return x, y, {}
|
|
291
290
|
|
|
292
|
-
def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
|
|
293
|
-
return None
|
|
294
|
-
|
|
295
291
|
def cross_val_predict(
|
|
296
292
|
self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
|
|
297
|
-
) ->
|
|
293
|
+
) -> Optional[float]:
|
|
298
294
|
x, y, groups, fit_params = self._prepare_to_fit(x, y)
|
|
299
295
|
|
|
300
296
|
if x.shape[1] == 0:
|
|
@@ -302,7 +298,6 @@ class EstimatorWrapper:
|
|
|
302
298
|
|
|
303
299
|
scorer = check_scoring(self.estimator, scoring=self.scorer)
|
|
304
300
|
|
|
305
|
-
shap_values_all_folds = defaultdict(list)
|
|
306
301
|
if baseline_score_column is not None and self.metric_name == "GINI":
|
|
307
302
|
self.logger.info("Calculate baseline GINI on passed baseline_score_column and target")
|
|
308
303
|
metric = roc_auc_score(y, x[baseline_score_column])
|
|
@@ -324,29 +319,7 @@ class EstimatorWrapper:
|
|
|
324
319
|
self.check_fold_metrics(metrics_by_fold)
|
|
325
320
|
|
|
326
321
|
metric = np.mean(metrics_by_fold) * self.multiplier
|
|
327
|
-
|
|
328
|
-
splits = self.cv.split(x, y, groups)
|
|
329
|
-
|
|
330
|
-
for estimator, split in zip(self.cv_estimators, splits):
|
|
331
|
-
_, validation_idx = split
|
|
332
|
-
cv_x = x.iloc[validation_idx]
|
|
333
|
-
cv_y = y[validation_idx]
|
|
334
|
-
shaps = self.calculate_shap(cv_x, cv_y, estimator)
|
|
335
|
-
if shaps is not None:
|
|
336
|
-
for feature, shap_value in shaps.items():
|
|
337
|
-
# shap_values_all_folds[feature] = shap_values_all_folds.get(feature, []) + shap_value.tolist()
|
|
338
|
-
shap_values_all_folds[feature].extend(shap_value.tolist())
|
|
339
|
-
|
|
340
|
-
if shap_values_all_folds:
|
|
341
|
-
average_shap_values = {
|
|
342
|
-
feature: np.mean(np.array(shaps)) for feature, shaps in shap_values_all_folds.items() if len(shaps) > 0
|
|
343
|
-
}
|
|
344
|
-
if len(average_shap_values) == 0:
|
|
345
|
-
average_shap_values = None
|
|
346
|
-
else:
|
|
347
|
-
average_shap_values = None
|
|
348
|
-
|
|
349
|
-
return self.post_process_metric(metric), average_shap_values
|
|
322
|
+
return self.post_process_metric(metric)
|
|
350
323
|
|
|
351
324
|
def check_fold_metrics(self, metrics_by_fold: List[float]):
|
|
352
325
|
first_metric_sign = 1 if metrics_by_fold[0] >= 0 else -1
|
|
@@ -480,7 +453,6 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
480
453
|
)
|
|
481
454
|
self.cat_features = None
|
|
482
455
|
self.emb_features = None
|
|
483
|
-
self.grouped_embedding_features = None
|
|
484
456
|
self.exclude_features = []
|
|
485
457
|
|
|
486
458
|
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
|
@@ -490,16 +462,17 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
490
462
|
if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
|
|
491
463
|
emb_pattern = r"(.+)_emb\d+"
|
|
492
464
|
self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
|
|
465
|
+
embedding_features = []
|
|
493
466
|
if len(self.emb_features) > 3: # There is no reason to reduce embeddings dimension with less than 4
|
|
494
467
|
self.logger.info(
|
|
495
468
|
"Embedding features count more than 3, so group them into one vector for CatBoost: "
|
|
496
469
|
f"{self.emb_features}"
|
|
497
470
|
)
|
|
498
|
-
x,
|
|
499
|
-
params["embedding_features"] =
|
|
471
|
+
x, embedding_features = self.group_embeddings(x)
|
|
472
|
+
params["embedding_features"] = embedding_features
|
|
500
473
|
else:
|
|
501
474
|
self.logger.info(f"Embedding features count less than 3, so use them separately: {self.emb_features}")
|
|
502
|
-
self.
|
|
475
|
+
self.emb_features = []
|
|
503
476
|
else:
|
|
504
477
|
self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")
|
|
505
478
|
|
|
@@ -515,7 +488,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
515
488
|
self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
|
|
516
489
|
|
|
517
490
|
# Find rest categorical features
|
|
518
|
-
self.cat_features = _get_cat_features(x, self.text_features,
|
|
491
|
+
self.cat_features = _get_cat_features(x, self.text_features, embedding_features)
|
|
519
492
|
# x = fill_na_cat_features(x, self.cat_features)
|
|
520
493
|
unique_cat_features = []
|
|
521
494
|
for name in self.cat_features:
|
|
@@ -575,7 +548,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
575
548
|
|
|
576
549
|
def cross_val_predict(
|
|
577
550
|
self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
|
|
578
|
-
) ->
|
|
551
|
+
) -> Optional[float]:
|
|
579
552
|
try:
|
|
580
553
|
return super().cross_val_predict(x, y, baseline_score_column)
|
|
581
554
|
except Exception as e:
|
|
@@ -600,36 +573,6 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
600
573
|
else:
|
|
601
574
|
raise e
|
|
602
575
|
|
|
603
|
-
def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator: CatBoost) -> Optional[Dict[str, float]]:
|
|
604
|
-
try:
|
|
605
|
-
# Create Pool for fold data, if need (for example, when categorical features are present)
|
|
606
|
-
fold_pool = Pool(
|
|
607
|
-
x,
|
|
608
|
-
y,
|
|
609
|
-
cat_features=self.cat_features,
|
|
610
|
-
text_features=self.text_features,
|
|
611
|
-
embedding_features=self.grouped_embedding_features,
|
|
612
|
-
)
|
|
613
|
-
|
|
614
|
-
# Get SHAP values of current estimator
|
|
615
|
-
shap_values_fold = estimator.get_feature_importance(data=fold_pool, type="ShapValues")
|
|
616
|
-
|
|
617
|
-
# Remove last columns (base value) and flatten
|
|
618
|
-
if self.target_type == ModelTaskType.MULTICLASS:
|
|
619
|
-
all_shaps = shap_values_fold[:, :, :-1]
|
|
620
|
-
all_shaps = [all_shaps[:, :, k].flatten() for k in range(all_shaps.shape[2])]
|
|
621
|
-
else:
|
|
622
|
-
all_shaps = shap_values_fold[:, :-1]
|
|
623
|
-
all_shaps = [all_shaps[:, k].flatten() for k in range(all_shaps.shape[1])]
|
|
624
|
-
|
|
625
|
-
all_shaps = np.abs(all_shaps)
|
|
626
|
-
|
|
627
|
-
return dict(zip(estimator.feature_names_, all_shaps))
|
|
628
|
-
|
|
629
|
-
except Exception:
|
|
630
|
-
self.logger.exception("Failed to recalculate new SHAP values")
|
|
631
|
-
return None
|
|
632
|
-
|
|
633
576
|
|
|
634
577
|
class LightGBMWrapper(EstimatorWrapper):
|
|
635
578
|
def __init__(
|
upgini/utils/display_utils.py
CHANGED
|
@@ -9,7 +9,6 @@ from typing import Callable, List, Optional
|
|
|
9
9
|
|
|
10
10
|
import pandas as pd
|
|
11
11
|
from xhtml2pdf import pisa
|
|
12
|
-
|
|
13
12
|
from upgini.__about__ import __version__
|
|
14
13
|
|
|
15
14
|
|
|
@@ -73,9 +72,7 @@ def make_table(df: pd.DataFrame, wrap_long_string=None) -> str:
|
|
|
73
72
|
)
|
|
74
73
|
|
|
75
74
|
|
|
76
|
-
def display_html_dataframe(
|
|
77
|
-
df: pd.DataFrame, internal_df: pd.DataFrame, header: str, display_id: Optional[str] = None, display_handle=None
|
|
78
|
-
):
|
|
75
|
+
def display_html_dataframe(df: pd.DataFrame, internal_df: pd.DataFrame, header: str):
|
|
79
76
|
if not ipython_available():
|
|
80
77
|
print(header)
|
|
81
78
|
print(internal_df)
|
|
@@ -136,10 +133,7 @@ def display_html_dataframe(
|
|
|
136
133
|
{table_html}
|
|
137
134
|
</div>
|
|
138
135
|
"""
|
|
139
|
-
|
|
140
|
-
return display_handle.update(HTML(result_html))
|
|
141
|
-
else:
|
|
142
|
-
return display(HTML(result_html), display_id=display_id)
|
|
136
|
+
display(HTML(result_html))
|
|
143
137
|
|
|
144
138
|
|
|
145
139
|
def make_html_report(
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.2.
|
|
3
|
+
Version: 1.2.16a3654.dev1
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -145,7 +145,7 @@ Description-Content-Type: text/markdown
|
|
|
145
145
|
|
|
146
146
|
## 💼 Tutorials
|
|
147
147
|
|
|
148
|
-
### [Search of relevant external features & Automated feature generation for Salary
|
|
148
|
+
### [Search of relevant external features & Automated feature generation for Salary predicton task (use as a template)](https://github.com/upgini/upgini/blob/main/notebooks/Upgini_Features_search%26generation.ipynb)
|
|
149
149
|
|
|
150
150
|
* The goal is to predict salary for data science job postning based on information about employer and job description.
|
|
151
151
|
* Following this guide, you'll learn how to **search & auto generate new relevant features with Upgini library**
|
|
@@ -1,13 +1,13 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=upE7bNrEHNfQDR1MGPVPLLPvm1ag4pPCWAONnpxZPyE,33
|
|
2
2
|
upgini/__init__.py,sha256=M64LwQTBa-5Jz24Zm2h8rWwlKQQ1J8nP7gGgIciS0WU,589
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=4lfofrRPndG_CFMownDHZuXTnfMgDF1a8hW-ShdU8ns,188446
|
|
7
7
|
upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
9
|
upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
|
|
10
|
-
upgini/metrics.py,sha256=
|
|
10
|
+
upgini/metrics.py,sha256=bgi1rc3vCCeCuwRX1doQSQCzaV5OEiYHv_6XIvapnaw,31254
|
|
11
11
|
upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
|
|
12
12
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
13
13
|
upgini/version_validator.py,sha256=ddSKUK_-eGJB3NgrqOMoWJU-OxQ253WsNLp8aqJkaIM,1389
|
|
@@ -44,7 +44,7 @@ upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDc
|
|
|
44
44
|
upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
|
45
45
|
upgini/utils/datetime_utils.py,sha256=4tsGeehU0KS6wqNsc9gEEWZ9s6T9E0UReUIO3rSuXNU,12174
|
|
46
46
|
upgini/utils/deduplicate_utils.py,sha256=NpaPtBYXwUtfKTRHWrtz2uUq6tZN6C_Nd719ydPRF2Q,8484
|
|
47
|
-
upgini/utils/display_utils.py,sha256=
|
|
47
|
+
upgini/utils/display_utils.py,sha256=A2ouB5eiZ-Kyt9ykYxkLQwyoRPrdYeJymwNTiajtFXs,10990
|
|
48
48
|
upgini/utils/email_utils.py,sha256=j0Ug1R_0AnCg1Y92zIZ4XMwvKo3G5_pcOlBN1OH_gZs,5191
|
|
49
49
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
|
50
50
|
upgini/utils/features_validator.py,sha256=yiOdzVtpArELMufzAa9mtWq32lETB6sIF-w3Yvl3vV8,3614
|
|
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
|
|
|
57
57
|
upgini/utils/target_utils.py,sha256=qHzZRmICFbLNCrmVqGkaBcjm91L2ERRZMppci36acV4,10085
|
|
58
58
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
59
59
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
60
|
-
upgini-1.2.
|
|
61
|
-
upgini-1.2.
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.
|
|
60
|
+
upgini-1.2.16a3654.dev1.dist-info/METADATA,sha256=Xg2vPhmfT0fLFYRqiv7k5FYiBLA-vpHa11oks6tRDhI,48587
|
|
61
|
+
upgini-1.2.16a3654.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
|
62
|
+
upgini-1.2.16a3654.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
63
|
+
upgini-1.2.16a3654.dev1.dist-info/RECORD,,
|
|
File without changes
|