upgini 1.2.16a3654.dev3__py3-none-any.whl → 1.2.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +133 -39
- upgini/metrics.py +68 -11
- upgini/resource_bundle/strings.properties +1 -0
- upgini/utils/display_utils.py +18 -5
- {upgini-1.2.16a3654.dev3.dist-info → upgini-1.2.17.dist-info}/METADATA +2 -2
- {upgini-1.2.16a3654.dev3.dist-info → upgini-1.2.17.dist-info}/RECORD +9 -9
- {upgini-1.2.16a3654.dev3.dist-info → upgini-1.2.17.dist-info}/WHEEL +1 -1
- {upgini-1.2.16a3654.dev3.dist-info → upgini-1.2.17.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.
|
|
1
|
+
__version__ = "1.2.17"
|
upgini/features_enricher.py
CHANGED
|
@@ -165,7 +165,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
165
165
|
RANDOM_STATE = 42
|
|
166
166
|
CALCULATE_METRICS_THRESHOLD = 50_000_000
|
|
167
167
|
CALCULATE_METRICS_MIN_THRESHOLD = 500
|
|
168
|
-
TEXT_FEATURES_THRESHOLD = 5_000
|
|
169
168
|
GENERATE_FEATURES_LIMIT = 10
|
|
170
169
|
EMPTY_FEATURES_INFO = pd.DataFrame(
|
|
171
170
|
columns=[
|
|
@@ -337,6 +336,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
337
336
|
self.exclude_columns = exclude_columns
|
|
338
337
|
self.baseline_score_column = baseline_score_column
|
|
339
338
|
self.add_date_if_missing = add_date_if_missing
|
|
339
|
+
self.features_info_display_handle = None
|
|
340
|
+
self.data_sources_display_handle = None
|
|
341
|
+
self.report_button_handle = None
|
|
340
342
|
|
|
341
343
|
def _get_api_key(self):
|
|
342
344
|
return self._api_key
|
|
@@ -872,6 +874,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
872
874
|
else None
|
|
873
875
|
)
|
|
874
876
|
|
|
877
|
+
if self.X is None:
|
|
878
|
+
self.X = X
|
|
879
|
+
if self.y is None:
|
|
880
|
+
self.y = y
|
|
881
|
+
if self.eval_set is None:
|
|
882
|
+
self.eval_set = effective_eval_set
|
|
883
|
+
|
|
875
884
|
try:
|
|
876
885
|
self.__log_debug_information(
|
|
877
886
|
validated_X,
|
|
@@ -939,16 +948,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
939
948
|
|
|
940
949
|
gc.collect()
|
|
941
950
|
|
|
942
|
-
|
|
951
|
+
if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
|
|
952
|
+
print(self.bundle.get("metrics_no_important_free_features"))
|
|
953
|
+
self.logger.warning("No client or free relevant ADS features found to calculate metrics")
|
|
954
|
+
self.warning_counter.increment()
|
|
955
|
+
return None
|
|
943
956
|
|
|
944
957
|
print(self.bundle.get("metrics_start"))
|
|
945
958
|
with Spinner():
|
|
946
|
-
if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
|
|
947
|
-
print(self.bundle.get("metrics_no_important_free_features"))
|
|
948
|
-
self.logger.warning("No client or free relevant ADS features found to calculate metrics")
|
|
949
|
-
self.warning_counter.increment()
|
|
950
|
-
return None
|
|
951
|
-
|
|
952
959
|
self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
|
|
953
960
|
|
|
954
961
|
has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
|
|
@@ -962,7 +969,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
962
969
|
fitting_enriched_X,
|
|
963
970
|
scoring,
|
|
964
971
|
groups=groups,
|
|
965
|
-
text_features=
|
|
972
|
+
text_features=self.generate_features,
|
|
966
973
|
has_date=has_date,
|
|
967
974
|
)
|
|
968
975
|
metric = wrapper.metric_name
|
|
@@ -989,10 +996,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
989
996
|
cat_features,
|
|
990
997
|
add_params=custom_loss_add_params,
|
|
991
998
|
groups=groups,
|
|
992
|
-
text_features=
|
|
999
|
+
text_features=self.generate_features,
|
|
993
1000
|
has_date=has_date,
|
|
994
1001
|
)
|
|
995
|
-
etalon_metric = baseline_estimator.cross_val_predict(
|
|
1002
|
+
etalon_metric, _ = baseline_estimator.cross_val_predict(
|
|
996
1003
|
fitting_X, y_sorted, self.baseline_score_column
|
|
997
1004
|
)
|
|
998
1005
|
if etalon_metric is None:
|
|
@@ -1023,10 +1030,16 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1023
1030
|
cat_features,
|
|
1024
1031
|
add_params=custom_loss_add_params,
|
|
1025
1032
|
groups=groups,
|
|
1026
|
-
text_features=
|
|
1033
|
+
text_features=self.generate_features,
|
|
1027
1034
|
has_date=has_date,
|
|
1028
1035
|
)
|
|
1029
|
-
enriched_metric = enriched_estimator.cross_val_predict(
|
|
1036
|
+
enriched_metric, enriched_shaps = enriched_estimator.cross_val_predict(
|
|
1037
|
+
fitting_enriched_X, enriched_y_sorted
|
|
1038
|
+
)
|
|
1039
|
+
|
|
1040
|
+
if enriched_shaps is not None:
|
|
1041
|
+
self._update_shap_values(enriched_shaps)
|
|
1042
|
+
|
|
1030
1043
|
if enriched_metric is None:
|
|
1031
1044
|
self.logger.warning(
|
|
1032
1045
|
f"Enriched {metric} on train combined features is None (maybe all features was removed)"
|
|
@@ -1159,13 +1172,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1159
1172
|
elif uplift_col in metrics_df.columns and (metrics_df[uplift_col] < 0).any():
|
|
1160
1173
|
self.logger.warning("Uplift is negative")
|
|
1161
1174
|
|
|
1162
|
-
if self.X is None:
|
|
1163
|
-
self.X = X
|
|
1164
|
-
if self.y is None:
|
|
1165
|
-
self.y = y
|
|
1166
|
-
if self.eval_set is None:
|
|
1167
|
-
self.eval_set = effective_eval_set
|
|
1168
|
-
|
|
1169
1175
|
return metrics_df
|
|
1170
1176
|
except Exception as e:
|
|
1171
1177
|
error_message = "Failed to calculate metrics" + (
|
|
@@ -1190,6 +1196,72 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1190
1196
|
finally:
|
|
1191
1197
|
self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
|
|
1192
1198
|
|
|
1199
|
+
def _update_shap_values(self, new_shaps: Dict[str, float]):
|
|
1200
|
+
new_shaps = {
|
|
1201
|
+
feature: self._round_shap_value(shap)
|
|
1202
|
+
for feature, shap in new_shaps.items()
|
|
1203
|
+
if feature in self.feature_names_
|
|
1204
|
+
}
|
|
1205
|
+
features_importances = list(new_shaps.items())
|
|
1206
|
+
features_importances.sort(key=lambda m: (-m[1], m[0]))
|
|
1207
|
+
self.feature_names_, self.feature_importances_ = zip(*features_importances)
|
|
1208
|
+
self.feature_names_ = list(self.feature_names_)
|
|
1209
|
+
self.feature_importances_ = list(self.feature_importances_)
|
|
1210
|
+
|
|
1211
|
+
feature_name_header = self.bundle.get("features_info_name")
|
|
1212
|
+
shap_value_header = self.bundle.get("features_info_shap")
|
|
1213
|
+
|
|
1214
|
+
def update_shap(row):
|
|
1215
|
+
return new_shaps.get(row[feature_name_header], row[shap_value_header])
|
|
1216
|
+
|
|
1217
|
+
self.features_info[shap_value_header] = self.features_info.apply(update_shap, axis=1)
|
|
1218
|
+
self._internal_features_info[shap_value_header] = self._internal_features_info.apply(update_shap, axis=1)
|
|
1219
|
+
self._features_info_without_links[shap_value_header] = self._features_info_without_links.apply(
|
|
1220
|
+
update_shap, axis=1
|
|
1221
|
+
)
|
|
1222
|
+
self.logger.info(f"Recalculated SHAP values:\n{self._features_info_without_links}")
|
|
1223
|
+
|
|
1224
|
+
self.features_info.sort_values(by=shap_value_header, ascending=False, inplace=True)
|
|
1225
|
+
self._internal_features_info.sort_values(by=shap_value_header, ascending=False, inplace=True)
|
|
1226
|
+
self._features_info_without_links.sort_values(by=shap_value_header, ascending=False, inplace=True)
|
|
1227
|
+
|
|
1228
|
+
self.relevant_data_sources = self._group_relevant_data_sources(self.features_info, self.bundle)
|
|
1229
|
+
self._relevant_data_sources_wo_links = self._group_relevant_data_sources(
|
|
1230
|
+
self._features_info_without_links, self.bundle
|
|
1231
|
+
)
|
|
1232
|
+
|
|
1233
|
+
if self.features_info_display_handle is not None:
|
|
1234
|
+
try:
|
|
1235
|
+
_ = get_ipython() # type: ignore
|
|
1236
|
+
|
|
1237
|
+
display_html_dataframe(
|
|
1238
|
+
self.features_info,
|
|
1239
|
+
self._features_info_without_links,
|
|
1240
|
+
self.bundle.get("relevant_features_header"),
|
|
1241
|
+
display_handle=self.features_info_display_handle,
|
|
1242
|
+
)
|
|
1243
|
+
except (ImportError, NameError):
|
|
1244
|
+
print(self._internal_features_info)
|
|
1245
|
+
if self.data_sources_display_handle is not None:
|
|
1246
|
+
try:
|
|
1247
|
+
_ = get_ipython() # type: ignore
|
|
1248
|
+
|
|
1249
|
+
display_html_dataframe(
|
|
1250
|
+
self.relevant_data_sources,
|
|
1251
|
+
self._relevant_data_sources_wo_links,
|
|
1252
|
+
self.bundle.get("relevant_features_header"),
|
|
1253
|
+
display_handle=self.data_sources_display_handle,
|
|
1254
|
+
)
|
|
1255
|
+
except (ImportError, NameError):
|
|
1256
|
+
print(self._relevant_data_sources_wo_links)
|
|
1257
|
+
if self.report_button_handle is not None:
|
|
1258
|
+
try:
|
|
1259
|
+
_ = get_ipython() # type: ignore
|
|
1260
|
+
|
|
1261
|
+
self.__show_report_button(display_handle=self.report_button_handle)
|
|
1262
|
+
except (ImportError, NameError):
|
|
1263
|
+
pass
|
|
1264
|
+
|
|
1193
1265
|
def _check_train_and_eval_target_distribution(self, y, eval_set_dict):
|
|
1194
1266
|
uneven_distribution = False
|
|
1195
1267
|
for eval_set in eval_set_dict.values():
|
|
@@ -1518,11 +1590,19 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1518
1590
|
self.logger.info("No external features selected. So use only input datasets for metrics calculation")
|
|
1519
1591
|
return self.__sample_only_input(validated_X, validated_y, eval_set, is_demo_dataset)
|
|
1520
1592
|
# TODO save and check if dataset was deduplicated - use imbalance branch for such case
|
|
1521
|
-
elif
|
|
1593
|
+
elif (
|
|
1594
|
+
not self.imbalanced
|
|
1595
|
+
and not exclude_features_sources
|
|
1596
|
+
and is_input_same_as_fit
|
|
1597
|
+
and self.df_with_original_index is not None
|
|
1598
|
+
):
|
|
1522
1599
|
self.logger.info("Dataset is not imbalanced, so use enriched_X from fit")
|
|
1523
1600
|
return self.__sample_balanced(eval_set, trace_id, remove_outliers_calc_metrics)
|
|
1524
1601
|
else:
|
|
1525
|
-
self.logger.info(
|
|
1602
|
+
self.logger.info(
|
|
1603
|
+
"Dataset is imbalanced or exclude_features_sources or X was passed or this is saved search."
|
|
1604
|
+
" Run transform"
|
|
1605
|
+
)
|
|
1526
1606
|
print(self.bundle.get("prepare_data_for_metrics"))
|
|
1527
1607
|
return self.__sample_imbalanced(
|
|
1528
1608
|
validated_X,
|
|
@@ -2028,6 +2108,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2028
2108
|
runtime_parameters = self._get_copy_of_runtime_parameters()
|
|
2029
2109
|
features_for_transform = self._search_task.get_features_for_transform() or []
|
|
2030
2110
|
if len(features_for_transform) > 0:
|
|
2111
|
+
missing_features_for_transform = [
|
|
2112
|
+
columns_renaming.get(f) for f in features_for_transform if f not in df.columns
|
|
2113
|
+
]
|
|
2114
|
+
if len(missing_features_for_transform) > 0:
|
|
2115
|
+
raise ValidationError(
|
|
2116
|
+
self.bundle.get("missing_features_for_transform").format(missing_features_for_transform)
|
|
2117
|
+
)
|
|
2031
2118
|
runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
|
|
2032
2119
|
|
|
2033
2120
|
columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
|
|
@@ -2702,10 +2789,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2702
2789
|
progress_callback,
|
|
2703
2790
|
)
|
|
2704
2791
|
except Exception:
|
|
2705
|
-
self.__show_report_button()
|
|
2792
|
+
self.report_button_handle = self.__show_report_button(display_id="report_button")
|
|
2706
2793
|
raise
|
|
2707
2794
|
|
|
2708
|
-
self.__show_report_button()
|
|
2795
|
+
self.report_button_handle = self.__show_report_button(display_id="report_button")
|
|
2709
2796
|
|
|
2710
2797
|
if not self.warning_counter.has_warnings():
|
|
2711
2798
|
self.__display_support_link(self.bundle.get("all_ok_community_invite"))
|
|
@@ -3377,6 +3464,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3377
3464
|
|
|
3378
3465
|
return result_train, result_eval_sets
|
|
3379
3466
|
|
|
3467
|
+
@staticmethod
|
|
3468
|
+
def _round_shap_value(shap: float) -> float:
|
|
3469
|
+
if shap > 0.0 and shap < 0.0001:
|
|
3470
|
+
return 0.0001
|
|
3471
|
+
else:
|
|
3472
|
+
return round(shap, 4)
|
|
3473
|
+
|
|
3380
3474
|
def __prepare_feature_importances(self, trace_id: str, x_columns: List[str], silent=False):
|
|
3381
3475
|
llm_source = "LLM with external data augmentation"
|
|
3382
3476
|
if self._search_task is None:
|
|
@@ -3394,12 +3488,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3394
3488
|
features_info_without_links = []
|
|
3395
3489
|
internal_features_info = []
|
|
3396
3490
|
|
|
3397
|
-
def round_shap_value(shap: float) -> float:
|
|
3398
|
-
if shap > 0.0 and shap < 0.0001:
|
|
3399
|
-
return 0.0001
|
|
3400
|
-
else:
|
|
3401
|
-
return round(shap, 4)
|
|
3402
|
-
|
|
3403
3491
|
def list_or_single(lst: List[str], single: str):
|
|
3404
3492
|
return lst or ([single] if single else [])
|
|
3405
3493
|
|
|
@@ -3432,7 +3520,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3432
3520
|
|
|
3433
3521
|
feature_sample = []
|
|
3434
3522
|
self.feature_names_.append(feature_meta.name)
|
|
3435
|
-
self.feature_importances_.append(
|
|
3523
|
+
self.feature_importances_.append(self._round_shap_value(feature_meta.shap_value))
|
|
3436
3524
|
if feature_meta.name in features_df.columns:
|
|
3437
3525
|
feature_sample = np.random.choice(features_df[feature_meta.name].dropna().unique(), 3).tolist()
|
|
3438
3526
|
if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
|
|
@@ -3471,7 +3559,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3471
3559
|
features_info.append(
|
|
3472
3560
|
{
|
|
3473
3561
|
self.bundle.get("features_info_name"): feature_name,
|
|
3474
|
-
self.bundle.get("features_info_shap"):
|
|
3562
|
+
self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
|
|
3475
3563
|
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3476
3564
|
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3477
3565
|
self.bundle.get("features_info_provider"): provider,
|
|
@@ -3482,7 +3570,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3482
3570
|
features_info_without_links.append(
|
|
3483
3571
|
{
|
|
3484
3572
|
self.bundle.get("features_info_name"): internal_feature_name,
|
|
3485
|
-
self.bundle.get("features_info_shap"):
|
|
3573
|
+
self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
|
|
3486
3574
|
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3487
3575
|
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3488
3576
|
self.bundle.get("features_info_provider"): internal_provider,
|
|
@@ -3494,7 +3582,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3494
3582
|
{
|
|
3495
3583
|
self.bundle.get("features_info_name"): internal_feature_name,
|
|
3496
3584
|
"feature_link": feature_meta.doc_link,
|
|
3497
|
-
self.bundle.get("features_info_shap"):
|
|
3585
|
+
self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
|
|
3498
3586
|
self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
|
|
3499
3587
|
self.bundle.get("features_info_value_preview"): feature_sample,
|
|
3500
3588
|
self.bundle.get("features_info_provider"): internal_provider,
|
|
@@ -3774,14 +3862,18 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3774
3862
|
print(Format.GREEN + Format.BOLD + msg + Format.END)
|
|
3775
3863
|
self.logger.info(msg)
|
|
3776
3864
|
if len(self.feature_names_) > 0:
|
|
3777
|
-
display_html_dataframe(
|
|
3778
|
-
self.features_info,
|
|
3865
|
+
self.features_info_display_handle = display_html_dataframe(
|
|
3866
|
+
self.features_info,
|
|
3867
|
+
self._features_info_without_links,
|
|
3868
|
+
self.bundle.get("relevant_features_header"),
|
|
3869
|
+
display_id="features_info",
|
|
3779
3870
|
)
|
|
3780
3871
|
|
|
3781
|
-
display_html_dataframe(
|
|
3872
|
+
self.data_sources_display_handle = display_html_dataframe(
|
|
3782
3873
|
self.relevant_data_sources,
|
|
3783
3874
|
self._relevant_data_sources_wo_links,
|
|
3784
3875
|
self.bundle.get("relevant_data_sources_header"),
|
|
3876
|
+
display_id="data_sources",
|
|
3785
3877
|
)
|
|
3786
3878
|
else:
|
|
3787
3879
|
msg = self.bundle.get("features_info_zero_important_features")
|
|
@@ -3792,9 +3884,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3792
3884
|
print(msg)
|
|
3793
3885
|
print(self._internal_features_info)
|
|
3794
3886
|
|
|
3795
|
-
def __show_report_button(self):
|
|
3887
|
+
def __show_report_button(self, display_id: Optional[str] = None, display_handle=None):
|
|
3796
3888
|
try:
|
|
3797
|
-
prepare_and_show_report(
|
|
3889
|
+
return prepare_and_show_report(
|
|
3798
3890
|
relevant_features_df=self._features_info_without_links,
|
|
3799
3891
|
relevant_datasources_df=self.relevant_data_sources,
|
|
3800
3892
|
metrics_df=self.metrics,
|
|
@@ -3802,6 +3894,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3802
3894
|
search_id=self._search_task.search_task_id,
|
|
3803
3895
|
email=self.rest_client.get_current_email(),
|
|
3804
3896
|
search_keys=[str(sk) for sk in self.search_keys.values()],
|
|
3897
|
+
display_id=display_id,
|
|
3898
|
+
display_handle=display_handle,
|
|
3805
3899
|
)
|
|
3806
3900
|
except Exception:
|
|
3807
3901
|
pass
|
upgini/metrics.py
CHANGED
|
@@ -3,13 +3,14 @@ from __future__ import annotations
|
|
|
3
3
|
import inspect
|
|
4
4
|
import logging
|
|
5
5
|
import re
|
|
6
|
+
from collections import defaultdict
|
|
6
7
|
from copy import deepcopy
|
|
7
8
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
8
9
|
|
|
9
10
|
import catboost
|
|
10
11
|
import numpy as np
|
|
11
12
|
import pandas as pd
|
|
12
|
-
from catboost import CatBoostClassifier, CatBoostRegressor
|
|
13
|
+
from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor, Pool
|
|
13
14
|
from numpy import log1p
|
|
14
15
|
from pandas.api.types import is_numeric_dtype
|
|
15
16
|
from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
|
|
@@ -63,7 +64,7 @@ CATBOOST_BINARY_PARAMS = {
|
|
|
63
64
|
"verbose": False,
|
|
64
65
|
"random_state": DEFAULT_RANDOM_STATE,
|
|
65
66
|
"allow_writing_files": False,
|
|
66
|
-
|
|
67
|
+
"auto_class_weights": "Balanced",
|
|
67
68
|
}
|
|
68
69
|
|
|
69
70
|
CATBOOST_MULTICLASS_PARAMS = {
|
|
@@ -81,7 +82,7 @@ CATBOOST_MULTICLASS_PARAMS = {
|
|
|
81
82
|
"verbose": False,
|
|
82
83
|
"random_state": DEFAULT_RANDOM_STATE,
|
|
83
84
|
"allow_writing_files": False,
|
|
84
|
-
"auto_class_weights": "
|
|
85
|
+
"auto_class_weights": "Balanced",
|
|
85
86
|
}
|
|
86
87
|
|
|
87
88
|
LIGHTGBM_PARAMS = {
|
|
@@ -288,9 +289,12 @@ class EstimatorWrapper:
|
|
|
288
289
|
x, y, _ = self._prepare_data(x, y)
|
|
289
290
|
return x, y, {}
|
|
290
291
|
|
|
292
|
+
def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
|
|
293
|
+
return None
|
|
294
|
+
|
|
291
295
|
def cross_val_predict(
|
|
292
296
|
self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
|
|
293
|
-
) -> Optional[float]:
|
|
297
|
+
) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
|
|
294
298
|
x, y, groups, fit_params = self._prepare_to_fit(x, y)
|
|
295
299
|
|
|
296
300
|
if x.shape[1] == 0:
|
|
@@ -298,6 +302,7 @@ class EstimatorWrapper:
|
|
|
298
302
|
|
|
299
303
|
scorer = check_scoring(self.estimator, scoring=self.scorer)
|
|
300
304
|
|
|
305
|
+
shap_values_all_folds = defaultdict(list)
|
|
301
306
|
if baseline_score_column is not None and self.metric_name == "GINI":
|
|
302
307
|
self.logger.info("Calculate baseline GINI on passed baseline_score_column and target")
|
|
303
308
|
metric = roc_auc_score(y, x[baseline_score_column])
|
|
@@ -319,7 +324,29 @@ class EstimatorWrapper:
|
|
|
319
324
|
self.check_fold_metrics(metrics_by_fold)
|
|
320
325
|
|
|
321
326
|
metric = np.mean(metrics_by_fold) * self.multiplier
|
|
322
|
-
|
|
327
|
+
|
|
328
|
+
splits = self.cv.split(x, y, groups)
|
|
329
|
+
|
|
330
|
+
for estimator, split in zip(self.cv_estimators, splits):
|
|
331
|
+
_, validation_idx = split
|
|
332
|
+
cv_x = x.iloc[validation_idx]
|
|
333
|
+
cv_y = y[validation_idx]
|
|
334
|
+
shaps = self.calculate_shap(cv_x, cv_y, estimator)
|
|
335
|
+
if shaps is not None:
|
|
336
|
+
for feature, shap_value in shaps.items():
|
|
337
|
+
# shap_values_all_folds[feature] = shap_values_all_folds.get(feature, []) + shap_value.tolist()
|
|
338
|
+
shap_values_all_folds[feature].extend(shap_value.tolist())
|
|
339
|
+
|
|
340
|
+
if shap_values_all_folds:
|
|
341
|
+
average_shap_values = {
|
|
342
|
+
feature: np.mean(np.array(shaps)) for feature, shaps in shap_values_all_folds.items() if len(shaps) > 0
|
|
343
|
+
}
|
|
344
|
+
if len(average_shap_values) == 0:
|
|
345
|
+
average_shap_values = None
|
|
346
|
+
else:
|
|
347
|
+
average_shap_values = None
|
|
348
|
+
|
|
349
|
+
return self.post_process_metric(metric), average_shap_values
|
|
323
350
|
|
|
324
351
|
def check_fold_metrics(self, metrics_by_fold: List[float]):
|
|
325
352
|
first_metric_sign = 1 if metrics_by_fold[0] >= 0 else -1
|
|
@@ -453,6 +480,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
453
480
|
)
|
|
454
481
|
self.cat_features = None
|
|
455
482
|
self.emb_features = None
|
|
483
|
+
self.grouped_embedding_features = None
|
|
456
484
|
self.exclude_features = []
|
|
457
485
|
|
|
458
486
|
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
|
@@ -462,17 +490,16 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
462
490
|
if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
|
|
463
491
|
emb_pattern = r"(.+)_emb\d+"
|
|
464
492
|
self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
|
|
465
|
-
embedding_features = []
|
|
466
493
|
if len(self.emb_features) > 3: # There is no reason to reduce embeddings dimension with less than 4
|
|
467
494
|
self.logger.info(
|
|
468
495
|
"Embedding features count more than 3, so group them into one vector for CatBoost: "
|
|
469
496
|
f"{self.emb_features}"
|
|
470
497
|
)
|
|
471
|
-
x,
|
|
472
|
-
params["embedding_features"] =
|
|
498
|
+
x, self.grouped_embedding_features = self.group_embeddings(x)
|
|
499
|
+
params["embedding_features"] = self.grouped_embedding_features
|
|
473
500
|
else:
|
|
474
501
|
self.logger.info(f"Embedding features count less than 3, so use them separately: {self.emb_features}")
|
|
475
|
-
self.
|
|
502
|
+
self.grouped_embedding_features = None
|
|
476
503
|
else:
|
|
477
504
|
self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")
|
|
478
505
|
|
|
@@ -488,7 +515,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
488
515
|
self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
|
|
489
516
|
|
|
490
517
|
# Find rest categorical features
|
|
491
|
-
self.cat_features = _get_cat_features(x, self.text_features,
|
|
518
|
+
self.cat_features = _get_cat_features(x, self.text_features, self.grouped_embedding_features)
|
|
492
519
|
# x = fill_na_cat_features(x, self.cat_features)
|
|
493
520
|
unique_cat_features = []
|
|
494
521
|
for name in self.cat_features:
|
|
@@ -548,7 +575,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
548
575
|
|
|
549
576
|
def cross_val_predict(
|
|
550
577
|
self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
|
|
551
|
-
) -> Optional[float]:
|
|
578
|
+
) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
|
|
552
579
|
try:
|
|
553
580
|
return super().cross_val_predict(x, y, baseline_score_column)
|
|
554
581
|
except Exception as e:
|
|
@@ -573,6 +600,36 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
573
600
|
else:
|
|
574
601
|
raise e
|
|
575
602
|
|
|
603
|
+
def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator: CatBoost) -> Optional[Dict[str, float]]:
|
|
604
|
+
try:
|
|
605
|
+
# Create Pool for fold data, if need (for example, when categorical features are present)
|
|
606
|
+
fold_pool = Pool(
|
|
607
|
+
x,
|
|
608
|
+
y,
|
|
609
|
+
cat_features=self.cat_features,
|
|
610
|
+
text_features=self.text_features,
|
|
611
|
+
embedding_features=self.grouped_embedding_features,
|
|
612
|
+
)
|
|
613
|
+
|
|
614
|
+
# Get SHAP values of current estimator
|
|
615
|
+
shap_values_fold = estimator.get_feature_importance(data=fold_pool, type="ShapValues")
|
|
616
|
+
|
|
617
|
+
# Remove last columns (base value) and flatten
|
|
618
|
+
if self.target_type == ModelTaskType.MULTICLASS:
|
|
619
|
+
all_shaps = shap_values_fold[:, :, :-1]
|
|
620
|
+
all_shaps = [all_shaps[:, :, k].flatten() for k in range(all_shaps.shape[2])]
|
|
621
|
+
else:
|
|
622
|
+
all_shaps = shap_values_fold[:, :-1]
|
|
623
|
+
all_shaps = [all_shaps[:, k].flatten() for k in range(all_shaps.shape[1])]
|
|
624
|
+
|
|
625
|
+
all_shaps = np.abs(all_shaps)
|
|
626
|
+
|
|
627
|
+
return dict(zip(estimator.feature_names_, all_shaps))
|
|
628
|
+
|
|
629
|
+
except Exception:
|
|
630
|
+
self.logger.exception("Failed to recalculate new SHAP values")
|
|
631
|
+
return None
|
|
632
|
+
|
|
576
633
|
|
|
577
634
|
class LightGBMWrapper(EstimatorWrapper):
|
|
578
635
|
def __init__(
|
|
@@ -136,6 +136,7 @@ eval_y_is_empty=y in eval_set is empty.
|
|
|
136
136
|
x_and_eval_x_diff_types=X and eval_set X has different types: {} and {}
|
|
137
137
|
baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
|
|
138
138
|
baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and and retry
|
|
139
|
+
missing_features_for_transform=Missing some features for transform that were presented on fit: {}
|
|
139
140
|
# target validation
|
|
140
141
|
empty_target=Target is empty in all rows
|
|
141
142
|
# non_numeric_target=Binary target should be numerical type
|
upgini/utils/display_utils.py
CHANGED
|
@@ -9,6 +9,7 @@ from typing import Callable, List, Optional
|
|
|
9
9
|
|
|
10
10
|
import pandas as pd
|
|
11
11
|
from xhtml2pdf import pisa
|
|
12
|
+
|
|
12
13
|
from upgini.__about__ import __version__
|
|
13
14
|
|
|
14
15
|
|
|
@@ -72,7 +73,9 @@ def make_table(df: pd.DataFrame, wrap_long_string=None) -> str:
|
|
|
72
73
|
)
|
|
73
74
|
|
|
74
75
|
|
|
75
|
-
def display_html_dataframe(
|
|
76
|
+
def display_html_dataframe(
|
|
77
|
+
df: pd.DataFrame, internal_df: pd.DataFrame, header: str, display_id: Optional[str] = None, display_handle=None
|
|
78
|
+
):
|
|
76
79
|
if not ipython_available():
|
|
77
80
|
print(header)
|
|
78
81
|
print(internal_df)
|
|
@@ -133,7 +136,10 @@ def display_html_dataframe(df: pd.DataFrame, internal_df: pd.DataFrame, header:
|
|
|
133
136
|
{table_html}
|
|
134
137
|
</div>
|
|
135
138
|
"""
|
|
136
|
-
|
|
139
|
+
if display_handle:
|
|
140
|
+
return display_handle.update(HTML(result_html))
|
|
141
|
+
else:
|
|
142
|
+
return display(HTML(result_html), display_id=display_id)
|
|
137
143
|
|
|
138
144
|
|
|
139
145
|
def make_html_report(
|
|
@@ -279,6 +285,8 @@ def prepare_and_show_report(
|
|
|
279
285
|
search_id: str,
|
|
280
286
|
email: Optional[str],
|
|
281
287
|
search_keys: Optional[List[str]] = None,
|
|
288
|
+
display_id: Optional[str] = None,
|
|
289
|
+
display_handle=None,
|
|
282
290
|
):
|
|
283
291
|
if not ipython_available():
|
|
284
292
|
return
|
|
@@ -288,10 +296,12 @@ def prepare_and_show_report(
|
|
|
288
296
|
)
|
|
289
297
|
|
|
290
298
|
if len(relevant_features_df) > 0:
|
|
291
|
-
show_button_download_pdf(report)
|
|
299
|
+
return show_button_download_pdf(report, display_id=display_id, display_handle=display_handle)
|
|
292
300
|
|
|
293
301
|
|
|
294
|
-
def show_button_download_pdf(
|
|
302
|
+
def show_button_download_pdf(
|
|
303
|
+
source: str, title="\U0001F4CA Download PDF report", display_id: Optional[str] = None, display_handle=None
|
|
304
|
+
):
|
|
295
305
|
from IPython.display import HTML, display
|
|
296
306
|
|
|
297
307
|
file_name = f"upgini-report-{uuid.uuid4()}.pdf"
|
|
@@ -303,7 +313,10 @@ def show_button_download_pdf(source: str, title="\U0001F4CA Download PDF report"
|
|
|
303
313
|
payload = b64.decode()
|
|
304
314
|
html = f"""<a download="{file_name}" href="data:application/pdf;base64,{payload}" target="_blank">
|
|
305
315
|
<button>{title}</button></a>"""
|
|
306
|
-
|
|
316
|
+
if display_handle is not None:
|
|
317
|
+
display_handle.update(HTML(html))
|
|
318
|
+
else:
|
|
319
|
+
return display(HTML(html), display_id=display_id)
|
|
307
320
|
|
|
308
321
|
|
|
309
322
|
def show_request_quote_button():
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.2.
|
|
3
|
+
Version: 1.2.17
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -145,7 +145,7 @@ Description-Content-Type: text/markdown
|
|
|
145
145
|
|
|
146
146
|
## 💼 Tutorials
|
|
147
147
|
|
|
148
|
-
### [Search of relevant external features & Automated feature generation for Salary
|
|
148
|
+
### [Search of relevant external features & Automated feature generation for Salary prediction task (use as a template)](https://github.com/upgini/upgini/blob/main/notebooks/Upgini_Features_search%26generation.ipynb)
|
|
149
149
|
|
|
150
150
|
* The goal is to predict salary for data science job postning based on information about employer and job description.
|
|
151
151
|
* Following this guide, you'll learn how to **search & auto generate new relevant features with Upgini library**
|
|
@@ -1,13 +1,13 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=LyCJKEtzC7sS6MlxViknrdz9t79ni5iIOEGUNPPAnwU,23
|
|
2
2
|
upgini/__init__.py,sha256=M64LwQTBa-5Jz24Zm2h8rWwlKQQ1J8nP7gGgIciS0WU,589
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=3Jx6eoGULag64lN8pnwloI-RKwyLlVONrCADxpehwNo,192789
|
|
7
7
|
upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
9
|
upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
|
|
10
|
-
upgini/metrics.py,sha256=
|
|
10
|
+
upgini/metrics.py,sha256=lhLqFv1tLWNzx3ULELo3MMSqI8eBoHL7P5jKpG8a6PE,33899
|
|
11
11
|
upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
|
|
12
12
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
13
13
|
upgini/version_validator.py,sha256=ddSKUK_-eGJB3NgrqOMoWJU-OxQ253WsNLp8aqJkaIM,1389
|
|
@@ -30,7 +30,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
|
30
30
|
upgini/normalizer/normalize_utils.py,sha256=Lv75lq7M46z9cAIutwkdKZtPZkWblgoRzToAJ1BwY8A,7709
|
|
31
31
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
32
32
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
33
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
33
|
+
upgini/resource_bundle/strings.properties,sha256=bWWznzu43Lwfd-j4XDrpKJCpoxMMThd73awB7ge7wfo,27319
|
|
34
34
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
35
35
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
36
36
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
|
@@ -44,7 +44,7 @@ upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDc
|
|
|
44
44
|
upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
|
45
45
|
upgini/utils/datetime_utils.py,sha256=4tsGeehU0KS6wqNsc9gEEWZ9s6T9E0UReUIO3rSuXNU,12174
|
|
46
46
|
upgini/utils/deduplicate_utils.py,sha256=NpaPtBYXwUtfKTRHWrtz2uUq6tZN6C_Nd719ydPRF2Q,8484
|
|
47
|
-
upgini/utils/display_utils.py,sha256=
|
|
47
|
+
upgini/utils/display_utils.py,sha256=NGhki1aGMsS8OeI69eLXEpmS_s41k8ojKHQxacJaXiU,11493
|
|
48
48
|
upgini/utils/email_utils.py,sha256=j0Ug1R_0AnCg1Y92zIZ4XMwvKo3G5_pcOlBN1OH_gZs,5191
|
|
49
49
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
|
50
50
|
upgini/utils/features_validator.py,sha256=yiOdzVtpArELMufzAa9mtWq32lETB6sIF-w3Yvl3vV8,3614
|
|
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
|
|
|
57
57
|
upgini/utils/target_utils.py,sha256=qHzZRmICFbLNCrmVqGkaBcjm91L2ERRZMppci36acV4,10085
|
|
58
58
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
59
59
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
60
|
-
upgini-1.2.
|
|
61
|
-
upgini-1.2.
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.
|
|
60
|
+
upgini-1.2.17.dist-info/METADATA,sha256=g8R9yIZmDZNOFNFMVW-65PTooKnQx6tWMX4Z1Pky-yI,48578
|
|
61
|
+
upgini-1.2.17.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
62
|
+
upgini-1.2.17.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
63
|
+
upgini-1.2.17.dist-info/RECORD,,
|
|
File without changes
|