PyPI - upgini - Versions diffs - 1.2.15__py3-none-any.whl → 1.2.16a1__py3-none-any.whl - Mend

upgini 1.2.15__py3-none-any.whl → 1.2.16a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (10)

upgini/__about__.py +1 -1
upgini/features_enricher.py +37 -17
upgini/metrics.py +54 -4
upgini/normalizer/normalize_utils.py +4 -1
upgini/utils/display_utils.py +2 -2
upgini/utils/features_validator.py +4 -0
{upgini-1.2.15.dist-info → upgini-1.2.16a1.dist-info}/METADATA +1 -1
{upgini-1.2.15.dist-info → upgini-1.2.16a1.dist-info}/RECORD +10 -10
{upgini-1.2.15.dist-info → upgini-1.2.16a1.dist-info}/WHEEL +0 -0
{upgini-1.2.15.dist-info → upgini-1.2.16a1.dist-info}/licenses/LICENSE +0 -0

upgini/__about__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "1.2.15"
1	+ __version__ = "1.2.16a1"

upgini/features_enricher.py CHANGED Viewed

@@ -871,6 +871,13 @@ class FeaturesEnricher(TransformerMixin):
                 else None
             )
+            if self.X is None:
+                self.X = X
+            if self.y is None:
+                self.y = y
+            if self.eval_set is None:
+                self.eval_set = effective_eval_set
             try:
                 self.__log_debug_information(
                     validated_X,
@@ -938,14 +945,14 @@ class FeaturesEnricher(TransformerMixin):
                 gc.collect()
+                if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
+                    print(self.bundle.get("metrics_no_important_free_features"))
+                    self.logger.warning("No client or free relevant ADS features found to calculate metrics")
+                    self.warning_counter.increment()
+                    return None
                 print(self.bundle.get("metrics_start"))
                 with Spinner():
-                    if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
-                        print(self.bundle.get("metrics_no_important_free_features"))
-                        self.logger.warning("No client or free relevant ADS features found to calculate metrics")
-                        self.warning_counter.increment()
-                        return None
                     self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
                     has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
@@ -989,7 +996,7 @@ class FeaturesEnricher(TransformerMixin):
                             text_features=self.generate_features,
                             has_date=has_date,
                         )
-                        etalon_metric = baseline_estimator.cross_val_predict(
+                        etalon_metric, _ = baseline_estimator.cross_val_predict(
                             fitting_X, y_sorted, self.baseline_score_column
                         )
                         if etalon_metric is None:
@@ -1023,7 +1030,11 @@ class FeaturesEnricher(TransformerMixin):
                             text_features=self.generate_features,
                             has_date=has_date,
                         )
-                        enriched_metric = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
+                        enriched_metric, enriched_shaps = enriched_estimator.cross_val_predict(
+                            fitting_enriched_X, enriched_y_sorted
+                        )
+                        print(f"Calculated enriched shaps: {enriched_shaps}")
                         if enriched_metric is None:
                             self.logger.warning(
                                 f"Enriched {metric} on train combined features is None (maybe all features was removed)"
@@ -1156,13 +1167,6 @@ class FeaturesEnricher(TransformerMixin):
                     elif uplift_col in metrics_df.columns and (metrics_df[uplift_col] < 0).any():
                         self.logger.warning("Uplift is negative")
-                    if self.X is None:
-                        self.X = X
-                    if self.y is None:
-                        self.y = y
-                    if self.eval_set is None:
-                        self.eval_set = effective_eval_set
                     return metrics_df
             except Exception as e:
                 error_message = "Failed to calculate metrics" + (
@@ -1187,6 +1191,18 @@ class FeaturesEnricher(TransformerMixin):
             finally:
                 self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
+    def _update_shap_values(self, new_shaps: Dict[str, float]):
+        feature_name_header = self.bundle.get("features_info_name")
+        shap_value_header = self.bundle.get("features_info_shap")
+        def update_shap(row):
+            return new_shaps.get(row[feature_name_header], row[shap_value_header])
+        self.features_info[shap_value_header] = self.features_info.apply(update_shap, axis=1)
+        self.features_info.sort_values(by=shap_value_header, ascending=False, inplace=True)
+        # TODO redraw
     def _check_train_and_eval_target_distribution(self, y, eval_set_dict):
         uneven_distribution = False
         for eval_set in eval_set_dict.values():
@@ -1578,7 +1594,9 @@ class FeaturesEnricher(TransformerMixin):
             generated_features.extend(generator.generated_features)
         normalizer = Normalizer(self.bundle, self.logger, self.warning_counter)
-        df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
+        df, search_keys, generated_features = normalizer.normalize(
+            df, search_keys, generated_features
+        )
         columns_renaming = normalizer.columns_renaming
         df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
@@ -2018,7 +2036,9 @@ class FeaturesEnricher(TransformerMixin):
                 generated_features.extend(generator.generated_features)
             normalizer = Normalizer(self.bundle, self.logger, self.warning_counter, silent_mode)
-            df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
+            df, search_keys, generated_features = normalizer.normalize(
+                df, search_keys, generated_features
+            )
             columns_renaming = normalizer.columns_renaming
             # Don't pass all features in backend on transform

upgini/metrics.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from __future__ import annotations
+from collections import defaultdict
 import inspect
 import logging
 import re
@@ -9,7 +10,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 import catboost
 import numpy as np
 import pandas as pd
-from catboost import CatBoostClassifier, CatBoostRegressor
+from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor, Pool
 from numpy import log1p
 from pandas.api.types import is_numeric_dtype
 from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
@@ -288,9 +289,12 @@ class EstimatorWrapper:
         x, y, _ = self._prepare_data(x, y)
         return x, y, {}
+    def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
+        return None
     def cross_val_predict(
         self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
-    ) -> Optional[float]:
+    ) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
         x, y, groups, fit_params = self._prepare_to_fit(x, y)
         if x.shape[1] == 0:
@@ -298,6 +302,7 @@ class EstimatorWrapper:
         scorer = check_scoring(self.estimator, scoring=self.scorer)
+        shap_values_all_folds = defaultdict(list)
         if baseline_score_column is not None and self.metric_name == "GINI":
             self.logger.info("Calculate baseline GINI on passed baseline_score_column and target")
             metric = roc_auc_score(y, x[baseline_score_column])
@@ -319,7 +324,29 @@ class EstimatorWrapper:
             self.check_fold_metrics(metrics_by_fold)
             metric = np.mean(metrics_by_fold) * self.multiplier
-        return self.post_process_metric(metric)
+            splits = self.cv.split(x, y, groups)
+            for estimator, split in zip(self.cv_estimators, splits):
+                _, validation_idx = split
+                cv_x = x.iloc[validation_idx]
+                cv_y = y[validation_idx]
+                shaps = self.calculate_shap(cv_x, cv_y, estimator)
+                if shaps is not None:
+                    for feature, shap_value in shaps.items():
+                        # shap_values_all_folds[feature] = shap_values_all_folds.get(feature, []) + shap_value.tolist()
+                        shap_values_all_folds[feature].extend(shap_value.tolist())
+        if shap_values_all_folds:
+            average_shap_values = {
+                feature: np.mean(shaps)
+                for feature, shaps
+                in shap_values_all_folds.items()
+            }
+        else:
+            average_shap_values = None
+        return self.post_process_metric(metric), average_shap_values
     def check_fold_metrics(self, metrics_by_fold: List[float]):
         first_metric_sign = 1 if metrics_by_fold[0] >= 0 else -1
@@ -548,7 +575,7 @@ class CatBoostWrapper(EstimatorWrapper):
     def cross_val_predict(
         self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
-    ) -> Optional[float]:
+    ) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
         try:
             return super().cross_val_predict(x, y, baseline_score_column)
         except Exception as e:
@@ -573,6 +600,29 @@ class CatBoostWrapper(EstimatorWrapper):
             else:
                 raise e
+    def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator: CatBoost) -> Optional[Dict[str, float]]:
+        try:
+            # Create Pool for fold data, if need (for example, when categorical features are present)
+            fold_pool = Pool(x, y, cat_features=self.cat_features)
+            # Get SHAP values of current estimator
+            shap_values_fold = estimator.get_feature_importance(data=fold_pool, type="ShapValues")
+            # Remove last columns (base value) and flatten
+            if self.target_type == ModelTaskType.MULTICLASS:
+                all_shaps = shap_values_fold[:, :, :-1]
+                all_shaps = [all_shaps[:, :, k].flatten() for k in range(all_shaps.shape[2])]
+            else:
+                all_shaps = shap_values_fold[:, :-1]
+                all_shaps = [all_shaps[:, k].flatten() for k in range(all_shaps.shape[1])]
+            all_shaps = np.abs(all_shaps)
+            return dict(zip(estimator.feature_names_, all_shaps))
+        except Exception:
+            return None
 class LightGBMWrapper(EstimatorWrapper):
     def __init__(

upgini/normalizer/normalize_utils.py CHANGED Viewed

@@ -49,7 +49,10 @@ class Normalizer:
         self.generated_features = []
     def normalize(
-        self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], generated_features: List[str]
+        self,
+        df: pd.DataFrame,
+        search_keys: Dict[str, SearchKey],
+        generated_features: List[str],
     ) -> Tuple[pd.DataFrame, Dict[str, SearchKey], List[str]]:
         self.search_keys = search_keys.copy()
         self.generated_features = generated_features.copy()

upgini/utils/display_utils.py CHANGED Viewed

@@ -72,7 +72,7 @@ def make_table(df: pd.DataFrame, wrap_long_string=None) -> str:
     )
-def display_html_dataframe(df: pd.DataFrame, internal_df: pd.DataFrame, header: str):
+def display_html_dataframe(df: pd.DataFrame, internal_df: pd.DataFrame, header: str, display_id: str):
     if not ipython_available():
         print(header)
         print(internal_df)
@@ -133,7 +133,7 @@ def display_html_dataframe(df: pd.DataFrame, internal_df: pd.DataFrame, header:
             {table_html}
         </div>
         """
-    display(HTML(result_html))
+    return display(HTML(result_html))
 def make_html_report(

upgini/utils/features_validator.py CHANGED Viewed

@@ -58,6 +58,10 @@ class FeaturesValidator:
         columns_renaming = columns_renaming or {}
+        if features_for_generate:
+            empty_or_constant_features = [
+                f for f in empty_or_constant_features if columns_renaming.get(f, f) not in features_for_generate
+            ]
         if empty_or_constant_features:
             msg = bundle.get("empty_or_contant_features").format(
                 [columns_renaming.get(f, f) for f in empty_or_constant_features]

{upgini-1.2.15.dist-info → upgini-1.2.16a1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.15
+Version: 1.2.16a1
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/

{upgini-1.2.15.dist-info → upgini-1.2.16a1.dist-info}/RECORD RENAMED Viewed

@@ -1,13 +1,13 @@
-upgini/__about__.py,sha256=Q6rDLuL8XHKQggYBtRCtxzpPQJgFYWn4x0gcVlH7H4g,23
+upgini/__about__.py,sha256=vMDC8s3UWLhN6avUSjtfizIVhxWIHW-WKTw04ha19HE,25
 upgini/__init__.py,sha256=M64LwQTBa-5Jz24Zm2h8rWwlKQQ1J8nP7gGgIciS0WU,589
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=vRC7g6n6XQxSrvzXk6NJjP0ZytDQhWR4sTAo4Hp7gmA,188319
+upgini/features_enricher.py,sha256=oEWJjD3v4v_0fZr8ZWSzqFCs08yJrjVTDMNPEFsFL_E,188978
 upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
 upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
 upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
-upgini/metrics.py,sha256=bgi1rc3vCCeCuwRX1doQSQCzaV5OEiYHv_6XIvapnaw,31254
+upgini/metrics.py,sha256=zs_gnjZCdk8AUYOj-mD7V1k-8Gn2EfHcXvK7J6RWOxA,33492
 upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=ddSKUK_-eGJB3NgrqOMoWJU-OxQ253WsNLp8aqJkaIM,1389
@@ -27,7 +27,7 @@ upgini/data_source/data_source_publisher.py,sha256=X-8aGtVgzGmxyXkMVBoBLIGDMb4lY
 upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
 upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
 upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-upgini/normalizer/normalize_utils.py,sha256=Lv75lq7M46z9cAIutwkdKZtPZkWblgoRzToAJ1BwY8A,7709
+upgini/normalizer/normalize_utils.py,sha256=w7S4yQZkdlBptC7peqmrn8zqs-Z0RPq2rp78IZuoE7M,7734
 upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
 upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
 upgini/resource_bundle/strings.properties,sha256=eqJP6bGu12zFuQJqMY03QbMhppcdwIfL2bsJWaqmuZ4,27221
@@ -44,10 +44,10 @@ upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDc
 upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
 upgini/utils/datetime_utils.py,sha256=4tsGeehU0KS6wqNsc9gEEWZ9s6T9E0UReUIO3rSuXNU,12174
 upgini/utils/deduplicate_utils.py,sha256=NpaPtBYXwUtfKTRHWrtz2uUq6tZN6C_Nd719ydPRF2Q,8484
-upgini/utils/display_utils.py,sha256=A2ouB5eiZ-Kyt9ykYxkLQwyoRPrdYeJymwNTiajtFXs,10990
+upgini/utils/display_utils.py,sha256=kOY3lKKbJDIb424TFAF0wQiFUhcARTy2Flz0bQ2M8NY,11014
 upgini/utils/email_utils.py,sha256=j0Ug1R_0AnCg1Y92zIZ4XMwvKo3G5_pcOlBN1OH_gZs,5191
 upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
-upgini/utils/features_validator.py,sha256=yiOdzVtpArELMufzAa9mtWq32lETB6sIF-w3Yvl3vV8,3614
+upgini/utils/features_validator.py,sha256=URNywJnfPVpRKGAK9drJIdyHarGczB298y9QGQwOVGE,3818
 upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
 upgini/utils/ip_utils.py,sha256=Q6vb7Sr5Khx3Sq3eENjW2qCXKej_S5jZbneH6zEOkzQ,5171
 upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
 upgini/utils/target_utils.py,sha256=qHzZRmICFbLNCrmVqGkaBcjm91L2ERRZMppci36acV4,10085
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
-upgini-1.2.15.dist-info/METADATA,sha256=Hua2FUNftyzzpi9eR090MFJ-5F8S_KS_5SrZhwOUgco,48577
-upgini-1.2.15.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-upgini-1.2.15.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.15.dist-info/RECORD,,
+upgini-1.2.16a1.dist-info/METADATA,sha256=0w4SeT93Uz51cWP6Y0uHw0Eh2iMVqkUIOjlaaD_Jduw,48579
+upgini-1.2.16a1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.16a1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.16a1.dist-info/RECORD,,

{upgini-1.2.15.dist-info → upgini-1.2.16a1.dist-info}/WHEEL RENAMED Viewed

File without changes

{upgini-1.2.15.dist-info → upgini-1.2.16a1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

upgini 1.2.15__py3-none-any.whl → 1.2.16a1__py3-none-any.whl

Potentially problematic release.

upgini 1.2.15__py3-none-any.whl → 1.2.16a1__py3-none-any.whl