upgini 1.2.16a1__tar.gz → 1.2.16a3654.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only. It reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/PKG-INFO +1 -1
  2. upgini-1.2.16a3654.dev2/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/features_enricher.py +22 -39
  4. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/metrics.py +6 -56
  5. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/normalizer/normalize_utils.py +1 -4
  6. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/display_utils.py +2 -2
  7. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/features_validator.py +0 -4
  8. upgini-1.2.16a1/src/upgini/__about__.py +0 -1
  9. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/.gitignore +0 -0
  10. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/LICENSE +0 -0
  11. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/README.md +0 -0
  12. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/pyproject.toml +0 -0
  13. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/__init__.py +0 -0
  14. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/ads.py +0 -0
  15. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/ads_management/__init__.py +0 -0
  16. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/ads_management/ads_manager.py +0 -0
  17. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/autofe/__init__.py +0 -0
  18. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/autofe/all_operands.py +0 -0
  19. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/autofe/binary.py +0 -0
  20. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/autofe/date.py +0 -0
  21. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/autofe/feature.py +0 -0
  22. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/autofe/groupby.py +0 -0
  23. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/autofe/operand.py +0 -0
  24. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/autofe/unary.py +0 -0
  25. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/autofe/vector.py +0 -0
  26. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/data_source/__init__.py +0 -0
  27. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/data_source/data_source_publisher.py +0 -0
  28. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/dataset.py +0 -0
  29. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/errors.py +0 -0
  30. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/http.py +0 -0
  31. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/lazy_import.py +0 -0
  32. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/mdc/__init__.py +0 -0
  33. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/mdc/context.py +0 -0
  34. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/metadata.py +0 -0
  35. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/normalizer/__init__.py +0 -0
  36. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/resource_bundle/__init__.py +0 -0
  37. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/resource_bundle/exceptions.py +0 -0
  38. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/resource_bundle/strings.properties +0 -0
  39. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  40. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/sampler/__init__.py +0 -0
  41. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/sampler/base.py +0 -0
  42. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/sampler/random_under_sampler.py +0 -0
  43. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/sampler/utils.py +0 -0
  44. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/search_task.py +0 -0
  45. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/spinner.py +0 -0
  46. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/__init__.py +0 -0
  47. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/base_search_key_detector.py +0 -0
  48. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/blocked_time_series.py +0 -0
  49. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/country_utils.py +0 -0
  50. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/custom_loss_utils.py +0 -0
  51. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/cv_utils.py +0 -0
  52. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/datetime_utils.py +0 -0
  53. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/deduplicate_utils.py +0 -0
  54. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/email_utils.py +0 -0
  55. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/fallback_progress_bar.py +0 -0
  56. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/format.py +0 -0
  57. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/ip_utils.py +0 -0
  58. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/phone_utils.py +0 -0
  59. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/postal_code_utils.py +0 -0
  60. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/progress_bar.py +0 -0
  61. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/sklearn_ext.py +0 -0
  62. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/target_utils.py +0 -0
  63. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/track_info.py +0 -0
  64. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/warning_counter.py +0 -0
  65. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.16a1
3
+ Version: 1.2.16a3654.dev2
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.16a3654.dev2"
@@ -165,6 +165,7 @@ class FeaturesEnricher(TransformerMixin):
165
165
  RANDOM_STATE = 42
166
166
  CALCULATE_METRICS_THRESHOLD = 50_000_000
167
167
  CALCULATE_METRICS_MIN_THRESHOLD = 500
168
+ TEXT_FEATURES_THRESHOLD = 5_000
168
169
  GENERATE_FEATURES_LIMIT = 10
169
170
  EMPTY_FEATURES_INFO = pd.DataFrame(
170
171
  columns=[
@@ -871,13 +872,6 @@ class FeaturesEnricher(TransformerMixin):
871
872
  else None
872
873
  )
873
874
 
874
- if self.X is None:
875
- self.X = X
876
- if self.y is None:
877
- self.y = y
878
- if self.eval_set is None:
879
- self.eval_set = effective_eval_set
880
-
881
875
  try:
882
876
  self.__log_debug_information(
883
877
  validated_X,
@@ -945,14 +939,16 @@ class FeaturesEnricher(TransformerMixin):
945
939
 
946
940
  gc.collect()
947
941
 
948
- if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
949
- print(self.bundle.get("metrics_no_important_free_features"))
950
- self.logger.warning("No client or free relevant ADS features found to calculate metrics")
951
- self.warning_counter.increment()
952
- return None
942
+ text_features = self.generate_features if fitting_X.shape[0] >= self.TEXT_FEATURES_THRESHOLD else []
953
943
 
954
944
  print(self.bundle.get("metrics_start"))
955
945
  with Spinner():
946
+ if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
947
+ print(self.bundle.get("metrics_no_important_free_features"))
948
+ self.logger.warning("No client or free relevant ADS features found to calculate metrics")
949
+ self.warning_counter.increment()
950
+ return None
951
+
956
952
  self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
957
953
 
958
954
  has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
@@ -966,7 +962,7 @@ class FeaturesEnricher(TransformerMixin):
966
962
  fitting_enriched_X,
967
963
  scoring,
968
964
  groups=groups,
969
- text_features=self.generate_features,
965
+ text_features=text_features,
970
966
  has_date=has_date,
971
967
  )
972
968
  metric = wrapper.metric_name
@@ -993,10 +989,10 @@ class FeaturesEnricher(TransformerMixin):
993
989
  cat_features,
994
990
  add_params=custom_loss_add_params,
995
991
  groups=groups,
996
- text_features=self.generate_features,
992
+ text_features=text_features,
997
993
  has_date=has_date,
998
994
  )
999
- etalon_metric, _ = baseline_estimator.cross_val_predict(
995
+ etalon_metric = baseline_estimator.cross_val_predict(
1000
996
  fitting_X, y_sorted, self.baseline_score_column
1001
997
  )
1002
998
  if etalon_metric is None:
@@ -1027,14 +1023,10 @@ class FeaturesEnricher(TransformerMixin):
1027
1023
  cat_features,
1028
1024
  add_params=custom_loss_add_params,
1029
1025
  groups=groups,
1030
- text_features=self.generate_features,
1026
+ text_features=text_features,
1031
1027
  has_date=has_date,
1032
1028
  )
1033
- enriched_metric, enriched_shaps = enriched_estimator.cross_val_predict(
1034
- fitting_enriched_X, enriched_y_sorted
1035
- )
1036
-
1037
- print(f"Calculated enriched shaps: {enriched_shaps}")
1029
+ enriched_metric = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
1038
1030
  if enriched_metric is None:
1039
1031
  self.logger.warning(
1040
1032
  f"Enriched {metric} on train combined features is None (maybe all features was removed)"
@@ -1167,6 +1159,13 @@ class FeaturesEnricher(TransformerMixin):
1167
1159
  elif uplift_col in metrics_df.columns and (metrics_df[uplift_col] < 0).any():
1168
1160
  self.logger.warning("Uplift is negative")
1169
1161
 
1162
+ if self.X is None:
1163
+ self.X = X
1164
+ if self.y is None:
1165
+ self.y = y
1166
+ if self.eval_set is None:
1167
+ self.eval_set = effective_eval_set
1168
+
1170
1169
  return metrics_df
1171
1170
  except Exception as e:
1172
1171
  error_message = "Failed to calculate metrics" + (
@@ -1191,18 +1190,6 @@ class FeaturesEnricher(TransformerMixin):
1191
1190
  finally:
1192
1191
  self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
1193
1192
 
1194
- def _update_shap_values(self, new_shaps: Dict[str, float]):
1195
- feature_name_header = self.bundle.get("features_info_name")
1196
- shap_value_header = self.bundle.get("features_info_shap")
1197
-
1198
- def update_shap(row):
1199
- return new_shaps.get(row[feature_name_header], row[shap_value_header])
1200
-
1201
- self.features_info[shap_value_header] = self.features_info.apply(update_shap, axis=1)
1202
- self.features_info.sort_values(by=shap_value_header, ascending=False, inplace=True)
1203
-
1204
- # TODO redraw
1205
-
1206
1193
  def _check_train_and_eval_target_distribution(self, y, eval_set_dict):
1207
1194
  uneven_distribution = False
1208
1195
  for eval_set in eval_set_dict.values():
@@ -1594,9 +1581,7 @@ class FeaturesEnricher(TransformerMixin):
1594
1581
  generated_features.extend(generator.generated_features)
1595
1582
 
1596
1583
  normalizer = Normalizer(self.bundle, self.logger, self.warning_counter)
1597
- df, search_keys, generated_features = normalizer.normalize(
1598
- df, search_keys, generated_features
1599
- )
1584
+ df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
1600
1585
  columns_renaming = normalizer.columns_renaming
1601
1586
 
1602
1587
  df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
@@ -2036,9 +2021,7 @@ class FeaturesEnricher(TransformerMixin):
2036
2021
  generated_features.extend(generator.generated_features)
2037
2022
 
2038
2023
  normalizer = Normalizer(self.bundle, self.logger, self.warning_counter, silent_mode)
2039
- df, search_keys, generated_features = normalizer.normalize(
2040
- df, search_keys, generated_features
2041
- )
2024
+ df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
2042
2025
  columns_renaming = normalizer.columns_renaming
2043
2026
 
2044
2027
  # Don't pass all features in backend on transform
@@ -1,6 +1,5 @@
1
1
  from __future__ import annotations
2
2
 
3
- from collections import defaultdict
4
3
  import inspect
5
4
  import logging
6
5
  import re
@@ -10,7 +9,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
10
9
  import catboost
11
10
  import numpy as np
12
11
  import pandas as pd
13
- from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor, Pool
12
+ from catboost import CatBoostClassifier, CatBoostRegressor
14
13
  from numpy import log1p
15
14
  from pandas.api.types import is_numeric_dtype
16
15
  from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
@@ -64,7 +63,7 @@ CATBOOST_BINARY_PARAMS = {
64
63
  "verbose": False,
65
64
  "random_state": DEFAULT_RANDOM_STATE,
66
65
  "allow_writing_files": False,
67
- "auto_class_weights": "Balanced",
66
+ "auto_class_weights": "SqrtBalanced",
68
67
  }
69
68
 
70
69
  CATBOOST_MULTICLASS_PARAMS = {
@@ -82,7 +81,7 @@ CATBOOST_MULTICLASS_PARAMS = {
82
81
  "verbose": False,
83
82
  "random_state": DEFAULT_RANDOM_STATE,
84
83
  "allow_writing_files": False,
85
- "auto_class_weights": "Balanced",
84
+ "auto_class_weights": "SqrtBalanced",
86
85
  }
87
86
 
88
87
  LIGHTGBM_PARAMS = {
@@ -289,12 +288,9 @@ class EstimatorWrapper:
289
288
  x, y, _ = self._prepare_data(x, y)
290
289
  return x, y, {}
291
290
 
292
- def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
293
- return None
294
-
295
291
  def cross_val_predict(
296
292
  self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
297
- ) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
293
+ ) -> Optional[float]:
298
294
  x, y, groups, fit_params = self._prepare_to_fit(x, y)
299
295
 
300
296
  if x.shape[1] == 0:
@@ -302,7 +298,6 @@ class EstimatorWrapper:
302
298
 
303
299
  scorer = check_scoring(self.estimator, scoring=self.scorer)
304
300
 
305
- shap_values_all_folds = defaultdict(list)
306
301
  if baseline_score_column is not None and self.metric_name == "GINI":
307
302
  self.logger.info("Calculate baseline GINI on passed baseline_score_column and target")
308
303
  metric = roc_auc_score(y, x[baseline_score_column])
@@ -324,29 +319,7 @@ class EstimatorWrapper:
324
319
  self.check_fold_metrics(metrics_by_fold)
325
320
 
326
321
  metric = np.mean(metrics_by_fold) * self.multiplier
327
-
328
- splits = self.cv.split(x, y, groups)
329
-
330
- for estimator, split in zip(self.cv_estimators, splits):
331
- _, validation_idx = split
332
- cv_x = x.iloc[validation_idx]
333
- cv_y = y[validation_idx]
334
- shaps = self.calculate_shap(cv_x, cv_y, estimator)
335
- if shaps is not None:
336
- for feature, shap_value in shaps.items():
337
- # shap_values_all_folds[feature] = shap_values_all_folds.get(feature, []) + shap_value.tolist()
338
- shap_values_all_folds[feature].extend(shap_value.tolist())
339
-
340
- if shap_values_all_folds:
341
- average_shap_values = {
342
- feature: np.mean(shaps)
343
- for feature, shaps
344
- in shap_values_all_folds.items()
345
- }
346
- else:
347
- average_shap_values = None
348
-
349
- return self.post_process_metric(metric), average_shap_values
322
+ return self.post_process_metric(metric)
350
323
 
351
324
  def check_fold_metrics(self, metrics_by_fold: List[float]):
352
325
  first_metric_sign = 1 if metrics_by_fold[0] >= 0 else -1
@@ -575,7 +548,7 @@ class CatBoostWrapper(EstimatorWrapper):
575
548
 
576
549
  def cross_val_predict(
577
550
  self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
578
- ) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
551
+ ) -> Optional[float]:
579
552
  try:
580
553
  return super().cross_val_predict(x, y, baseline_score_column)
581
554
  except Exception as e:
@@ -600,29 +573,6 @@ class CatBoostWrapper(EstimatorWrapper):
600
573
  else:
601
574
  raise e
602
575
 
603
- def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator: CatBoost) -> Optional[Dict[str, float]]:
604
- try:
605
- # Create Pool for fold data, if need (for example, when categorical features are present)
606
- fold_pool = Pool(x, y, cat_features=self.cat_features)
607
-
608
- # Get SHAP values of current estimator
609
- shap_values_fold = estimator.get_feature_importance(data=fold_pool, type="ShapValues")
610
-
611
- # Remove last columns (base value) and flatten
612
- if self.target_type == ModelTaskType.MULTICLASS:
613
- all_shaps = shap_values_fold[:, :, :-1]
614
- all_shaps = [all_shaps[:, :, k].flatten() for k in range(all_shaps.shape[2])]
615
- else:
616
- all_shaps = shap_values_fold[:, :-1]
617
- all_shaps = [all_shaps[:, k].flatten() for k in range(all_shaps.shape[1])]
618
-
619
- all_shaps = np.abs(all_shaps)
620
-
621
- return dict(zip(estimator.feature_names_, all_shaps))
622
-
623
- except Exception:
624
- return None
625
-
626
576
 
627
577
  class LightGBMWrapper(EstimatorWrapper):
628
578
  def __init__(
@@ -49,10 +49,7 @@ class Normalizer:
49
49
  self.generated_features = []
50
50
 
51
51
  def normalize(
52
- self,
53
- df: pd.DataFrame,
54
- search_keys: Dict[str, SearchKey],
55
- generated_features: List[str],
52
+ self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], generated_features: List[str]
56
53
  ) -> Tuple[pd.DataFrame, Dict[str, SearchKey], List[str]]:
57
54
  self.search_keys = search_keys.copy()
58
55
  self.generated_features = generated_features.copy()
@@ -72,7 +72,7 @@ def make_table(df: pd.DataFrame, wrap_long_string=None) -> str:
72
72
  )
73
73
 
74
74
 
75
- def display_html_dataframe(df: pd.DataFrame, internal_df: pd.DataFrame, header: str, display_id: str):
75
+ def display_html_dataframe(df: pd.DataFrame, internal_df: pd.DataFrame, header: str):
76
76
  if not ipython_available():
77
77
  print(header)
78
78
  print(internal_df)
@@ -133,7 +133,7 @@ def display_html_dataframe(df: pd.DataFrame, internal_df: pd.DataFrame, header:
133
133
  {table_html}
134
134
  </div>
135
135
  """
136
- return display(HTML(result_html))
136
+ display(HTML(result_html))
137
137
 
138
138
 
139
139
  def make_html_report(
@@ -58,10 +58,6 @@ class FeaturesValidator:
58
58
 
59
59
  columns_renaming = columns_renaming or {}
60
60
 
61
- if features_for_generate:
62
- empty_or_constant_features = [
63
- f for f in empty_or_constant_features if columns_renaming.get(f, f) not in features_for_generate
64
- ]
65
61
  if empty_or_constant_features:
66
62
  msg = bundle.get("empty_or_contant_features").format(
67
63
  [columns_renaming.get(f, f) for f in empty_or_constant_features]
@@ -1 +0,0 @@
1
- __version__ = "1.2.16a1"
File without changes
File without changes
File without changes