PyPI - upgini - Versions diffs - 1.2.71a3832.dev9__py3-none-any.whl → 1.2.71a3832.dev11__py3-none-any.whl - Mend

upgini 1.2.71a3832.dev9__py3-none-any.whl → 1.2.71a3832.dev11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

upgini/__about__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "1.2.71a3832.dev9"
1	+ __version__ = "1.2.71a3832.dev11"

upgini/features_enricher.py CHANGED Viewed

@@ -1512,8 +1512,7 @@ class FeaturesEnricher(TransformerMixin):
         self.logger.info(f"Client features column on prepare data for metrics: {client_features}")
         filtered_enriched_features = self.__filtered_enriched_features(
-            importance_threshold,
-            max_features,
+            importance_threshold, max_features, trace_id, validated_X
         )
         filtered_enriched_features = [c for c in filtered_enriched_features if c not in client_features]
@@ -2541,7 +2540,9 @@ if response.status_code == 200:
                 for c in itertools.chain(validated_Xy.columns.tolist(), generated_features)
                 if c not in self.dropped_client_feature_names_
             ]
-            filtered_columns = self.__filtered_enriched_features(importance_threshold, max_features)
+            filtered_columns = self.__filtered_enriched_features(
+                importance_threshold, max_features, trace_id, validated_X
+            )
             selecting_columns.extend(
                 c for c in filtered_columns if c in result.columns and c not in validated_X.columns
             )
@@ -3805,6 +3806,46 @@ if response.status_code == 200:
         return result_features
+    def __get_features_importance_from_server(self, trace_id: str, df: pd.DataFrame):
+        if self._search_task is None:
+            raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
+        features_meta = self._search_task.get_all_features_metadata_v2()
+        if features_meta is None:
+            raise Exception(self.bundle.get("missing_features_meta"))
+        original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
+        df = df.rename(columns=original_names_dict)
+        features_meta.sort(key=lambda m: (-m.shap_value, m.name))
+        importances = {}
+        for feature_meta in features_meta:
+            if feature_meta.name in original_names_dict.keys():
+                feature_meta.name = original_names_dict[feature_meta.name]
+            is_client_feature = feature_meta.name in df.columns
+            if feature_meta.shap_value == 0.0:
+                continue
+            # Use only important features
+            if (
+                feature_meta.name == COUNTRY
+                # In select_features mode we select also from etalon features and need to show them
+                or (not self.fit_select_features and is_client_feature)
+            ):
+                continue
+            # Temporary workaround for duplicate features metadata
+            if feature_meta.name in importances:
+                self.logger.warning(f"WARNING: Duplicate feature metadata: {feature_meta}")
+                continue
+            importances[feature_meta.name] = feature_meta.shap_value
+        return importances
     def __prepare_feature_importances(
         self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
     ):
@@ -3990,9 +4031,12 @@ if response.status_code == 200:
         )
     def __filtered_importance_names(
-        self, importance_threshold: Optional[float], max_features: Optional[int]
+        self, importance_threshold: Optional[float], max_features: Optional[int], trace_id: str, df: pd.DataFrame
     ) -> List[str]:
-        if len(self.feature_names_) == 0:
+        # get features importance from server
+        filtered_importances = self.__get_features_importance_from_server(trace_id, df)
+        if len(filtered_importances) == 0:
             return []
         filtered_importances = list(zip(self.feature_names_, self.feature_importances_))
@@ -4212,11 +4256,13 @@ if response.status_code == 200:
         self,
         importance_threshold: Optional[float],
         max_features: Optional[int],
+        trace_id: str,
+        df: pd.DataFrame,
     ) -> List[str]:
         importance_threshold = self.__validate_importance_threshold(importance_threshold)
         max_features = self.__validate_max_features(max_features)
-        return self.__filtered_importance_names(importance_threshold, max_features)
+        return self.__filtered_importance_names(importance_threshold, max_features, trace_id, df)
     def __detect_missing_search_keys(
         self,

upgini/metrics.py CHANGED Viewed

@@ -3,7 +3,6 @@ from __future__ import annotations
 import inspect
 import logging
 import re
-import warnings
 from collections import defaultdict
 from copy import deepcopy
 from dataclasses import dataclass
@@ -755,9 +754,12 @@ class LightGBMWrapper(EstimatorWrapper):
             logger=logger,
         )
         self.cat_features = None
+        self.n_classes = None
     def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
         x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
+        if self.target_type in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]:
+            self.n_classes = len(np.unique(y_numpy))
         if LIGHTGBM_EARLY_STOPPING_ROUNDS is not None:
             params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
         self.cat_features = _get_cat_features(x)
@@ -783,31 +785,40 @@ class LightGBMWrapper(EstimatorWrapper):
     def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
         try:
-            # Suppress specific warning from SHAP for LightGBM binary classifier
-            warnings.filterwarnings(
-                "ignore",
-                message=(
-                    "LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray"
-                ),
+            shap_matrix = estimator.predict(
+                x,
+                predict_disable_shape_check=True,
+                raw_score=True,
+                pred_leaf=False,
+                pred_early_stop=True,
+                pred_contrib=True,
             )
-            from shap import TreeExplainer
-            if not isinstance(estimator, (LGBMRegressor, LGBMClassifier)):
-                return None
-            explainer = TreeExplainer(estimator)
-            shap_values = explainer.shap_values(x)
+            if self.target_type == ModelTaskType.MULTICLASS:
+                n_feat = x.shape[1]
+                shap_matrix.shape = (shap_matrix.shape[0], self.n_classes, n_feat + 1)
+                shap_matrix = np.mean(np.abs(shap_matrix), axis=1)
-            # For classification, shap_values is returned as a list for each class
-            # Take values for the positive class
-            if isinstance(shap_values, list):
-                shap_values = shap_values[1]
+            # exclude base value
+            shap_matrix = shap_matrix[:, :-1]
-            # Calculate mean absolute SHAP value for each feature
             feature_importance = {}
             for i, col in enumerate(x.columns):
-                feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
+                feature_importance[col] = np.mean(np.abs(shap_matrix[:, i]))
+            # # exclude last column (base value)
+            # shap_values_only = shap_values[:, :-1]
+            # mean_abs_shap = np.mean(np.abs(shap_values_only), axis=0)
+            # # For classification, shap_values is returned as a list for each class
+            # # Take values for the positive class
+            # if isinstance(shap_values, list):
+            #     shap_values = shap_values[1]
+            # # Calculate mean absolute SHAP value for each feature
+            # feature_importance = {}
+            # for i, col in enumerate(x.columns):
+            #     feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
             return feature_importance

{upgini-1.2.71a3832.dev9.dist-info → upgini-1.2.71a3832.dev11.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.71a3832.dev9
+Version: 1.2.71a3832.dev11
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/

{upgini-1.2.71a3832.dev9.dist-info → upgini-1.2.71a3832.dev11.dist-info}/RECORD RENAMED Viewed

@@ -1,12 +1,12 @@
-upgini/__about__.py,sha256=9j3uTsEOnjsLQKejDMGIw2xcYlikwx_zlJRLURdTIkE,33
+upgini/__about__.py,sha256=MPYFg9v0SOhqTxe0IfYh4m6Nh3TlmyfHR9sua58WXBM,34
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=Z6RSjqcqneGwWflsq1Q5rjf83awPNYqKpAgHRh7jils,204680
+upgini/features_enricher.py,sha256=oYOBaHIyPjm-EEZvJT9pU35_DW8bArEQKymZyhW8LbE,206592
 upgini/http.py,sha256=RvzcShpDXssLs6ycGN8xilkKi8ZV9XGUrrk8bwdUzbw,43607
 upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
-upgini/metrics.py,sha256=PlyjxL82AtUWgbCWRv_0r4NiPwmflO5ItIPlKb4nxcs,38293
+upgini/metrics.py,sha256=9AaQi7Yb22ZNnycUOAUpcP7TWF5Pfy_NGACcDj10aMs,38820
 upgini/search_task.py,sha256=EuCGp0iCWz2fpuJgN6M47aP_CtIi3Oq9zw78w0mkKiU,17595
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.71a3832.dev9.dist-info/METADATA,sha256=MMv6QMrMxHQ50ISC1VoKtdK00hxtsgnz-IuuHuKUh7M,49101
-upgini-1.2.71a3832.dev9.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-upgini-1.2.71a3832.dev9.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.71a3832.dev9.dist-info/RECORD,,
+upgini-1.2.71a3832.dev11.dist-info/METADATA,sha256=QuI4m49RjcWmDJ74fXMWfNqBKPXGKDsKGhhO_wR1Kfw,49102
+upgini-1.2.71a3832.dev11.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.71a3832.dev11.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.71a3832.dev11.dist-info/RECORD,,

{upgini-1.2.71a3832.dev9.dist-info → upgini-1.2.71a3832.dev11.dist-info}/WHEEL RENAMED Viewed

File without changes

{upgini-1.2.71a3832.dev9.dist-info → upgini-1.2.71a3832.dev11.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

upgini 1.2.71a3832.dev9__py3-none-any.whl → 1.2.71a3832.dev11__py3-none-any.whl

upgini 1.2.71a3832.dev9__py3-none-any.whl → 1.2.71a3832.dev11__py3-none-any.whl