PyPI - upgini - Versions diffs - 1.2.71a3832.dev8__tar.gz → 1.2.71a3832.dev10__tar.gz - Mend

upgini 1.2.71a3832.dev8 (tar.gz) → 1.2.71a3832.dev10 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (78)

{upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.71a3832.dev8
+Version: 1.2.71a3832.dev10
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/

upgini-1.2.71a3832.dev10/src/upgini/__about__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "1.2.71a3832.dev10"

{upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/features_enricher.py RENAMED Viewed

@@ -1514,6 +1514,8 @@ class FeaturesEnricher(TransformerMixin):
         filtered_enriched_features = self.__filtered_enriched_features(
             importance_threshold,
             max_features,
+            trace_id,
+            validated_X
         )
         filtered_enriched_features = [c for c in filtered_enriched_features if c not in client_features]
@@ -3805,6 +3807,46 @@ if response.status_code == 200:
         return result_features
+    def __get_features_importance_from_server(self, trace_id: str, df: pd.DataFrame):
+        if self._search_task is None:
+            raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
+        features_meta = self._search_task.get_all_features_metadata_v2()
+        if features_meta is None:
+            raise Exception(self.bundle.get("missing_features_meta"))
+        original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
+        df = df.rename(columns=original_names_dict)
+        features_meta.sort(key=lambda m: (-m.shap_value, m.name))
+        importances = {}
+        for feature_meta in features_meta:
+            if feature_meta.name in original_names_dict.keys():
+                feature_meta.name = original_names_dict[feature_meta.name]
+            is_client_feature = feature_meta.name in df.columns
+            if feature_meta.shap_value == 0.0:
+                continue
+            # Use only important features
+            if (
+                feature_meta.name == COUNTRY
+                # In select_features mode we select also from etalon features and need to show them
+                or (not self.fit_select_features and is_client_feature)
+            ):
+                continue
+            # Temporary workaround for duplicate features metadata
+            if feature_meta.name in self.feature_names_:
+                self.logger.warning(f"WARNING: Duplicate feature metadata: {feature_meta}")
+                continue
+            importances[feature_meta.name] = feature_meta.shap_value
+        return importances
     def __prepare_feature_importances(
         self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
     ):
@@ -3990,9 +4032,12 @@ if response.status_code == 200:
         )
     def __filtered_importance_names(
-        self, importance_threshold: Optional[float], max_features: Optional[int]
+        self, importance_threshold: Optional[float], max_features: Optional[int], trace_id: str, df: pd.DataFrame
     ) -> List[str]:
-        if len(self.feature_names_) == 0:
+        # get features importance from server
+        filtered_importances = self.__get_features_importance_from_server(trace_id, df)
+        if len(filtered_importances) == 0:
             return []
         filtered_importances = list(zip(self.feature_names_, self.feature_importances_))
@@ -4212,11 +4257,13 @@ if response.status_code == 200:
         self,
         importance_threshold: Optional[float],
         max_features: Optional[int],
+        trace_id: str,
+        df: pd.DataFrame,
     ) -> List[str]:
         importance_threshold = self.__validate_importance_threshold(importance_threshold)
         max_features = self.__validate_max_features(max_features)
-        return self.__filtered_importance_names(importance_threshold, max_features)
+        return self.__filtered_importance_names(importance_threshold, max_features, trace_id, df)
     def __detect_missing_search_keys(
         self,

{upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/metrics.py RENAMED Viewed

@@ -3,7 +3,6 @@ from __future__ import annotations
 import inspect
 import logging
 import re
-import warnings
 from collections import defaultdict
 from copy import deepcopy
 from dataclasses import dataclass
@@ -119,18 +118,16 @@ LIGHTGBM_REGRESSION_PARAMS = {
 LIGHTGBM_MULTICLASS_PARAMS = {
     "random_state": DEFAULT_RANDOM_STATE,
-    "deterministic": True,
-    "min_gain_to_split": 0.001,
     "n_estimators": 275,
-    "max_depth": 3,
+    "max_depth": 5,
+    "learning_rate": 0.05,
+    "min_gain_to_split": 0.001,
     "max_cat_threshold": 80,
-    "min_data_per_group": 25,
-    "cat_l2": 10,
-    "cat_smooth": 12,
-    "learning_rate": 0.25,  # CatBoost 0.25
-    "min_sum_hessian_in_leaf": 0.01,
-    "class_weight": "balanced",  # TODO pass dict with weights for each class
+    "min_data_per_group": 20,
+    "cat_smooth": 18,
+    "cat_l2" : 8,
     "objective": "multiclass",
+    "class_weight": "balanced",
     "use_quantized_grad": "true",
     "num_grad_quant_bins": "8",
     "stochastic_rounding": "true",
@@ -139,19 +136,17 @@ LIGHTGBM_MULTICLASS_PARAMS = {
 LIGHTGBM_BINARY_PARAMS = {
     "random_state": DEFAULT_RANDOM_STATE,
-    "deterministic": True,
     "min_gain_to_split": 0.001,
     "n_estimators": 275,
     "max_depth": 5,
-    "max_cat_threshold": 80,
-    "min_data_per_group": 25,
-    "cat_l2": 10,
-    "cat_smooth": 12,
     "learning_rate": 0.05,
-    "feature_fraction": 1.0,
-    "min_sum_hessian_in_leaf": 0.01,
     "objective": "binary",
-    "class_weight": "balanced",  # TODO pass dict with weights for each class
+    "class_weight": "balanced",
+    "deterministic": True,
+    "max_cat_threshold": 80,
+    "min_data_per_group": 20,
+    "cat_smooth": 18,
+    "cat_l2" : 8,
     "verbosity": -1,
 }
@@ -759,9 +754,12 @@ class LightGBMWrapper(EstimatorWrapper):
             logger=logger,
         )
         self.cat_features = None
+        self.n_classes = None
     def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
         x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
+        if self.target_type in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]:
+            self.n_classes = len(np.unique(y_numpy))
         if LIGHTGBM_EARLY_STOPPING_ROUNDS is not None:
             params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
         self.cat_features = _get_cat_features(x)
@@ -787,31 +785,40 @@ class LightGBMWrapper(EstimatorWrapper):
     def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
         try:
-            # Suppress specific warning from SHAP for LightGBM binary classifier
-            warnings.filterwarnings(
-                "ignore",
-                message=(
-                    "LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray"
-                ),
+            shap_matrix = estimator.predict(
+                x,
+                predict_disable_shape_check=True,
+                raw_score=True,
+                pred_leaf=False,
+                pred_early_stop=True,
+                pred_contrib=True,
             )
-            from shap import TreeExplainer
-            if not isinstance(estimator, (LGBMRegressor, LGBMClassifier)):
-                return None
-            explainer = TreeExplainer(estimator)
-            shap_values = explainer.shap_values(x)
+            if self.target_type == ModelTaskType.MULTICLASS:
+                n_feat = x.shape[1]
+                shap_matrix.shape = (shap_matrix.shape[0], self.n_classes, n_feat + 1)
+                shap_matrix = np.mean(np.abs(shap_matrix), axis=1)
-            # For classification, shap_values is returned as a list for each class
-            # Take values for the positive class
-            if isinstance(shap_values, list):
-                shap_values = shap_values[1]
+            # exclude base value
+            shap_matrix = shap_matrix[:, :-1]
-            # Calculate mean absolute SHAP value for each feature
             feature_importance = {}
             for i, col in enumerate(x.columns):
-                feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
+                feature_importance[col] = np.mean(np.abs(shap_matrix[:, i]))
+            # # exclude last column (base value)
+            # shap_values_only = shap_values[:, :-1]
+            # mean_abs_shap = np.mean(np.abs(shap_values_only), axis=0)
+            # # For classification, shap_values is returned as a list for each class
+            # # Take values for the positive class
+            # if isinstance(shap_values, list):
+            #     shap_values = shap_values[1]
+            # # Calculate mean absolute SHAP value for each feature
+            # feature_importance = {}
+            # for i, col in enumerate(x.columns):
+            #     feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
             return feature_importance