upgini 1.2.81a3832.dev10__py3-none-any.whl → 1.2.81a3832.dev12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of upgini might be problematic.

upgini/__about__.py CHANGED
@@ -1 +1 @@
-__version__ = "1.2.81a3832.dev10"
+__version__ = "1.2.81a3832.dev12"
upgini/features_enricher.py CHANGED
@@ -1017,6 +1017,12 @@ class FeaturesEnricher(TransformerMixin):
         else:
             client_cat_features = []
 
+        # rename baseline_score_column
+        reversed_renaming = {v: k for k, v in columns_renaming.items()}
+        baseline_score_column = self.baseline_score_column
+        if baseline_score_column is not None:
+            baseline_score_column = reversed_renaming[baseline_score_column]
+
         gc.collect()
 
         if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
@@ -1069,7 +1075,7 @@ class FeaturesEnricher(TransformerMixin):
                 has_date=has_date,
             )
             etalon_cv_result = baseline_estimator.cross_val_predict(
-                fitting_X, y_sorted, self.baseline_score_column
+                fitting_X, y_sorted, baseline_score_column
             )
             etalon_metric = etalon_cv_result.get_display_metric()
             if etalon_metric is None:
@@ -1165,7 +1171,7 @@ class FeaturesEnricher(TransformerMixin):
                         f"on client features: {eval_X_sorted.columns.to_list()}"
                     )
                     etalon_eval_results = baseline_estimator.calculate_metric(
-                        eval_X_sorted, eval_y_sorted, self.baseline_score_column
+                        eval_X_sorted, eval_y_sorted, baseline_score_column
                    )
                     etalon_eval_metric = etalon_eval_results.get_display_metric()
                     self.logger.info(
@@ -1959,6 +1965,14 @@ class FeaturesEnricher(TransformerMixin):
             enriched_df, x_columns, enriched_X.columns.tolist(), len(eval_set) if has_eval_set else 0
         )
 
+        # Add hash-suffixes because output of transform has original names
+        reversed_renaming = {v: k for k, v in columns_renaming.items()}
+        X_sampled.rename(columns=reversed_renaming, inplace=True)
+        enriched_X.rename(columns=reversed_renaming, inplace=True)
+        for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
+            eval_X_sampled.rename(columns=reversed_renaming, inplace=True)
+            enriched_eval_X.rename(columns=reversed_renaming, inplace=True)
+
         # Cache and return results
         datasets_hash = hash_input(validated_X, validated_y, eval_set)
         return self.__cache_and_return_results(
@@ -4245,7 +4259,7 @@ if response.status_code == 200:
     def __show_selected_features(self, search_keys: Dict[str, SearchKey]):
         search_key_names = [col for col, tpe in search_keys.items() if tpe != SearchKey.CUSTOM_KEY]
         if self.fit_columns_renaming:
-            search_key_names = [self.fit_columns_renaming.get(col, col) for col in search_key_names]
+            search_key_names = sorted(set([self.fit_columns_renaming.get(col, col) for col in search_key_names]))
         msg = self.bundle.get("features_info_header").format(len(self.feature_names_), search_key_names)
 
         try:
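
Note on the features_enricher.py changes above: several of them rely on inverting columns_renaming (which maps internal, hash-suffixed column names back to the original client names) so that values given in original names, such as baseline_score_column, can be looked up in the internally renamed frames. The following is a minimal standalone sketch of that pattern with made-up column names, not the enricher's real internals:

    # Sketch only: hypothetical column names, not the upgini internals.
    columns_renaming = {"age_ab12cd": "age", "score_ef34gh": "score"}  # internal -> original
    reversed_renaming = {v: k for k, v in columns_renaming.items()}    # original -> internal

    baseline_score_column = "score"  # user-facing (original) name
    if baseline_score_column is not None:
        baseline_score_column = reversed_renaming[baseline_score_column]
    print(baseline_score_column)  # "score_ef34gh", the name present in the renamed frame

The inversion assumes the renaming is one-to-one; duplicate original names would silently collapse to a single key.
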
upgini/metrics.py CHANGED
@@ -15,7 +15,7 @@ from catboost import CatBoostClassifier, CatBoostRegressor
 from category_encoders.cat_boost import CatBoostEncoder
 from lightgbm import LGBMClassifier, LGBMRegressor
 from numpy import log1p
-from pandas.api.types import is_numeric_dtype
+from pandas.api.types import is_numeric_dtype, is_integer_dtype, is_float_dtype
 from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
 
 from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
@@ -324,6 +324,9 @@ class EstimatorWrapper:
         self.text_features = text_features
         self.logger = logger or logging.getLogger()
         self.droped_features = []
+        self.converted_to_int = []
+        self.converted_to_str = []
+        self.converted_to_numeric = []
 
     def fit(self, x: pd.DataFrame, y: np.ndarray, **kwargs):
         x, y, _, fit_params = self._prepare_to_fit(x, y)
@@ -335,44 +338,6 @@ class EstimatorWrapper:
         x, _, _ = self._prepare_to_calculate(x, None)
         return self.estimator.predict(x, **kwargs)
 
-    def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
-        x, y, groups = self._prepare_data(x, y, groups=self.groups)
-
-        self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
-        self.droped_features = []
-        for c in x.columns:
-            if _get_unique_count(x[c]) < 2:
-                self.logger.warning(f"Remove feature {c} because it has less than 2 unique values")
-                self.droped_features.append(c)
-                if c in self.cat_features:
-                    self.cat_features.remove(c)
-                x.drop(columns=[c], inplace=True)
-            elif c in self.cat_features:
-                if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
-                    x[c] = x[c].astype(np.int64)
-                elif is_numeric_object(x[c]):
-                    self.logger.warning(
-                        f"Convert numeric feature {c} of type {x[c].dtype} to numeric and remove from cat_features"
-                    )
-                    x[c] = pd.to_numeric(x[c], errors="coerce")
-                    self.cat_features.remove(c)
-                elif x[c].dtype != "category":
-                    x[c] = x[c].astype(str)
-            elif self.text_features is not None and c in self.text_features:
-                x[c] = x[c].astype(str)
-            else:
-                if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
-                    x[c] = x[c].astype(np.int64)
-                elif not is_valid_numeric_array_data(x[c]):
-                    try:
-                        x[c] = pd.to_numeric(x[c], errors="raise")
-                    except (ValueError, TypeError):
-                        self.logger.warning(f"Remove feature {c} because it is not numeric and not in cat_features")
-                        self.droped_features.append(c)
-                        x.drop(columns=[c], inplace=True)
-
-        return x, y, groups, {}
-
     def _prepare_data(
         self, x: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
     ) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
@@ -403,26 +368,82 @@ class EstimatorWrapper:
 
         return x, y
 
-    def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
-        x, y, _ = self._prepare_data(x, y)
-
-        if self.droped_features:
-            self.logger.warning(f"Dropped features: {self.droped_features}")
-            x = x.drop(columns=self.droped_features)
+    def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
+        x, y, groups = self._prepare_data(x, y, groups=self.groups)
 
+        self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
+        self.droped_features = []
+        self.converted_to_int = []
+        self.converted_to_str = []
+        self.converted_to_numeric = []
         for c in x.columns:
-            if c in self.cat_features:
+            if _get_unique_count(x[c]) < 2:
+                self.logger.warning(f"Remove feature {c} because it has less than 2 unique values")
+                if c in self.cat_features:
+                    self.cat_features.remove(c)
+                x.drop(columns=[c], inplace=True)
+                self.droped_features.append(c)
+            elif self.text_features is not None and c in self.text_features:
+                x[c] = x[c].astype(str)
+                self.converted_to_str.append(c)
+            elif c in self.cat_features:
                 if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
                     x[c] = x[c].astype(np.int64)
-                elif x[c].dtype != "category":
+                    self.converted_to_int.append(c)
+                elif x[c].dtype == "category" and is_integer_dtype(x[c].cat.categories):
+                    self.logger.info(
+                        f"Convert categorical feature {c} with integer categories"
+                        " to int64 and remove from cat_features"
+                    )
+                    x[c] = x[c].astype(np.int64)
+                    self.converted_to_int.append(c)
+                    self.cat_features.remove(c)
+                elif is_float_dtype(x[c]) or (x[c].dtype == "category" and is_float_dtype(x[c].cat.categories)):
+                    self.logger.info(
+                        f"Convert float cat feature {c} to string"
+                    )
                     x[c] = x[c].astype(str)
-            elif self.text_features is not None and c in self.text_features:
-                x[c] = x[c].astype(str)
+                    self.converted_to_str.append(c)
+                elif x[c].dtype not in ["category", "int64"]:
+                    x[c] = x[c].astype(str)
+                    self.converted_to_str.append(c)
             else:
                 if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
+                    self.logger.info(f"Convert bool feature {c} to int64")
                     x[c] = x[c].astype(np.int64)
+                    self.converted_to_int.append(c)
                 elif not is_valid_numeric_array_data(x[c]):
-                    x[c] = pd.to_numeric(x[c], errors="coerce")
+                    try:
+                        x[c] = pd.to_numeric(x[c], errors="raise")
+                        self.converted_to_numeric.append(c)
+                    except (ValueError, TypeError):
+                        self.logger.warning(f"Remove feature {c} because it is not numeric and not in cat_features")
+                        x.drop(columns=[c], inplace=True)
+                        self.droped_features.append(c)
+
+        return x, y, groups, {}
+
+    def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
+        x, y, _ = self._prepare_data(x, y)
+
+        if self.droped_features:
+            self.logger.info(f"Drop features on calculate metrics: {self.droped_features}")
+            x = x.drop(columns=self.droped_features)
+
+        if self.converted_to_int:
+            self.logger.info(f"Convert to int features on calculate metrics: {self.converted_to_int}")
+            for c in self.converted_to_int:
+                x[c] = x[c].astype(np.int64)
+
+        if self.converted_to_str:
+            self.logger.info(f"Convert to str features on calculate metrics: {self.converted_to_str}")
+            for c in self.converted_to_str:
+                x[c] = x[c].astype(str)
+
+        if self.converted_to_numeric:
+            self.logger.info(f"Convert to numeric features on calculate metrics: {self.converted_to_numeric}")
+            for c in self.converted_to_numeric:
+                x[c] = pd.to_numeric(x[c], errors="coerce")
 
         return x, y, {}
 
@@ -443,6 +464,8 @@ class EstimatorWrapper:
         if baseline_score_column is not None and self.metric_name == "GINI":
             self.logger.info("Calculate baseline GINI on passed baseline_score_column and target")
             metric = roc_auc_score(y, x[baseline_score_column])
+            metric_std = None
+            average_shap_values = None
         else:
             self.logger.info(f"Cross validate with estimeator: {self.estimator}")
             cv_results = cross_validate(
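
Note on the metrics.py changes above: the reorganised _prepare_to_fit / _prepare_to_calculate pair follows a record-and-replay pattern, where dtype decisions made on the training frame (drop, cast to int or str, coerce to numeric) are remembered in lists and replayed on the evaluation frame, so both frames reach the estimator with matching dtypes. The following is a compact sketch of that idea under simplified rules, using a hypothetical DtypePreparer class rather than the upgini API:

    import pandas as pd

    class DtypePreparer:
        """Sketch: record dtype decisions at fit time, replay them at metric time."""

        def __init__(self):
            self.dropped = []      # columns removed during fit
            self.to_str = []       # columns cast to str during fit
            self.to_numeric = []   # columns coerced to numeric during fit

        def prepare_fit(self, x: pd.DataFrame) -> pd.DataFrame:
            x = x.copy()
            for c in list(x.columns):
                if x[c].nunique(dropna=False) < 2:
                    x = x.drop(columns=[c])
                    self.dropped.append(c)
                elif x[c].dtype == object:
                    try:
                        x[c] = pd.to_numeric(x[c], errors="raise")
                        self.to_numeric.append(c)
                    except (ValueError, TypeError):
                        x[c] = x[c].astype(str)
                        self.to_str.append(c)
            return x

        def prepare_calculate(self, x: pd.DataFrame) -> pd.DataFrame:
            # Replay exactly the decisions recorded during fit.
            x = x.drop(columns=self.dropped, errors="ignore").copy()
            for c in self.to_str:
                x[c] = x[c].astype(str)
            for c in self.to_numeric:
                x[c] = pd.to_numeric(x[c], errors="coerce")
            return x

Using errors="coerce" at replay time mirrors the diff: values that fail conversion on the evaluation frame become NaN instead of aborting the metric run.
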
upgini/search_task.py CHANGED
@@ -179,6 +179,7 @@ class SearchTask:
             for f in meta.generated_features
             for c in f.base_columns
             if c.ads_definition_id is None
+            and not c.original_name.endswith("_emb")  # embeddings already added
         )
         return list(features_for_transform)
 
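
For context, the added predicate excludes base columns whose original name ends with "_emb" when collecting features for transform, since, per the inline comment, the embeddings are already added. An illustrative sketch with stand-in metadata objects, not the real SearchTask structures:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class BaseColumn:  # hypothetical stand-in for the real metadata class
        original_name: str
        ads_definition_id: Optional[str] = None

    base_columns = [BaseColumn("city"), BaseColumn("city_emb"), BaseColumn("paid_col", "ads-1")]

    features_for_transform = {
        c.original_name
        for c in base_columns
        if c.ads_definition_id is None
        and not c.original_name.endswith("_emb")  # embeddings already added
    }
    print(sorted(features_for_transform))  # ['city']
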
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.81a3832.dev10
+Version: 1.2.81a3832.dev12
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
@@ -1,13 +1,13 @@
-upgini/__about__.py,sha256=dUnN248oLg0rBaOttshEyx0_AtLIiP6ku5lXmtwrlQo,34
+upgini/__about__.py,sha256=QoAMu0gkmwzsYvsLvBmcg4CfaE-sL6T-rz9s8HCGZY4,34
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=ZSSukaq4_mngCkJyQe-XCssXbH8nOD7ByWfSHi9nypc,210847
+upgini/features_enricher.py,sha256=cbQydnSOr7-ioQuEs-X3KYd0ays1BPuwFE_sKmOQc5E,211702
 upgini/http.py,sha256=AfaJ3c8z_tK2hZFEehNybDKE0mp1tYcyAP_l0_p8bLQ,43933
 upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
-upgini/metrics.py,sha256=DpXJtooXDCLTJUf3JlfIsJiwx9Hg-2vv4-k4RWkXFMU,42269
-upgini/search_task.py,sha256=RcvAE785yksWTsTNWuZFVNlk32jHElMoEna1T_C5N8Q,17823
+upgini/metrics.py,sha256=sbxnFyMWCUsVSAy-OwNmDYJxVlGEnTArVUnTOID7miU,43373
+upgini/search_task.py,sha256=Q5HjBpLIB3OCxAD1zNv5yQ3ZNJx696WCK_-H35_y7Rs,17912
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
 upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.81a3832.dev10.dist-info/METADATA,sha256=F0Eg-CF-u-X2QDwUGlH0Fom-Ys1Br4bfoR_RBUq0ob8,49173
-upgini-1.2.81a3832.dev10.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-upgini-1.2.81a3832.dev10.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.81a3832.dev10.dist-info/RECORD,,
+upgini-1.2.81a3832.dev12.dist-info/METADATA,sha256=2cf3_AwHclmjPzAluKb_Y2I_4OecghsB-DqKoJVODls,49173
+upgini-1.2.81a3832.dev12.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.81a3832.dev12.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.81a3832.dev12.dist-info/RECORD,,