PyPI - upgini - Versions diffs - 1.1.262a3250.post4__py3-none-any.whl → 1.1.280a3418.post2__py3-none-any.whl - Mend

upgini 1.1.262a3250.post4__py3-none-any.whl → 1.1.280a3418.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (49) hide show

upgini/__about__.py +1 -0
upgini/ads.py +6 -2
upgini/ads_management/ads_manager.py +4 -2
upgini/autofe/all_operands.py +16 -4
upgini/autofe/binary.py +2 -1
upgini/autofe/date.py +74 -7
upgini/autofe/feature.py +1 -1
upgini/autofe/groupby.py +3 -1
upgini/autofe/operand.py +4 -3
upgini/autofe/unary.py +20 -1
upgini/autofe/vector.py +2 -0
upgini/data_source/data_source_publisher.py +14 -4
upgini/dataset.py +8 -7
upgini/errors.py +1 -1
upgini/features_enricher.py +156 -63
upgini/http.py +11 -10
upgini/mdc/__init__.py +1 -3
upgini/mdc/context.py +4 -6
upgini/metadata.py +3 -0
upgini/metrics.py +160 -96
upgini/normalizer/phone_normalizer.py +2 -2
upgini/resource_bundle/__init__.py +5 -5
upgini/resource_bundle/strings.properties +9 -4
upgini/sampler/base.py +1 -4
upgini/sampler/random_under_sampler.py +2 -5
upgini/search_task.py +4 -4
upgini/spinner.py +1 -1
upgini/utils/__init__.py +3 -2
upgini/utils/base_search_key_detector.py +2 -2
upgini/utils/blocked_time_series.py +4 -2
upgini/utils/country_utils.py +2 -2
upgini/utils/custom_loss_utils.py +3 -2
upgini/utils/cv_utils.py +2 -2
upgini/utils/datetime_utils.py +75 -18
upgini/utils/deduplicate_utils.py +61 -18
upgini/utils/email_utils.py +3 -3
upgini/utils/fallback_progress_bar.py +1 -1
upgini/utils/features_validator.py +2 -1
upgini/utils/progress_bar.py +1 -1
upgini/utils/sklearn_ext.py +15 -15
upgini/utils/target_utils.py +21 -7
upgini/utils/track_info.py +27 -15
upgini/version_validator.py +2 -2
{upgini-1.1.262a3250.post4.dist-info → upgini-1.1.280a3418.post2.dist-info}/METADATA +21 -23
upgini-1.1.280a3418.post2.dist-info/RECORD +62 -0
{upgini-1.1.262a3250.post4.dist-info → upgini-1.1.280a3418.post2.dist-info}/WHEEL +1 -2
upgini-1.1.262a3250.post4.dist-info/RECORD +0 -62
upgini-1.1.262a3250.post4.dist-info/top_level.txt +0 -1
{upgini-1.1.262a3250.post4.dist-info → upgini-1.1.280a3418.post2.dist-info/licenses}/LICENSE +0 -0

upgini/metrics.py CHANGED Viewed

@@ -1,17 +1,21 @@
+from __future__ import annotations
+import inspect
 import logging
 import re
 from copy import deepcopy
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+import catboost
 import numpy as np
 import pandas as pd
 from catboost import CatBoostClassifier, CatBoostRegressor
-import catboost
 from lightgbm import LGBMClassifier, LGBMRegressor
 from numpy import log1p
 from pandas.api.types import is_numeric_dtype
 from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
+from upgini.utils.features_validator import FeaturesValidator
 from upgini.utils.sklearn_ext import cross_validate
 try:
@@ -123,7 +127,7 @@ NA_REPLACEMENT = "NA"
 SUPPORTED_CATBOOST_METRICS = {
     s.upper(): s
-    for s in {
+    for s in (
         "Logloss",
         "CrossEntropy",
         "CtrFactor",
@@ -202,7 +206,7 @@ SUPPORTED_CATBOOST_METRICS = {
         "MultiLogloss",
         "MultiCrossEntropy",
         "Combination",
-    }
+    )
 }
@@ -234,71 +238,71 @@ class EstimatorWrapper:
         self.text_features = text_features
         self.logger = logger or logging.getLogger()
-    def fit(self, X: pd.DataFrame, y: np.ndarray, **kwargs):
-        X, y, _, fit_params = self._prepare_to_fit(X, y)
+    def fit(self, x: pd.DataFrame, y: np.ndarray, **kwargs):
+        x, y, _, fit_params = self._prepare_to_fit(x, y)
         kwargs.update(fit_params)
-        self.estimator.fit(X, y, **kwargs)
+        self.estimator.fit(x, y, **kwargs)
         return self
     def predict(self, **kwargs):
         return self.estimator.predict(**kwargs)
-    def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
-        X, y, groups = self._prepare_data(X, y, groups=self.groups)
-        return X, y, groups, {}
+    def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
+        x, y, groups = self._prepare_data(x, y, groups=self.groups)
+        return x, y, groups, {}
     def _prepare_data(
-        self, X: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
+        self, x: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
     ) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
-        for c in X.columns:
-            if is_numeric_dtype(X[c]):
-                X[c] = X[c].astype(float)
+        for c in x.columns:
+            if is_numeric_dtype(x[c]):
+                x[c] = x[c].astype(float)
             else:
-                X[c] = X[c].astype(str)
+                x[c] = x[c].astype(str)
         if not isinstance(y, pd.Series):
             raise Exception(bundle.get("metrics_unsupported_target_type").format(type(y)))
         if groups is not None:
-            X = X.copy()
-            X["__groups"] = groups
-            X, y = self._remove_empty_target_rows(X, y)
-            groups = X["__groups"]
-            X = X.drop(columns="__groups")
+            x = x.copy()
+            x["__groups"] = groups
+            x, y = self._remove_empty_target_rows(x, y)
+            groups = x["__groups"]
+            x = x.drop(columns="__groups")
         else:
-            X, y = self._remove_empty_target_rows(X, y)
+            x, y = self._remove_empty_target_rows(x, y)
-        return X, y, groups
+        return x, y, groups
-    def _remove_empty_target_rows(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
-        joined = pd.concat([X, y], axis=1)
+    def _remove_empty_target_rows(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
+        joined = pd.concat([x, y], axis=1)
         joined = joined[joined[y.name].notna()]
         joined = joined.reset_index(drop=True)
-        X = joined.drop(columns=y.name)
+        x = joined.drop(columns=y.name)
         y = np.array(list(joined[y.name].values))
-        return X, y
+        return x, y
-    def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
-        X, y, _ = self._prepare_data(X, y)
-        return X, y, {}
+    def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
+        x, y, _ = self._prepare_data(x, y)
+        return x, y, {}
     def cross_val_predict(
-        self, X: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
+        self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
     ) -> Optional[float]:
-        X, y, groups, fit_params = self._prepare_to_fit(X, y)
+        x, y, groups, fit_params = self._prepare_to_fit(x, y)
-        if X.shape[1] == 0:
+        if x.shape[1] == 0:
             return None
         scorer = check_scoring(self.estimator, scoring=self.scorer)
         if baseline_score_column is not None and self.metric_name == "GINI":
-            metric = roc_auc_score(y, X[baseline_score_column])
+            metric = roc_auc_score(y, x[baseline_score_column])
         else:
             cv_results = cross_validate(
                 estimator=self.estimator,
-                X=X,
+                x=x,
                 y=y,
                 scoring=scorer,
                 cv=self.cv,
@@ -318,14 +322,14 @@ class EstimatorWrapper:
             metric = 2 * metric - 1
         return metric
-    def calculate_metric(self, X: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None) -> float:
-        X, y, _ = self._prepare_to_calculate(X, y)
+    def calculate_metric(self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None) -> float:
+        x, y, _ = self._prepare_to_calculate(x, y)
         if baseline_score_column is not None and self.metric_name == "GINI":
-            metric = roc_auc_score(y, X[baseline_score_column])
+            metric = roc_auc_score(y, x[baseline_score_column])
         else:
             metrics = []
             for est in self.cv_estimators:
-                metrics.append(self.scorer(est, X, y))
+                metrics.append(self.scorer(est, x, y))
             metric = np.mean(metrics) * self.multiplier
         return self.post_process_metric(metric)
@@ -336,13 +340,13 @@ class EstimatorWrapper:
         logger: logging.Logger,
         target_type: ModelTaskType,
         cv: BaseCrossValidator,
-        X: pd.DataFrame,
+        x: pd.DataFrame,
         scoring: Union[Callable, str, None] = None,
         cat_features: Optional[List[str]] = None,
         text_features: Optional[List[str]] = None,
         add_params: Optional[Dict[str, Any]] = None,
         groups: Optional[List[str]] = None,
-    ) -> "EstimatorWrapper":
+    ) -> EstimatorWrapper:
         scorer, metric_name, multiplier = _get_scorer(target_type, scoring)
         kwargs = {
             "scorer": scorer,
@@ -352,6 +356,7 @@ class EstimatorWrapper:
             "target_type": target_type,
             "groups": groups,
             "text_features": text_features,
+            "logger": logger,
         }
         if estimator is None:
             params = dict()
@@ -377,15 +382,20 @@ class EstimatorWrapper:
             else:
                 estimator_copy = deepcopy(estimator)
             kwargs["estimator"] = estimator_copy
-            if isinstance(estimator, CatBoostClassifier) or isinstance(estimator, CatBoostRegressor):
+            if isinstance(estimator, (CatBoostClassifier, CatBoostRegressor)):
                 if cat_features is not None:
+                    for cat_feature in cat_features:
+                        if cat_feature not in x.columns:
+                            logger.error(
+                                f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
+                            )
                     estimator_copy.set_params(
-                        cat_features=[X.columns.get_loc(cat_feature) for cat_feature in cat_features]
+                        cat_features=[x.columns.get_loc(cat_feature) for cat_feature in cat_features]
                     )
                 estimator = CatBoostWrapper(**kwargs)
             else:
                 try:
-                    if isinstance(estimator, LGBMClassifier) or isinstance(estimator, LGBMRegressor):
+                    if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
                         estimator = LightGBMWrapper(**kwargs)
                     else:
                         logger.warning(
@@ -414,32 +424,40 @@ class CatBoostWrapper(EstimatorWrapper):
         target_type: ModelTaskType,
         groups: Optional[List[str]] = None,
         text_features: Optional[List[str]] = None,
+        logger: Optional[logging.Logger] = None,
     ):
         super(CatBoostWrapper, self).__init__(
-            estimator, scorer, metric_name, multiplier, cv, target_type, groups=groups, text_features=text_features
+            estimator,
+            scorer,
+            metric_name,
+            multiplier,
+            cv,
+            target_type,
+            groups=groups,
+            text_features=text_features,
+            logger=logger,
         )
         self.cat_features = None
         self.emb_features = None
+        self.exclude_features = []
-    def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
-        X, y, groups, params = super()._prepare_to_fit(X, y)
+    def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
+        x, y, groups, params = super()._prepare_to_fit(x, y)
         # Find embeddings
         if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
             emb_pattern = r"(.+)_emb\d+"
-            self.emb_features = [c for c in X.columns if re.match(emb_pattern, c) and is_numeric_dtype(X[c])]
+            self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
             embedding_features = []
             if len(self.emb_features) > 3:  # There is no reason to reduce embeddings dimension with less than 4
                 self.logger.info(
                     "Embedding features count more than 3, so group them into one vector for CatBoost: "
                     f"{self.emb_features}"
                 )
-                X, embedding_features = self.group_embeddings(X)
+                x, embedding_features = self.group_embeddings(x)
                 params["embedding_features"] = embedding_features
             else:
-                self.logger.info(
-                    f"Embedding features count less than 3, so use them separately: {self.emb_features}"
-                )
+                self.logger.info(f"Embedding features count less than 3, so use them separately: {self.emb_features}")
                 self.emb_features = []
         else:
             self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")
@@ -448,7 +466,7 @@ class CatBoostWrapper(EstimatorWrapper):
         if hasattr(CatBoostClassifier, "get_text_feature_indices"):
             if self.text_features is not None:
                 self.logger.info(f"Passed text features for CatBoost: {self.text_features}")
-                self.text_features = [f for f in self.text_features if f in X.columns and not is_numeric_dtype(X[f])]
+                self.text_features = [f for f in self.text_features if f in x.columns and not is_numeric_dtype(x[f])]
                 self.logger.info(f"Rest text features after checks: {self.text_features}")
                 params["text_features"] = self.text_features
         else:
@@ -456,15 +474,15 @@ class CatBoostWrapper(EstimatorWrapper):
             self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
         # Find rest categorical features
-        self.cat_features = _get_cat_features(X, self.text_features, embedding_features)
-        X = fill_na_cat_features(X, self.cat_features)
+        self.cat_features = _get_cat_features(x, self.text_features, embedding_features)
+        x = fill_na_cat_features(x, self.cat_features)
         unique_cat_features = []
         for name in self.cat_features:
             # Remove constant categorical features
-            if X[name].nunique() > 1:
+            if x[name].nunique() > 1:
                 unique_cat_features.append(name)
             else:
-                X = X.drop(columns=name)
+                x = x.drop(columns=name)
         self.cat_features = unique_cat_features
         if (
             hasattr(self.estimator, "get_param")
@@ -473,9 +491,9 @@ class CatBoostWrapper(EstimatorWrapper):
         ):
             estimator_cat_features = self.estimator.get_param("cat_features")
             if all([isinstance(c, int) for c in estimator_cat_features]):
-                cat_features_idx = {X.columns.get_loc(c) for c in self.cat_features}
+                cat_features_idx = {x.columns.get_loc(c) for c in self.cat_features}
                 cat_features_idx.update(estimator_cat_features)
-                self.cat_features = [X.columns[idx] for idx in sorted(cat_features_idx)]
+                self.cat_features = [x.columns[idx] for idx in sorted(cat_features_idx)]
             elif all([isinstance(c, str) for c in estimator_cat_features]):
                 self.cat_features = list(set(self.cat_features + estimator_cat_features))
             else:
@@ -486,7 +504,7 @@ class CatBoostWrapper(EstimatorWrapper):
         self.logger.info(f"Selected categorical features: {self.cat_features}")
         params["cat_features"] = self.cat_features
-        return X, y, groups, params
+        return x, y, groups, params
     def group_embeddings(self, df: pd.DataFrame):
         emb_name = "__grouped_embeddings"
@@ -497,18 +515,40 @@ class CatBoostWrapper(EstimatorWrapper):
         return df, [emb_name]
-    def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
-        X, y, params = super()._prepare_to_calculate(X, y)
+    def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
+        if self.exclude_features:
+            x = x.drop(columns=self.exclude_features)
+        x, y, params = super()._prepare_to_calculate(x, y)
         if self.text_features:
             params["text_features"] = self.text_features
         if self.emb_features:
-            X, emb_columns = self.group_embeddings(X)
+            x, emb_columns = self.group_embeddings(x)
             params["embedding_features"] = emb_columns
         if self.cat_features:
-            X = fill_na_cat_features(X, self.cat_features)
+            x = fill_na_cat_features(x, self.cat_features)
             params["cat_features"] = self.cat_features
-        return X, y, params
+        return x, y, params
+    def cross_val_predict(
+        self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
+    ) -> Optional[float]:
+        try:
+            return super().cross_val_predict(x, y, baseline_score_column)
+        except Exception as e:
+            if "Dictionary size is 0" in e.args[0] and self.text_features:
+                high_cardinality_features = FeaturesValidator.find_high_cardinality(x[self.text_features])
+                self.logger.warning(
+                    "Calculate metrics has problem with CatBoost text features. Try to remove high cardinality"
+                    f" text features {high_cardinality_features} and retry"
+                )
+                for f in high_cardinality_features:
+                    self.text_features.remove(f)
+                    self.exclude_features.append(f)
+                    x = x.drop(columns=f)
+                return super().cross_val_predict(x, y, baseline_score_column)
+            else:
+                raise e
 class LightGBMWrapper(EstimatorWrapper):
@@ -522,32 +562,41 @@ class LightGBMWrapper(EstimatorWrapper):
         target_type: ModelTaskType,
         groups: Optional[List[str]] = None,
         text_features: Optional[List[str]] = None,
+        logger: Optional[logging.Logger] = None,
     ):
         super(LightGBMWrapper, self).__init__(
-            estimator, scorer, metric_name, multiplier, cv, target_type, groups=groups, text_features=text_features
+            estimator,
+            scorer,
+            metric_name,
+            multiplier,
+            cv,
+            target_type,
+            groups=groups,
+            text_features=text_features,
+            logger=logger,
         )
         self.cat_features = None
-    def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
-        X, y, groups, params = super()._prepare_to_fit(X, y)
-        self.cat_features = _get_cat_features(X)
-        X = fill_na_cat_features(X, self.cat_features)
+    def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
+        x, y, groups, params = super()._prepare_to_fit(x, y)
+        self.cat_features = _get_cat_features(x)
+        x = fill_na_cat_features(x, self.cat_features)
         for feature in self.cat_features:
-            X[feature] = X[feature].astype("category").cat.codes
+            x[feature] = x[feature].astype("category").cat.codes
         if not is_numeric_dtype(y):
             y = correct_string_target(y)
-        return X, y, groups, params
+        return x, y, groups, params
-    def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
-        X, y, params = super()._prepare_to_calculate(X, y)
+    def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
+        x, y, params = super()._prepare_to_calculate(x, y)
         if self.cat_features is not None:
-            X = fill_na_cat_features(X, self.cat_features)
+            x = fill_na_cat_features(x, self.cat_features)
             for feature in self.cat_features:
-                X[feature] = X[feature].astype("category").cat.codes
+                x[feature] = x[feature].astype("category").cat.codes
         if not is_numeric_dtype(y):
             y = correct_string_target(y)
-        return X, y, params
+        return x, y, params
 class OtherEstimatorWrapper(EstimatorWrapper):
@@ -561,54 +610,69 @@ class OtherEstimatorWrapper(EstimatorWrapper):
         target_type: ModelTaskType,
         groups: Optional[List[str]] = None,
         text_features: Optional[List[str]] = None,
+        logger: Optional[logging.Logger] = None,
     ):
         super(OtherEstimatorWrapper, self).__init__(
-            estimator, scorer, metric_name, multiplier, cv, target_type, groups=groups, text_features=text_features
+            estimator,
+            scorer,
+            metric_name,
+            multiplier,
+            cv,
+            target_type,
+            groups=groups,
+            text_features=text_features,
+            logger=logger,
         )
         self.cat_features = None
-    def _prepare_to_fit(self, X: pd.DataFrame, y: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
-        X, y, groups, params = super()._prepare_to_fit(X, y)
-        self.cat_features = _get_cat_features(X)
-        num_features = [col for col in X.columns if col not in self.cat_features]
-        X[num_features] = X[num_features].fillna(-999)
-        X = fill_na_cat_features(X, self.cat_features)
+    def _prepare_to_fit(self, x: pd.DataFrame, y: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
+        x, y, groups, params = super()._prepare_to_fit(x, y)
+        self.cat_features = _get_cat_features(x)
+        num_features = [col for col in x.columns if col not in self.cat_features]
+        x[num_features] = x[num_features].fillna(-999)
+        x = fill_na_cat_features(x, self.cat_features)
         # TODO use one-hot encoding if cardinality is less 50
         for feature in self.cat_features:
-            X[feature] = X[feature].astype("category").cat.codes
+            x[feature] = x[feature].astype("category").cat.codes
         if not is_numeric_dtype(y):
             y = correct_string_target(y)
-        return X, y, groups, params
+        return x, y, groups, params
-    def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
-        X, y, params = super()._prepare_to_calculate(X, y)
+    def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
+        x, y, params = super()._prepare_to_calculate(x, y)
         if self.cat_features is not None:
-            num_features = [col for col in X.columns if col not in self.cat_features]
-            X[num_features] = X[num_features].fillna(-999)
-            X = fill_na_cat_features(X, self.cat_features)
+            num_features = [col for col in x.columns if col not in self.cat_features]
+            x[num_features] = x[num_features].fillna(-999)
+            x = fill_na_cat_features(x, self.cat_features)
             # TODO use one-hot encoding if cardinality is less 50
             for feature in self.cat_features:
-                X[feature] = X[feature].astype("category").cat.codes
+                x[feature] = x[feature].astype("category").cat.codes
         if not is_numeric_dtype(y):
             y = correct_string_target(y)
-        return X, y, params
+        return x, y, params
 def validate_scoring_argument(scoring: Union[Callable, str, None]):
     if isinstance(scoring, str) and scoring is not None:
         _get_scorer_by_name(scoring)
+    elif isinstance(scoring, Callable):
+        spec = inspect.getfullargspec(scoring)
+        if len(spec.args) < 3:
+            raise ValidationError(
+                f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, x, y"
+            )
 def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
     metric_name = scoring
     multiplier = 1
-    if "mean_squared_log_error" == metric_name or "MSLE" == metric_name or "msle" == metric_name:
+    if metric_name == "mean_squared_log_error" or metric_name == "MSLE" or metric_name == "msle":
         scoring = make_scorer(_ext_mean_squared_log_error, greater_is_better=False)
         multiplier = -1
-    elif "root_mean_squared_log_error" in metric_name or "RMSLE" == metric_name or "rmsle" == metric_name:
+    elif "root_mean_squared_log_error" in metric_name or metric_name == "RMSLE" or metric_name == "rmsle":
         scoring = make_scorer(_ext_root_mean_squared_log_error, greater_is_better=False)
         multiplier = -1
-    elif "root_mean_squared_error" == metric_name or "RMSE" == metric_name or "rmse" == metric_name:
+    elif metric_name == "root_mean_squared_error" or metric_name == "RMSE" or metric_name == "rmse":
         scoring = get_scorer("neg_root_mean_squared_error")
         multiplier = -1
     elif scoring in available_scorers:
@@ -660,12 +724,12 @@ def _get_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None])
 def _get_cat_features(
-    X: pd.DataFrame, text_features: Optional[List[str]] = None, emb_features: Optional[List[str]] = None
+    x: pd.DataFrame, text_features: Optional[List[str]] = None, emb_features: Optional[List[str]] = None
 ) -> List[str]:
     text_features = text_features or []
     emb_features = emb_features or []
     exclude_features = text_features + emb_features
-    return [c for c in X.columns if c not in exclude_features and not is_numeric_dtype(X[c])]
+    return [c for c in x.columns if c not in exclude_features and not is_numeric_dtype(x[c])]
 def _get_add_params(input_params, add_params):

upgini/normalizer/phone_normalizer.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from typing import Optional
 import pandas as pd
-from pandas.api.types import is_float_dtype, is_int64_dtype, is_string_dtype
+from pandas.api.types import is_float_dtype, is_int64_dtype, is_object_dtype, is_string_dtype
 from upgini.errors import ValidationError
@@ -44,7 +44,7 @@ class PhoneNormalizer:
         Method will remove all non numeric chars from string and convert it to int.
         None will be set for phone numbers that couldn"t be converted to int
         """
-        if is_string_dtype(self.df[self.phone_column_name]):
+        if is_string_dtype(self.df[self.phone_column_name]) or is_object_dtype(self.df[self.phone_column_name]):
             convert_func = self.phone_str_to_int_safe
         elif is_float_dtype(self.df[self.phone_column_name]):
             convert_func = self.phone_float_to_int_safe

upgini/resource_bundle/__init__.py CHANGED Viewed

@@ -17,7 +17,7 @@ __author__ = "Felix Zenk"
 __email__ = "felix.zenk@web.de"
-class _Parser(object):
+class _Parser:
     """
     A parser for the .properties file format.
     """
@@ -49,7 +49,7 @@ class _Parser(object):
             return re.sub(pattern, lambda match: codecs.decode(match.group(0), "unicode-escape"), arg)
         # I/O read
-        with open(file_path, mode="r", encoding="utf-8") as f:
+        with open(file_path, encoding="utf-8") as f:
             lines = f.readlines()
         # parse
@@ -83,7 +83,7 @@ class _Parser(object):
         return mapping
-class ResourceBundle(object):
+class ResourceBundle:
     """
     A ResourceBundle manages internationalization of string resources
     """
@@ -199,7 +199,7 @@ class ResourceBundle(object):
         raise NotInResourceBundleError(self.name, item)
-def get_bundle(bundle_name: str, locale: str | Sequence[str | str] = None, path: Path | str = None) -> ResourceBundle:
+def get_bundle(bundle_name: str, locale: str | Sequence[str] = None, path: Path | str = None) -> ResourceBundle:
     """
     Return a new :class:`ResourceBundle` after parsing the locale
@@ -224,7 +224,7 @@ bundle = ResourceBundle("strings", None, path=os.path.dirname(os.path.realpath(_
 custom_bundles = dict()
-def get_custom_bundle(custom_cfg: Optional[str] = None) -> "ResourceBundle":
+def get_custom_bundle(custom_cfg: Optional[str] = None) -> ResourceBundle:
     global custom_bundles
     if custom_cfg is not None:
         custom_bundle = custom_bundles.get(custom_cfg)

upgini/resource_bundle/strings.properties CHANGED Viewed

@@ -38,6 +38,7 @@ loss_selection_warn=\nWARNING: Loss `{0}` is not supported for feature selection
 loss_calc_metrics_warn=\nWARNING: Loss `{0}` is not supported for metrics calculation with {1}
 multivariate_timeseries_detected=\nWARNING: Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
 group_k_fold_in_classification=\nWARNING: Using group K-fold cross-validation split for classification task.
+current_date_added=\nWARNING: No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
 # Errors
 failed_search_by_task_id=Failed to retrieve the specified search results
@@ -111,6 +112,9 @@ x_is_empty=X is empty
 y_is_empty=y is empty
 x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
 missing_generate_feature=\nWARNING: Feature {} specified in `generate_features` is not present in input columns: {}
+x_unstable_by_date=\nWARNING: Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
+train_unstable_target=\nWARNING: Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
+eval_unstable_target=\nWARNING: Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
     # eval set validation
 unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
 eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
@@ -145,7 +149,8 @@ dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample
 dataset_empty_column_names=Some column names are empty. Add names please
 dataset_full_duplicates=\nWARNING: {:.5f}% of the rows are fully duplicated
 dataset_diff_target_duplicates=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
-dataset_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
+dataset_train_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
+dataset_eval_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
 dataset_drop_old_dates=\nWARNING: We don't have data before '2000-01-01' and removed all earlier records from the search dataset
 dataset_all_dates_old=There is empty train dataset after removing data before '2000-01-01'
 dataset_invalid_target_type=Unexpected dtype of target for binary task type: {}. Expected int or bool
@@ -154,7 +159,7 @@ dataset_invalid_multiclass_target=Unexpected dtype of target for multiclass task
 dataset_invalid_regression_target=Unexpected dtype of target for regression task type: {}. Expected float
 dataset_invalid_timeseries_target=Unexpected dtype of target for timeseries task type: {}. Expected float
 dataset_to_many_multiclass_targets=The number of target classes {} exceeds the allowed threshold: {}. Please, correct your data and try again
-dataset_rarest_class_less_min=Frequency of the rarest class `{}` is {}, minimum frequency must be > {} for each class\nPlease, remove rows with rarest class from your dataframe
+dataset_rarest_class_less_min=Count of rows with the rarest class `{}` is {}, minimum count must be > {} for each class\nPlease, remove rows with rarest class from your dataframe
 dataset_rarest_class_less_threshold=\nWARNING: Target is imbalanced and will be undersampled to the rarest class. Frequency of the rarest class `{}` is {}\nMinimum number of observations for each class to avoid undersampling {} ({}%)
 dataset_date_features=\nWARNING: Columns {} is a datetime or period type but not used as a search key, removed from X
 dataset_too_many_features=Too many features. Maximum number of features is {}
@@ -196,10 +201,10 @@ email_detected=Emails detected in column `{}`. It will be used as a search key\n
 email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
 phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
 phone_detected_not_registered=\nWARNING: Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
-target_type_detected=Detected task type: {}\n
+target_type_detected=\nDetected task type: {}\n
 # all_ok_community_invite=Chat with us in Slack community:
 all_ok_community_invite=❓ Support request
-too_small_for_metrics=Your train dataset contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
+too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
 imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
 loss_selection_info=Using loss `{}` for feature selection
 loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator

upgini/sampler/base.py CHANGED Viewed

@@ -9,13 +9,11 @@ from abc import ABCMeta, abstractmethod
 from typing import List, Optional
 import numpy as np
 from sklearn.base import BaseEstimator
 from sklearn.preprocessing import label_binarize
 from sklearn.utils.multiclass import check_classification_targets
-from .utils import check_sampling_strategy, check_target_type
-from .utils import ArraysTransformer
+from .utils import ArraysTransformer, check_sampling_strategy, check_target_type
 class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
@@ -107,7 +105,6 @@ class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
             The corresponding label of `X_resampled`.
         """
-        pass
     @abstractmethod
     def _check_X_y(self, X, y, accept_sparse: Optional[List[str]] = None):

upgini 1.1.262a3250.post4__py3-none-any.whl → 1.1.280a3418.post2__py3-none-any.whl

Potentially problematic release.

upgini 1.1.262a3250.post4__py3-none-any.whl → 1.1.280a3418.post2__py3-none-any.whl