PyPI - upgini - Versions diffs - 1.2.81a3832.dev7__py3-none-any.whl → 1.2.81a3832.dev9__py3-none-any.whl - Mend

upgini 1.2.81a3832.dev7-py3-none-any.whl → 1.2.81a3832.dev9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

upgini/__about__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "1.2.81a3832.dev7"
1	+ __version__ = "1.2.81a3832.dev9"

upgini/features_enricher.py CHANGED Viewed

@@ -1768,10 +1768,10 @@ class FeaturesEnricher(TransformerMixin):
             df = generator.generate(df)
             generated_features.extend(generator.generated_features)
-        # normalizer = Normalizer(self.bundle, self.logger)
-        # df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
-        # columns_renaming = normalizer.columns_renaming
-        columns_renaming = {c: c for c in df.columns}
+        normalizer = Normalizer(self.bundle, self.logger)
+        df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
+        columns_renaming = normalizer.columns_renaming
+        # columns_renaming = {c: c for c in df.columns}
         df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
@@ -3881,7 +3881,7 @@ if response.status_code == 200:
         if features_meta is None:
             raise Exception(self.bundle.get("missing_features_meta"))
-        return [f.name for f in features_meta if f.type == "categorical" and f.shap_value > 0.0]
+        return [f.name for f in features_meta if f.type == "categorical"]
     def __prepare_feature_importances(
         self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False

upgini/metrics.py CHANGED Viewed

@@ -6,7 +6,7 @@ import re
 from collections import defaultdict
 from copy import deepcopy
 from dataclasses import dataclass
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union
 import lightgbm as lgb
 import numpy as np
@@ -18,6 +18,7 @@ from numpy import log1p
 from pandas.api.types import is_numeric_dtype
 from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
+from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
 from upgini.utils.features_validator import FeaturesValidator
 from upgini.utils.sklearn_ext import cross_validate
@@ -31,7 +32,7 @@ except ImportError:
     available_scorers = SCORERS
 from sklearn.metrics import mean_squared_error
 from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
-from sklearn.model_selection import BaseCrossValidator  # , TimeSeriesSplit
+from sklearn.model_selection import BaseCrossValidator, TimeSeriesSplit  # , TimeSeriesSplit
 from upgini.errors import ValidationError
 from upgini.metadata import ModelTaskType
@@ -250,6 +251,8 @@ class _CrossValResults:
 class EstimatorWrapper:
+    default_estimator: Literal["catboost", "lightgbm"] = "catboost"
     def __init__(
         self,
         estimator,
@@ -303,6 +306,8 @@ class EstimatorWrapper:
             else:
                 if x[c].dtype == "category" and x[c].cat.categories.dtype == np.int64:
                     x[c] = x[c].astype(np.int64)
+                elif not is_numeric_dtype(x[c]):
+                    x[c] = x[c].astype(str).astype("category")
         if not isinstance(y, pd.Series):
             raise Exception(bundle.get("metrics_unsupported_target_type").format(type(y)))
@@ -352,6 +357,7 @@ class EstimatorWrapper:
             self.logger.info("Calculate baseline GINI on passed baseline_score_column and target")
             metric = roc_auc_score(y, x[baseline_score_column])
         else:
+            self.logger.info(f"Cross validate with estimeator: {self.estimator}")
             cv_results = cross_validate(
                 estimator=self.estimator,
                 x=x,
@@ -458,31 +464,43 @@ class EstimatorWrapper:
             "logger": logger,
         }
         if estimator is None:
-            params = {"has_time": has_date}
-            if target_type == ModelTaskType.MULTICLASS:
-                params = _get_add_params(params, CATBOOST_MULTICLASS_PARAMS)
-                params = _get_add_params(params, add_params)
-                estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
-                # params = _get_add_params(params, LIGHTGBM_MULTICLASS_PARAMS)
-                # params = _get_add_params(params, add_params)
-                # estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
-            elif target_type == ModelTaskType.BINARY:
-                params = _get_add_params(params, CATBOOST_BINARY_PARAMS)
-                params = _get_add_params(params, add_params)
-                estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
-                # params = _get_add_params(params, LIGHTGBM_BINARY_PARAMS)
-                # params = _get_add_params(params, add_params)
-                # estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
-            elif target_type == ModelTaskType.REGRESSION:
-                params = _get_add_params(params, CATBOOST_REGRESSION_PARAMS)
-                params = _get_add_params(params, add_params)
-                estimator = CatBoostWrapper(CatBoostRegressor(**params), **kwargs)
-                # if not isinstance(cv, TimeSeriesSplit) and not isinstance(cv, BlockedTimeSeriesSplit):
-                #     params = _get_add_params(params, LIGHTGBM_REGRESSION_PARAMS)
-                # params = _get_add_params(params, add_params)
-                # estimator = LightGBMWrapper(LGBMRegressor(**params), **kwargs)
+            if EstimatorWrapper.default_estimator == "catboost":
+                logger.info("Using CatBoost as default estimator")
+                params = {"has_time": has_date}
+                if target_type == ModelTaskType.MULTICLASS:
+                    params = _get_add_params(params, CATBOOST_MULTICLASS_PARAMS)
+                    params = _get_add_params(params, add_params)
+                    estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
+                elif target_type == ModelTaskType.BINARY:
+                    params = _get_add_params(params, CATBOOST_BINARY_PARAMS)
+                    params = _get_add_params(params, add_params)
+                    estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
+                elif target_type == ModelTaskType.REGRESSION:
+                    params = _get_add_params(params, CATBOOST_REGRESSION_PARAMS)
+                    params = _get_add_params(params, add_params)
+                    estimator = CatBoostWrapper(CatBoostRegressor(**params), **kwargs)
+                else:
+                    raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
+            elif EstimatorWrapper.default_estimator == "lightgbm":
+                logger.info("Using LightGBM as default estimator")
+                params = {"random_state": DEFAULT_RANDOM_STATE, "verbose": -1}
+                if target_type == ModelTaskType.MULTICLASS:
+                    params = _get_add_params(params, LIGHTGBM_MULTICLASS_PARAMS)
+                    params = _get_add_params(params, add_params)
+                    estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
+                elif target_type == ModelTaskType.BINARY:
+                    params = _get_add_params(params, LIGHTGBM_BINARY_PARAMS)
+                    params = _get_add_params(params, add_params)
+                    estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
+                elif target_type == ModelTaskType.REGRESSION:
+                    if not isinstance(cv, TimeSeriesSplit) and not isinstance(cv, BlockedTimeSeriesSplit):
+                        params = _get_add_params(params, LIGHTGBM_REGRESSION_PARAMS)
+                    params = _get_add_params(params, add_params)
+                    estimator = LightGBMWrapper(LGBMRegressor(**params), **kwargs)
+                else:
+                    raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
             else:
-                raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
+                raise Exception("Unsupported default_estimator. Available: catboost, lightgbm")
         else:
             if hasattr(estimator, "copy"):
                 estimator_copy = estimator.copy()
@@ -490,8 +508,8 @@ class EstimatorWrapper:
                 estimator_copy = deepcopy(estimator)
             kwargs["estimator"] = estimator_copy
             if is_catboost_estimator(estimator):
-                if cat_features is not None:
-                    estimator_copy.set_params(cat_features=cat_features, has_time=has_date)
+                if has_date is not None:
+                    estimator_copy.set_params(has_time=has_date)
                 estimator = CatBoostWrapper(**kwargs)
             else:
                 if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
@@ -941,8 +959,8 @@ def _get_cat_features(
     logger.info(f"Selected categorical features: {cat_features}")
-    non_encode_features = list(set(x.select_dtypes(exclude=[np.number, np.datetime64, pd.CategoricalDtype()]).columns))
-    features_to_encode = [f for f in cat_features if f not in non_encode_features]
+    features_to_encode = list(set(x.select_dtypes(exclude=[np.number, np.datetime64, pd.CategoricalDtype()]).columns))
+    features_to_encode = [f for f in cat_features if f in features_to_encode]
     logger.info(f"Features to encode: {features_to_encode}")

{upgini-1.2.81a3832.dev7.dist-info → upgini-1.2.81a3832.dev9.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.81a3832.dev7
+Version: 1.2.81a3832.dev9
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/

{upgini-1.2.81a3832.dev7.dist-info → upgini-1.2.81a3832.dev9.dist-info}/RECORD RENAMED Viewed

@@ -1,12 +1,12 @@
-upgini/__about__.py,sha256=RVMSywROOgx43djBaCB4g_TyIw1r_t2n34999sThuLw,33
+upgini/__about__.py,sha256=wEcwloV3XNyxWA40HLqEb4PIXttvc8pREucBfzAKW0c,33
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=WCX50iuq8_hf9AYuEfs_ZWNR7FbFc44zuXg27Z40r2s,210874
+upgini/features_enricher.py,sha256=ZSSukaq4_mngCkJyQe-XCssXbH8nOD7ByWfSHi9nypc,210847
 upgini/http.py,sha256=AfaJ3c8z_tK2hZFEehNybDKE0mp1tYcyAP_l0_p8bLQ,43933
 upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
-upgini/metrics.py,sha256=lWFF_dQAWcgI7EOQlTXiLjsAEoPLxNv1PCp_egoKolc,38821
+upgini/metrics.py,sha256=4ehQO8VEebKLiCuBq2LRqC2QbPIqswoe7b1pnR_-zQA,39985
 upgini/search_task.py,sha256=RcvAE785yksWTsTNWuZFVNlk32jHElMoEna1T_C5N8Q,17823
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.81a3832.dev7.dist-info/METADATA,sha256=BrDfaRLGuSFtMudHxPC_sYI8_G9iWzidJO0vIihGtUE,49172
-upgini-1.2.81a3832.dev7.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-upgini-1.2.81a3832.dev7.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.81a3832.dev7.dist-info/RECORD,,
+upgini-1.2.81a3832.dev9.dist-info/METADATA,sha256=6jP4TJl2tN98P8wuWIBARzrPtZVRT48uPukgTvZOvlA,49172
+upgini-1.2.81a3832.dev9.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.81a3832.dev9.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.81a3832.dev9.dist-info/RECORD,,

{upgini-1.2.81a3832.dev7.dist-info → upgini-1.2.81a3832.dev9.dist-info}/WHEEL RENAMED Viewed

File without changes

{upgini-1.2.81a3832.dev7.dist-info → upgini-1.2.81a3832.dev9.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

upgini 1.2.81a3832.dev7__py3-none-any.whl → 1.2.81a3832.dev9__py3-none-any.whl

upgini 1.2.81a3832.dev7-py3-none-any.whl → 1.2.81a3832.dev9-py3-none-any.whl