PyPI - upgini - Versions diffs - 1.2.81a3832.dev1__py3-none-any.whl → 1.2.81a3832.dev3__py3-none-any.whl - Mend

upgini 1.2.81a3832.dev1__py3-none-any.whl → 1.2.81a3832.dev3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

upgini/__about__.py +1 -1
upgini/features_enricher.py +20 -10
upgini/http.py +21 -21
upgini/mdc/__init__.py +1 -1
upgini/metrics.py +68 -38
{upgini-1.2.81a3832.dev1.dist-info → upgini-1.2.81a3832.dev3.dist-info}/METADATA +2 -1
{upgini-1.2.81a3832.dev1.dist-info → upgini-1.2.81a3832.dev3.dist-info}/RECORD +9 -9
{upgini-1.2.81a3832.dev1.dist-info → upgini-1.2.81a3832.dev3.dist-info}/WHEEL +0 -0
{upgini-1.2.81a3832.dev1.dist-info → upgini-1.2.81a3832.dev3.dist-info}/licenses/LICENSE +0 -0

upgini/__about__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "1.2.81a3832.dev1"
1	+ __version__ = "1.2.81a3832.dev3"

upgini/features_enricher.py CHANGED Viewed

@@ -310,6 +310,7 @@ class FeaturesEnricher(TransformerMixin):
                     self._search_task = search_task.poll_result(trace_id, quiet=True, check_fit=True)
                     file_metadata = self._search_task.get_file_metadata(trace_id)
                     x_columns = [c.originalName or c.name for c in file_metadata.columns]
+                    self.fit_columns_renaming = {c.name: c.originalName for c in file_metadata.columns}
                     df = pd.DataFrame(columns=x_columns)
                     self.__prepare_feature_importances(trace_id, df, silent=True)
                     # TODO validate search_keys with search_keys from file_metadata
@@ -476,7 +477,7 @@ class FeaturesEnricher(TransformerMixin):
             self.__validate_search_keys(self.search_keys)
             # Validate client estimator params
-            self._get_client_cat_features(estimator, X, self.search_keys)
+            self._get_and_validate_client_cat_features(estimator, X, self.search_keys)
             try:
                 self.X = X
@@ -957,9 +958,17 @@ class FeaturesEnricher(TransformerMixin):
                     self.__display_support_link(msg)
                     return None
-                client_cat_features, search_keys_for_metrics = self._get_client_cat_features(
+                cat_features_from_backend = self.__get_categorical_features()
+                client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
                     estimator, validated_X, self.search_keys
                 )
+                for cat_feature in cat_features_from_backend:
+                    original_cat_feature = self.fit_columns_renaming.get(cat_feature)
+                    if original_cat_feature in self.search_keys:
+                        if self.search_keys[original_cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
+                            search_keys_for_metrics.append(original_cat_feature)
+                        else:
+                            self.logger.warning(self.bundle.get("cat_feature_search_key").format(original_cat_feature))
                 search_keys_for_metrics.extend([c for c in self.id_columns or [] if c not in search_keys_for_metrics])
                 self.logger.info(f"Search keys for metrics: {search_keys_for_metrics}")
@@ -976,7 +985,7 @@ class FeaturesEnricher(TransformerMixin):
                     search_keys_for_metrics=search_keys_for_metrics,
                     progress_bar=progress_bar,
                     progress_callback=progress_callback,
-                    cat_features=client_cat_features,
+                    client_cat_features=client_cat_features,
                 )
                 if prepared_data is None:
                     return None
@@ -1027,7 +1036,6 @@ class FeaturesEnricher(TransformerMixin):
                     has_date = self._get_date_column(search_keys) is not None
                     model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
-                    cat_features_from_backend = self.__get_categorical_features()
                     cat_features = list(set(client_cat_features + cat_features_from_backend))
                     baseline_cat_features = [f for f in cat_features if f in fitting_X.columns]
                     enriched_cat_features = [f for f in cat_features if f in fitting_enriched_X.columns]
@@ -1423,7 +1431,7 @@ class FeaturesEnricher(TransformerMixin):
         return _cv, groups
-    def _get_client_cat_features(
+    def _get_and_validate_client_cat_features(
         self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
     ) -> Tuple[Optional[List[str]], List[str]]:
         cat_features = None
@@ -1468,7 +1476,7 @@ class FeaturesEnricher(TransformerMixin):
         search_keys_for_metrics: Optional[List[str]] = None,
         progress_bar: Optional[ProgressBar] = None,
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
-        cat_features: Optional[List[str]] = None,
+        client_cat_features: Optional[List[str]] = None,
     ):
         is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
         is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
@@ -1542,7 +1550,7 @@ class FeaturesEnricher(TransformerMixin):
         # Detect and drop high cardinality columns in train
         columns_with_high_cardinality = FeaturesValidator.find_high_cardinality(fitting_X)
-        non_excluding_columns = (self.generate_features or []) + (cat_features or [])
+        non_excluding_columns = (self.generate_features or []) + (client_cat_features or [])
         columns_with_high_cardinality = [c for c in columns_with_high_cardinality if c not in non_excluding_columns]
         if len(columns_with_high_cardinality) > 0:
             self.logger.warning(
@@ -2080,10 +2088,12 @@ class FeaturesEnricher(TransformerMixin):
         search_keys: Dict,
         columns_renaming: Dict[str, str],
     ):
+        # X_sampled - with hash-suffixes
+        reversed_renaming = {v: k for k, v in columns_renaming.items()}
         search_keys = {
-            columns_renaming.get(k, k): v
+            reversed_renaming.get(k, k): v
             for k, v in search_keys.items()
-            if columns_renaming.get(k, k) in X_sampled.columns.to_list()
+            if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
         }
         return FeaturesEnricher._SampledDataForMetrics(
             X_sampled=X_sampled,
@@ -3871,7 +3881,7 @@ if response.status_code == 200:
         if features_meta is None:
             raise Exception(self.bundle.get("missing_features_meta"))
-        return [f.name for f in features_meta if f.type == "categorical"]
+        return [f.name for f in features_meta if f.type == "categorical" and f.shap_value > 0.0]
     def __prepare_feature_importances(
         self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False

upgini/http.py CHANGED Viewed

@@ -20,7 +20,7 @@ import jwt
 # import pandas as pd
 import requests
 from pydantic import BaseModel
-from pythonjsonlogger import jsonlogger
+from pythonjsonlogger import json as jsonlogger
 from requests.exceptions import RequestException
 from upgini.__about__ import __version__
@@ -459,19 +459,19 @@ class _RestClient:
                 content = file.read()
                 md5_hash.update(content)
                 digest = md5_hash.hexdigest()
-                metadata_with_md5 = metadata.copy(update={"checksumMD5": digest})
+                metadata_with_md5 = metadata.model_copy(update={"checksumMD5": digest})
             # digest_sha256 = hashlib.sha256(
             #     pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
             # ).hexdigest()
             digest_sha256 = self.compute_file_digest(file_path)
-            metadata_with_md5 = metadata_with_md5.copy(update={"digest": digest_sha256})
+            metadata_with_md5 = metadata_with_md5.model_copy(update={"digest": digest_sha256})
             with open(file_path, "rb") as file:
                 files = {
                     "metadata": (
                         "metadata.json",
-                        metadata_with_md5.json(exclude_none=True).encode(),
+                        metadata_with_md5.model_dump_json(exclude_none=True).encode(),
                         "application/json",
                     ),
                     "tracking": (
@@ -481,7 +481,7 @@ class _RestClient:
                     ),
                     "metrics": (
                         "metrics.json",
-                        metrics.json(exclude_none=True).encode(),
+                        metrics.model_dump_json(exclude_none=True).encode(),
                         "application/json",
                     ),
                     "file": (metadata_with_md5.name, file, "application/octet-stream"),
@@ -489,7 +489,7 @@ class _RestClient:
                 if search_customization is not None:
                     files["customization"] = (
                         "customization.json",
-                        search_customization.json(exclude_none=True).encode(),
+                        search_customization.model_dump_json(exclude_none=True).encode(),
                         "application/json",
                     )
                 additional_headers = {self.SEARCH_KEYS_HEADER_NAME: ",".join(self.search_keys_meaning_types(metadata))}
@@ -504,7 +504,7 @@ class _RestClient:
     def check_uploaded_file_v2(self, trace_id: str, file_upload_id: str, metadata: FileMetadata) -> bool:
         api_path = self.CHECK_UPLOADED_FILE_URL_FMT_V2.format(file_upload_id)
         response = self._with_unauth_retry(
-            lambda: self._send_post_req(api_path, trace_id, metadata.json(exclude_none=True))
+            lambda: self._send_post_req(api_path, trace_id, metadata.model_dump_json(exclude_none=True))
         )
         return bool(response)
@@ -518,11 +518,11 @@ class _RestClient:
     ) -> SearchTaskResponse:
         api_path = self.INITIAL_SEARCH_WITHOUT_UPLOAD_URI_FMT_V2.format(file_upload_id)
         files = {
-            "metadata": ("metadata.json", metadata.json(exclude_none=True).encode(), "application/json"),
-            "metrics": ("metrics.json", metrics.json(exclude_none=True).encode(), "application/json"),
+            "metadata": ("metadata.json", metadata.model_dump_json(exclude_none=True).encode(), "application/json"),
+            "metrics": ("metrics.json", metrics.model_dump_json(exclude_none=True).encode(), "application/json"),
         }
         if search_customization is not None:
-            files["customization"] = search_customization.json(exclude_none=True).encode()
+            files["customization"] = search_customization.model_dump_json(exclude_none=True).encode()
         additional_headers = {self.SEARCH_KEYS_HEADER_NAME: ",".join(self.search_keys_meaning_types(metadata))}
         response = self._with_unauth_retry(
             lambda: self._send_post_file_req_v2(
@@ -548,19 +548,19 @@ class _RestClient:
                 content = file.read()
                 md5_hash.update(content)
                 digest = md5_hash.hexdigest()
-                metadata_with_md5 = metadata.copy(update={"checksumMD5": digest})
+                metadata_with_md5 = metadata.model_copy(update={"checksumMD5": digest})
             # digest_sha256 = hashlib.sha256(
             #     pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
             # ).hexdigest()
             digest_sha256 = self.compute_file_digest(file_path)
-            metadata_with_md5 = metadata_with_md5.copy(update={"digest": digest_sha256})
+            metadata_with_md5 = metadata_with_md5.model_copy(update={"digest": digest_sha256})
             with open(file_path, "rb") as file:
                 files = {
                     "metadata": (
                         "metadata.json",
-                        metadata_with_md5.json(exclude_none=True).encode(),
+                        metadata_with_md5.model_dump_json(exclude_none=True).encode(),
                         "application/json",
                     ),
                     "tracking": (
@@ -570,7 +570,7 @@ class _RestClient:
                     ),
                     "metrics": (
                         "metrics.json",
-                        metrics.json(exclude_none=True).encode(),
+                        metrics.model_dump_json(exclude_none=True).encode(),
                         "application/json",
                     ),
                     "file": (metadata_with_md5.name, file, "application/octet-stream"),
@@ -578,7 +578,7 @@ class _RestClient:
                 if search_customization is not None:
                     files["customization"] = (
                         "customization.json",
-                        search_customization.json(exclude_none=True).encode(),
+                        search_customization.model_dump_json(exclude_none=True).encode(),
                         "application/json",
                     )
@@ -602,11 +602,11 @@ class _RestClient:
     ) -> SearchTaskResponse:
         api_path = self.VALIDATION_SEARCH_WITHOUT_UPLOAD_URI_FMT_V2.format(file_upload_id, initial_search_task_id)
         files = {
-            "metadata": ("metadata.json", metadata.json(exclude_none=True).encode(), "application/json"),
-            "metrics": ("metrics.json", metrics.json(exclude_none=True).encode(), "application/json"),
+            "metadata": ("metadata.json", metadata.model_dump_json(exclude_none=True).encode(), "application/json"),
+            "metrics": ("metrics.json", metrics.model_dump_json(exclude_none=True).encode(), "application/json"),
         }
         if search_customization is not None:
-            files["customization"] = search_customization.json(exclude_none=True).encode()
+            files["customization"] = search_customization.model_dump_json(exclude_none=True).encode()
         additional_headers = {self.SEARCH_KEYS_HEADER_NAME: ",".join(self.search_keys_meaning_types(metadata))}
         response = self._with_unauth_retry(
             lambda: self._send_post_file_req_v2(
@@ -670,7 +670,7 @@ class _RestClient:
                     "file": (metadata.name, file, "application/octet-stream"),
                     "metadata": (
                         "metadata.json",
-                        metadata.json(exclude_none=True).encode(),
+                        metadata.model_dump_json(exclude_none=True).encode(),
                         "application/json",
                     ),
                 }
@@ -682,12 +682,12 @@ class _RestClient:
     def get_search_file_metadata(self, search_task_id: str, trace_id: str) -> FileMetadata:
         api_path = self.SEARCH_FILE_METADATA_URI_FMT_V2.format(search_task_id)
         response = self._with_unauth_retry(lambda: self._send_get_req(api_path, trace_id))
-        return FileMetadata.parse_obj(response)
+        return FileMetadata.model_validate(response)
     def get_provider_search_metadata_v3(self, provider_search_task_id: str, trace_id: str) -> ProviderTaskMetadataV2:
         api_path = self.SEARCH_TASK_METADATA_FMT_V3.format(provider_search_task_id)
         response = self._with_unauth_retry(lambda: self._send_get_req(api_path, trace_id))
-        return ProviderTaskMetadataV2.parse_obj(response)
+        return ProviderTaskMetadataV2.model_validate(response)
     def get_current_transform_usage(self, trace_id) -> TransformUsage:
         track_metrics = get_track_metrics(self.client_ip, self.client_visitorid)

upgini/mdc/__init__.py CHANGED Viewed

@@ -5,7 +5,7 @@
 import logging
-from pythonjsonlogger import jsonlogger
+from pythonjsonlogger import json as jsonlogger
 from upgini.mdc.context import get_mdc_fields, new_log_context

upgini/metrics.py CHANGED Viewed

@@ -11,13 +11,14 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 import lightgbm as lgb
 import numpy as np
 import pandas as pd
+from catboost import CatBoostClassifier, CatBoostRegressor
 from category_encoders.cat_boost import CatBoostEncoder
 from lightgbm import LGBMClassifier, LGBMRegressor
 from numpy import log1p
 from pandas.api.types import is_numeric_dtype
 from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
-from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
+# from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
 from upgini.utils.features_validator import FeaturesValidator
 from upgini.utils.sklearn_ext import cross_validate
@@ -31,7 +32,7 @@ except ImportError:
     available_scorers = SCORERS
 from sklearn.metrics import mean_squared_error
 from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
-from sklearn.model_selection import BaseCrossValidator, TimeSeriesSplit
+from sklearn.model_selection import BaseCrossValidator  # , TimeSeriesSplit
 from upgini.errors import ValidationError
 from upgini.metadata import ModelTaskType
@@ -328,10 +329,14 @@ class EstimatorWrapper:
     ) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
         self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
         for c in x.columns:
-            if is_numeric_dtype(x[c]):
-                x[c] = x[c].astype(float)
-            elif not x[c].dtype == "category":
-                x[c] = x[c].astype(str)
+            if c not in self.cat_features:
+                if is_numeric_dtype(x[c]):
+                    x[c] = x[c].astype(float)
+                elif not x[c].dtype == "category":
+                    x[c] = x[c].astype(str)
+            else:
+                if x[c].dtype == "category" and x[c].cat.categories.dtype == np.int64:
+                    x[c] = x[c].astype(np.int64)
         if not isinstance(y, pd.Series):
             raise Exception(bundle.get("metrics_unsupported_target_type").format(type(y)))
@@ -411,7 +416,6 @@ class EstimatorWrapper:
                 shaps = self.calculate_shap(cv_x, cv_y, estimator)
                 if shaps is not None:
                     for feature, shap_value in shaps.items():
-                        # shap_values_all_folds[feature] = shap_values_all_folds.get(feature, []) + shap_value.tolist()
                         shap_values_all_folds[feature].append(shap_value)
         if shap_values_all_folds:
@@ -488,20 +492,29 @@ class EstimatorWrapper:
             "logger": logger,
         }
         if estimator is None:
-            params = {"random_state": DEFAULT_RANDOM_STATE, "verbose": -1}
+            params = {"has_time": has_date}
             if target_type == ModelTaskType.MULTICLASS:
-                params = _get_add_params(params, LIGHTGBM_MULTICLASS_PARAMS)
+                params = _get_add_params(params, CATBOOST_MULTICLASS_PARAMS)
                 params = _get_add_params(params, add_params)
-                estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
+                estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
+                # params = _get_add_params(params, LIGHTGBM_MULTICLASS_PARAMS)
+                # params = _get_add_params(params, add_params)
+                # estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
             elif target_type == ModelTaskType.BINARY:
-                params = _get_add_params(params, LIGHTGBM_BINARY_PARAMS)
+                params = _get_add_params(params, CATBOOST_BINARY_PARAMS)
                 params = _get_add_params(params, add_params)
-                estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
+                estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
+                # params = _get_add_params(params, LIGHTGBM_BINARY_PARAMS)
+                # params = _get_add_params(params, add_params)
+                # estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
             elif target_type == ModelTaskType.REGRESSION:
-                if not isinstance(cv, TimeSeriesSplit) and not isinstance(cv, BlockedTimeSeriesSplit):
-                    params = _get_add_params(params, LIGHTGBM_REGRESSION_PARAMS)
+                params = _get_add_params(params, CATBOOST_REGRESSION_PARAMS)
                 params = _get_add_params(params, add_params)
-                estimator = LightGBMWrapper(LGBMRegressor(**params), **kwargs)
+                estimator = CatBoostWrapper(CatBoostRegressor(**params), **kwargs)
+                # if not isinstance(cv, TimeSeriesSplit) and not isinstance(cv, BlockedTimeSeriesSplit):
+                #     params = _get_add_params(params, LIGHTGBM_REGRESSION_PARAMS)
+                # params = _get_add_params(params, add_params)
+                # estimator = LightGBMWrapper(LGBMRegressor(**params), **kwargs)
             else:
                 raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
         else:
@@ -517,8 +530,6 @@ class EstimatorWrapper:
             else:
                 if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
                     estimator = LightGBMWrapper(**kwargs)
-                elif is_catboost_estimator(estimator):
-                    estimator = CatBoostWrapper(**kwargs)
                 else:
                     logger.warning(
                         f"Unexpected estimator is used for metrics: {estimator}. "
@@ -558,6 +569,7 @@ class CatBoostWrapper(EstimatorWrapper):
         self.emb_features = None
         self.grouped_embedding_features = None
         self.drop_cat_features = []
+        self.features_to_encode = []
     def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
         x, y, groups, params = super()._prepare_to_fit(x, y)
@@ -597,7 +609,13 @@ class CatBoostWrapper(EstimatorWrapper):
         self.cat_features, self.features_to_encode, self.exclude_features = _get_cat_features(
             self.logger, x, self.cat_features, self.text_features, self.grouped_embedding_features
         )
-        params["cat_features"] = self.cat_features
+        if self.features_to_encode:
+            for c in self.features_to_encode:
+                if is_numeric_dtype(x[c]):
+                    x[c] = x[c].fillna(np.nan)
+                else:
+                    x[c] = x[c].fillna("NA")
+            params["cat_features"] = self.features_to_encode
         return x, y, groups, params
@@ -626,8 +644,14 @@ class CatBoostWrapper(EstimatorWrapper):
         if self.grouped_embedding_features:
             x, emb_columns = self.group_embeddings(x)
             params["embedding_features"] = emb_columns
-        if self.cat_features:
-            params["cat_features"] = self.cat_features
+        if self.features_to_encode:
+            for c in self.features_to_encode:
+                if is_numeric_dtype(x[c]):
+                    x[c] = x[c].fillna(np.nan)
+                else:
+                    x[c] = x[c].fillna("NA")
+            params["cat_features"] = self.features_to_encode
         return x, y, params
@@ -671,23 +695,29 @@ class CatBoostWrapper(EstimatorWrapper):
                 embedding_features=self.grouped_embedding_features,
             )
-            # Get SHAP values of current estimator
-            shap_values_fold = estimator.get_feature_importance(data=fold_pool, type="ShapValues")
+            shap_values = estimator.get_feature_importance(data=fold_pool, type="ShapValues")
-            # Remove last columns (base value) and flatten
             if self.target_type == ModelTaskType.MULTICLASS:
-                all_shaps = shap_values_fold[:, :, :-1]
-                all_shaps = [all_shaps[:, :, k].flatten() for k in range(all_shaps.shape[2])]
+                # For multiclass, shap_values has shape (n_samples, n_classes, n_features + 1)
+                # Last column is bias term
+                shap_values = shap_values[:, :, :-1]  # Remove bias term
+                # Average SHAP values across classes
+                shap_values = np.mean(np.abs(shap_values), axis=1)
             else:
-                all_shaps = shap_values_fold[:, :-1]
-                all_shaps = [all_shaps[:, k].flatten() for k in range(all_shaps.shape[1])]
+                # For binary/regression, shap_values has shape (n_samples, n_features + 1)
+                # Last column is bias term
+                shap_values = shap_values[:, :-1]  # Remove bias term
+                # Take absolute values
+                shap_values = np.abs(shap_values)
-            all_shaps = np.abs(all_shaps)
+            feature_importance = {}
+            for i, col in enumerate(x.columns):
+                feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
-            return dict(zip(estimator.feature_names_, all_shaps))
+            return feature_importance
-        except Exception:
-            self.logger.exception("Failed to recalculate new SHAP values")
+        except Exception as e:
+            self.logger.exception(f"Failed to recalculate new SHAP values: {str(e)}")
             return None
@@ -830,9 +860,9 @@ class OtherEstimatorWrapper(EstimatorWrapper):
             num_features = [col for col in x.columns if col not in self.cat_features]
             x[num_features] = x[num_features].fillna(-999)
             if self.features_to_encode and self.cat_encoder is not None:
-                x[self.features_to_encode] = self.cat_encoder.transform(x[self.features_to_encode].astype("object")).astype(
-                    "category"
-                )
+                x[self.features_to_encode] = self.cat_encoder.transform(
+                    x[self.features_to_encode].astype("object")
+                ).astype("category")
         return x, y, params
@@ -935,17 +965,17 @@ def _get_cat_features(
     drop_cat_features = []
     for name in cat_features:
         # Remove constant categorical features
-        if x[name].nunique() > 1:
+        if x[name].nunique(dropna=False) > 1:
             unique_cat_features.append(name)
         else:
-            logger.info(f"Drop column {name} on preparing data for fit")
-            x = x.drop(columns=name)
+            logger.warning(f"Drop column {name} on preparing data for fit")
+            x.drop(columns=name, inplace=True)
             drop_cat_features.append(name)
     cat_features = unique_cat_features
     logger.info(f"Selected categorical features: {cat_features}")
-    features_to_encode = list(set(x.select_dtypes(exclude=[np.number, np.datetime64, pd.CategoricalDtype]).columns))
+    features_to_encode = list(set(x.select_dtypes(exclude=[np.number, np.datetime64, pd.CategoricalDtype()]).columns))
     logger.info(f"Features to encode: {features_to_encode}")

{upgini-1.2.81a3832.dev1.dist-info → upgini-1.2.81a3832.dev3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.81a3832.dev1
+Version: 1.2.81a3832.dev3
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
@@ -22,6 +22,7 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Requires-Python: <3.12,>=3.10
+Requires-Dist: catboost>=1.0.3
 Requires-Dist: category-encoders>=2.8.1
 Requires-Dist: fastparquet>=0.8.1
 Requires-Dist: ipywidgets>=8.1.0

{upgini-1.2.81a3832.dev1.dist-info → upgini-1.2.81a3832.dev3.dist-info}/RECORD RENAMED Viewed

@@ -1,12 +1,12 @@
-upgini/__about__.py,sha256=-WSXUS5Ith33qArTnDO4LmrI0wUaXbJ8bIzoMZvAsWU,33
+upgini/__about__.py,sha256=sQSOnYXU8JfHaCG4spEa8dwpUzrTX39X2sSVYCzITIk,33
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=qtrQJwF2QbKdQ8Tqk5RQj3aAqOzDgygD6nIHrco3AzE,209728
-upgini/http.py,sha256=UH7nswcZ221un3O_VW9limCBO5oRsyg1eKUHiVslRPs,43737
+upgini/features_enricher.py,sha256=WiSVfmlHI9oKJQbyf46FH0yY80hBJ6hheFpugw0f_vE,210583
+upgini/http.py,sha256=AfaJ3c8z_tK2hZFEehNybDKE0mp1tYcyAP_l0_p8bLQ,43933
 upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
-upgini/metrics.py,sha256=95sK1Kr3dYxqQcdkkoNFDe9OZY7OhgLjYwe3bhMQd38,38087
+upgini/metrics.py,sha256=fhBhMM455C__1adECAk2H3K-zyO_WUnVqZV_AJ-rQBo,39633
 upgini/search_task.py,sha256=RcvAE785yksWTsTNWuZFVNlk32jHElMoEna1T_C5N8Q,17823
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -32,7 +32,7 @@ upgini/autofe/timeseries/trend.py,sha256=K1_iw2ko_LIUU8YCUgrvN3n0MkHtsi7-63-8x9e
 upgini/autofe/timeseries/volatility.py,sha256=9shUmIKjpWTHVYjj80YBsk0XheBJ9uBuLv5NW9Mchnk,7953
 upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/data_source/data_source_publisher.py,sha256=4S9qwlAklD8vg9tUU_c1pHE2_glUHAh15-wr5hMwKFw,22879
-upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
+upgini/mdc/__init__.py,sha256=iHJlXQg6xRM1-ZOUtaPSJqw5SpQDszvxp4LyqviNLIQ,1027
 upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
 upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.81a3832.dev1.dist-info/METADATA,sha256=ShIRi8EeeujsKBJ0byR2XWJ6DKFka2vrViq9d5VwjzU,49141
-upgini-1.2.81a3832.dev1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-upgini-1.2.81a3832.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.81a3832.dev1.dist-info/RECORD,,
+upgini-1.2.81a3832.dev3.dist-info/METADATA,sha256=rjTrlaR6RTthHUMnhRDn3QFCs9EhW6dDUHukgwnObxI,49172
+upgini-1.2.81a3832.dev3.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.81a3832.dev3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.81a3832.dev3.dist-info/RECORD,,

{upgini-1.2.81a3832.dev1.dist-info → upgini-1.2.81a3832.dev3.dist-info}/WHEEL RENAMED Viewed

File without changes

{upgini-1.2.81a3832.dev1.dist-info → upgini-1.2.81a3832.dev3.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

upgini 1.2.81a3832.dev1__py3-none-any.whl → 1.2.81a3832.dev3__py3-none-any.whl

upgini 1.2.81a3832.dev1__py3-none-any.whl → 1.2.81a3832.dev3__py3-none-any.whl