PyPI - upgini - Versions diffs - 1.2.71a3810.dev3__py3-none-any.whl → 1.2.71a3832.dev4__py3-none-any.whl - Mend

upgini 1.2.71a3810.dev3__py3-none-any.whl → 1.2.71a3832.dev4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (18) hide show

upgini/__about__.py +1 -1
upgini/autofe/timeseries/base.py +2 -2
upgini/autofe/timeseries/cross.py +1 -1
upgini/autofe/unary.py +1 -38
upgini/dataset.py +1 -1
upgini/features_enricher.py +13 -10
upgini/http.py +9 -4
upgini/metrics.py +147 -48
upgini/resource_bundle/strings.properties +1 -0
upgini/utils/deduplicate_utils.py +2 -0
upgini/utils/feature_info.py +2 -1
upgini/utils/sklearn_ext.py +20 -2
upgini/utils/sort.py +1 -1
{upgini-1.2.71a3810.dev3.dist-info → upgini-1.2.71a3832.dev4.dist-info}/METADATA +5 -4
{upgini-1.2.71a3810.dev3.dist-info → upgini-1.2.71a3832.dev4.dist-info}/RECORD +17 -18
{upgini-1.2.71a3810.dev3.dist-info → upgini-1.2.71a3832.dev4.dist-info}/WHEEL +1 -1
upgini/lazy_import.py +0 -35
{upgini-1.2.71a3810.dev3.dist-info → upgini-1.2.71a3832.dev4.dist-info}/licenses/LICENSE +0 -0

upgini/__about__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "1.2.71a3810.dev3"
1	+ __version__ = "1.2.71a3832.dev4"

upgini/autofe/timeseries/base.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import abc
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple
 import pandas as pd
 from upgini.autofe.operator import PandasOperator
@@ -64,7 +64,7 @@ class TimeSeriesBase(PandasOperator, abc.ABC):
         return base_formula
     @classmethod
-    def _parse_offset_from_formula(cls, formula: str, base_regex: str) -> tuple[Optional[dict], Optional[str]]:
+    def _parse_offset_from_formula(cls, formula: str, base_regex: str) -> Tuple[Optional[Dict], Optional[str]]:
         """
         Parse the offset component from a formula.

upgini/autofe/timeseries/cross.py CHANGED Viewed

@@ -93,7 +93,7 @@ class CrossSeriesInteraction(TimeSeriesBase, ParametrizedOperator):
         return cls(**params)
-    def get_params(self) -> Dict[str, str | None]:
+    def get_params(self) -> Dict[str, Optional[str]]:
         res = super().get_params()
         res.update(
             {

upgini/autofe/unary.py CHANGED Viewed

@@ -1,10 +1,8 @@
-import json
-from typing import Dict, List, Optional
+from typing import Dict, Optional
 import numpy as np
 import pandas as pd
 from upgini.autofe.operator import PandasOperator, VectorizableMixin
-from upgini.autofe.utils import pydantic_validator
 class Abs(PandasOperator, VectorizableMixin):
@@ -155,38 +153,3 @@ class Embeddings(PandasOperator):
     is_unary: bool = True
     input_type: Optional[str] = "string"
     output_type: Optional[str] = "vector"
-class Bin(PandasOperator):
-    name: str = "bin"
-    is_unary: bool = True
-    output_type: Optional[str] = "string"
-    bin_bounds: List[int] = []
-    is_categorical: bool = True
-    def calculate_unary(self, data: pd.Series) -> pd.Series:
-        return data.apply(self._bin, bounds=self.bin_bounds).fillna(-1).astype(int).astype(str)
-    def _bin(self, f, bounds):
-        if f is None or np.isnan(f):
-            return np.nan
-        hit = np.where(f >= np.array(bounds))[0]
-        if hit.size > 0:
-            return np.max(hit) + 1
-        else:
-            return np.nan
-    def get_params(self) -> Dict[str, Optional[str]]:
-        res = super().get_params()
-        res.update(
-            {
-                "bin_bounds": json.dumps(self.bin_bounds),
-            }
-        )
-        return res
-    @pydantic_validator("bin_bounds", mode="before")
-    def parse_bin_bounds(cls, value):
-        if isinstance(value, str):
-            return json.loads(value)
-        return value

upgini/dataset.py CHANGED Viewed

@@ -389,7 +389,7 @@ class Dataset:  # (pd.DataFrame):
         for col in columns_to_validate:
             self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
             if validate_target and target is not None and col == target:
-                self.data.loc[self.data[target] == np.Inf, f"{col}_is_valid"] = False
+                self.data.loc[self.data[target] == np.inf, f"{col}_is_valid"] = False
             if col in mandatory_columns:
                 self.data["valid_mandatory"] = self.data["valid_mandatory"] & self.data[f"{col}_is_valid"]

upgini/features_enricher.py CHANGED Viewed

@@ -841,7 +841,7 @@ class FeaturesEnricher(TransformerMixin):
         max_features: Optional[int] = None,
         remove_outliers_calc_metrics: Optional[bool] = None,
         trace_id: Optional[str] = None,
-        silent: bool = False,
+        internal_call: bool = False,
         progress_bar: Optional[ProgressBar] = None,
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
         **kwargs,
@@ -1095,7 +1095,7 @@ class FeaturesEnricher(TransformerMixin):
                         enriched_shaps = enriched_cv_result.shap_values
                         if enriched_shaps is not None:
-                            self._update_shap_values(trace_id, fitting_X, enriched_shaps)
+                            self._update_shap_values(trace_id, fitting_X, enriched_shaps, silent=not internal_call)
                         if enriched_metric is None:
                             self.logger.warning(
@@ -1256,14 +1256,14 @@ class FeaturesEnricher(TransformerMixin):
                     if self.raise_validation_error:
                         raise e
                 else:
-                    if not silent:
+                    if not internal_call:
                         self._dump_python_libs()
                         self.__display_support_link()
                     raise e
             finally:
                 self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
-    def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float]):
+    def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float], silent: bool = False):
         renaming = self.fit_columns_renaming or {}
         new_shaps = {
             renaming.get(feature, feature): _round_shap_value(shap)
@@ -1272,7 +1272,7 @@ class FeaturesEnricher(TransformerMixin):
         }
         self.__prepare_feature_importances(trace_id, df, new_shaps)
-        if self.features_info_display_handle is not None:
+        if not silent and self.features_info_display_handle is not None:
             try:
                 _ = get_ipython()  # type: ignore
@@ -1284,7 +1284,7 @@ class FeaturesEnricher(TransformerMixin):
                 )
             except (ImportError, NameError):
                 pass
-        if self.data_sources_display_handle is not None:
+        if not silent and self.data_sources_display_handle is not None:
             try:
                 _ = get_ipython()  # type: ignore
@@ -1296,7 +1296,7 @@ class FeaturesEnricher(TransformerMixin):
                 )
             except (ImportError, NameError):
                 pass
-        if self.autofe_features_display_handle is not None:
+        if not silent and self.autofe_features_display_handle is not None:
             try:
                 _ = get_ipython()  # type: ignore
                 autofe_descriptions_df = self.get_autofe_features_description()
@@ -1309,7 +1309,7 @@ class FeaturesEnricher(TransformerMixin):
                     )
             except (ImportError, NameError):
                 pass
-        if self.report_button_handle is not None:
+        if not silent and self.report_button_handle is not None:
             try:
                 _ = get_ipython()  # type: ignore
@@ -4084,7 +4084,10 @@ if response.status_code == 200:
             )
         if all(k == SearchKey.CUSTOM_KEY for k in valid_search_keys.values()):
-            msg = self.bundle.get("unregistered_only_personal_keys")
+            if self.__is_registered:
+                msg = self.bundle.get("only_custom_keys")
+            else:
+                msg = self.bundle.get("unregistered_only_personal_keys")
             self.logger.warning(msg + f" Provided search keys: {search_keys}")
             raise ValidationError(msg)
@@ -4135,7 +4138,7 @@ if response.status_code == 200:
             max_features=max_features,
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
             trace_id=trace_id,
-            silent=True,
+            internal_call=True,
             progress_bar=progress_bar,
             progress_callback=progress_callback,
         )

upgini/http.py CHANGED Viewed

@@ -16,6 +16,7 @@ from typing import Any, Dict, List, Optional, Tuple
 from urllib.parse import urljoin
 import jwt
 # import pandas as pd
 import requests
 from pydantic import BaseModel
@@ -342,7 +343,9 @@ class _RestClient:
         else:
             return self._syncronized_refresh_access_token()
-    def _with_unauth_retry(self, request, try_number: int = 0, need_connection_retry: bool = True):
+    def _with_unauth_retry(
+        self, request, try_number: int = 0, need_connection_retry: bool = True, silent: bool = False
+    ):
         try:
             return request()
         except RequestException as e:
@@ -373,8 +376,9 @@ class _RestClient:
             elif "more than one concurrent search request" in e.message.lower():
                 raise ValidationError(bundle.get("concurrent_request"))
             else:
-                print(e)
-                show_status_error()
+                if not silent:
+                    print(e)
+                    show_status_error()
                 raise e
     @staticmethod
@@ -706,6 +710,7 @@ class _RestClient:
                     silent=True,
                 ),
                 need_connection_retry=False,
+                silent=True,
             )
         except Exception:
             self.send_log_event_unauth(log_event)
@@ -716,7 +721,7 @@ class _RestClient:
         try:
             requests.post(
                 url=urljoin(_RestClient.PROD_BACKEND_URL, api_path),
-                json=log_event.dict(exclude_none=True),
+                json=log_event.model_dump(exclude_none=True),
                 headers=_RestClient._get_base_headers(content_type="application/json"),
             )
         except Exception:

upgini/metrics.py CHANGED Viewed

@@ -1,17 +1,18 @@
 from __future__ import annotations
-from dataclasses import dataclass
 import inspect
 import logging
 import re
+import warnings
 from collections import defaultdict
 from copy import deepcopy
+from dataclasses import dataclass
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
-import catboost
 import numpy as np
 import pandas as pd
-from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor, Pool
+from lightgbm import LGBMClassifier, LGBMRegressor
+import lightgbm as lgb
 from numpy import log1p
 from pandas.api.types import is_numeric_dtype
 from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
@@ -27,11 +28,8 @@ except ImportError:
     from sklearn.metrics._scorer import SCORERS
     available_scorers = SCORERS
-from sklearn.metrics._regression import (
-    _check_reg_targets,
-    check_consistent_length,
-)
 from sklearn.metrics import mean_squared_error
+from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
 from sklearn.model_selection import BaseCrossValidator
 from upgini.errors import ValidationError
@@ -88,11 +86,73 @@ CATBOOST_MULTICLASS_PARAMS = {
 LIGHTGBM_PARAMS = {
     "random_state": DEFAULT_RANDOM_STATE,
-    "num_leaves": 16,
+    # "num_leaves": 16,
+    # "n_estimators": 150,
+    # "min_child_weight": 1,
     "max_depth": 4,
-    "n_estimators": 150,
+    "max_cat_threshold": 80,
+    "min_data_per_group": 25,
+    "num_boost_round": 150,
+    "cat_l2": 10,
+    "cat_smooth": 12,
+    "learning_rate": 0.05,
+    "feature_fraction": 1.0,
+    "min_sum_hessian_in_leaf": 0.01,
+}
+LIGHTGBM_REGRESSION_PARAMS = {
+    "random_state": DEFAULT_RANDOM_STATE,
+    "deterministic": True,
+    "min_gain_to_split": 0.001,
+    "n_estimators": 275,
+    "max_depth": 5,
+    "max_cat_threshold": 80,
+    "min_data_per_group": 25,
+    "cat_l2": 10,
+    "cat_smooth": 12,
     "learning_rate": 0.05,
-    "min_child_weight": 1,
+    "feature_fraction": 1.0,
+    "min_sum_hessian_in_leaf": 0.01,
+    "objective": "huber",
+    "verbosity": -1,
+}
+LIGHTGBM_MULTICLASS_PARAMS = {
+    "random_state": DEFAULT_RANDOM_STATE,
+    "deterministic": True,
+    "min_gain_to_split": 0.001,
+    "n_estimators": 275,
+    "max_depth": 3,
+    "max_cat_threshold": 80,
+    "min_data_per_group": 25,
+    "cat_l2": 10,
+    "cat_smooth": 12,
+    "learning_rate": 0.25,  # CatBoost 0.25
+    "min_sum_hessian_in_leaf": 0.01,
+    "class_weight": "balanced",  # TODO pass dict with weights for each class
+    "objective": "multiclass",
+    "use_quantized_grad": "true",
+    "num_grad_quant_bins": "8",
+    "stochastic_rounding": "true",
+    "verbosity": -1,
+}
+LIGHTGBM_BINARY_PARAMS = {
+    "random_state": DEFAULT_RANDOM_STATE,
+    "deterministic": True,
+    "min_gain_to_split": 0.001,
+    "n_estimators": 275,
+    "max_depth": 5,
+    "max_cat_threshold": 80,
+    "min_data_per_group": 25,
+    "cat_l2": 10,
+    "cat_smooth": 12,
+    "learning_rate": 0.05,
+    "feature_fraction": 1.0,
+    "min_sum_hessian_in_leaf": 0.01,
+    "objective": "binary",
+    "class_weight": "balanced",  # TODO pass dict with weights for each class
+    "verbosity": -1,
 }
 N_FOLDS = 5
@@ -211,6 +271,15 @@ SUPPORTED_CATBOOST_METRICS = {
 }
+def is_catboost_estimator(estimator):
+    try:
+        from catboost import CatBoostClassifier, CatBoostRegressor
+        return isinstance(estimator, (CatBoostClassifier, CatBoostRegressor))
+    except ImportError:
+        return False
 @dataclass
 class _CrossValResults:
     metric: Optional[float]
@@ -292,7 +361,7 @@ class EstimatorWrapper:
         self.logger.info(f"After preparing data columns: {x.columns.to_list()}")
         return x, y, groups
-    def _remove_empty_target_rows(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
+    def _remove_empty_target_rows(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray]:
         joined = pd.concat([x, y], axis=1)
         joined = joined[joined[y.name].notna()]
         joined = joined.reset_index(drop=True)
@@ -346,12 +415,15 @@ class EstimatorWrapper:
             for estimator, split in zip(self.cv_estimators, splits):
                 _, validation_idx = split
                 cv_x = x.iloc[validation_idx]
-                cv_y = y[validation_idx]
+                if isinstance(y, pd.Series):
+                    cv_y = y.iloc[validation_idx]
+                else:
+                    cv_y = y[validation_idx]
                 shaps = self.calculate_shap(cv_x, cv_y, estimator)
                 if shaps is not None:
                     for feature, shap_value in shaps.items():
                         # shap_values_all_folds[feature] = shap_values_all_folds.get(feature, []) + shap_value.tolist()
-                        shap_values_all_folds[feature].extend(shap_value.tolist())
+                        shap_values_all_folds[feature].append(shap_value)
         if shap_values_all_folds:
             average_shap_values = {
@@ -427,21 +499,18 @@ class EstimatorWrapper:
         }
         if estimator is None:
             params = {}
-            params["has_time"] = has_date
-            # if metric_name.upper() in SUPPORTED_CATBOOST_METRICS:
-            #     params["eval_metric"] = SUPPORTED_CATBOOST_METRICS[metric_name.upper()]
             if target_type == ModelTaskType.MULTICLASS:
-                params = _get_add_params(params, CATBOOST_MULTICLASS_PARAMS)
+                params = _get_add_params(params, LIGHTGBM_MULTICLASS_PARAMS)
                 params = _get_add_params(params, add_params)
-                estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
+                estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
             elif target_type == ModelTaskType.BINARY:
-                params = _get_add_params(params, CATBOOST_BINARY_PARAMS)
+                params = _get_add_params(params, LIGHTGBM_BINARY_PARAMS)
                 params = _get_add_params(params, add_params)
-                estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
+                estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
             elif target_type == ModelTaskType.REGRESSION:
-                params = _get_add_params(params, CATBOOST_REGRESSION_PARAMS)
+                params = _get_add_params(params, LIGHTGBM_REGRESSION_PARAMS)
                 params = _get_add_params(params, add_params)
-                estimator = CatBoostWrapper(CatBoostRegressor(**params), **kwargs)
+                estimator = LightGBMWrapper(LGBMRegressor(**params), **kwargs)
             else:
                 raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
         else:
@@ -450,31 +519,21 @@ class EstimatorWrapper:
             else:
                 estimator_copy = deepcopy(estimator)
             kwargs["estimator"] = estimator_copy
-            if isinstance(estimator, (CatBoostClassifier, CatBoostRegressor)):
+            if is_catboost_estimator(estimator):
                 if cat_features is not None:
                     for cat_feature in cat_features:
                         if cat_feature not in x.columns:
                             logger.error(
                                 f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
                             )
-                    estimator_copy.set_params(
-                        # cat_features=[x.columns.get_loc(cat_feature) for cat_feature in cat_features]
-                        cat_features=cat_features
-                    )
+                    estimator_copy.set_params(cat_features=cat_features, has_time=has_date)
                 estimator = CatBoostWrapper(**kwargs)
             else:
-                try:
-                    from lightgbm import LGBMClassifier, LGBMRegressor
-                    if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
-                        estimator = LightGBMWrapper(**kwargs)
-                    else:
-                        logger.warning(
-                            f"Unexpected estimator is used for metrics: {estimator}. "
-                            "Default strategy for category features will be used"
-                        )
-                        estimator = OtherEstimatorWrapper(**kwargs)
-                except ModuleNotFoundError:
+                if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
+                    estimator = LightGBMWrapper(**kwargs)
+                elif is_catboost_estimator(estimator):
+                    estimator = CatBoostWrapper(**kwargs)
+                else:
                     logger.warning(
                         f"Unexpected estimator is used for metrics: {estimator}. "
                         "Default strategy for category features will be used"
@@ -487,7 +546,7 @@ class EstimatorWrapper:
 class CatBoostWrapper(EstimatorWrapper):
     def __init__(
         self,
-        estimator: Union[CatBoostClassifier, CatBoostRegressor],
+        estimator,
         scorer: Callable,
         metric_name: str,
         multiplier: int,
@@ -517,6 +576,9 @@ class CatBoostWrapper(EstimatorWrapper):
         x, y, groups, params = super()._prepare_to_fit(x, y)
         # Find embeddings
+        import catboost
+        from catboost import CatBoostClassifier
         if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
             emb_pattern = r"(.+)_emb\d+"
             self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
@@ -637,8 +699,10 @@ class CatBoostWrapper(EstimatorWrapper):
             else:
                 raise e
-    def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator: CatBoost) -> Optional[Dict[str, float]]:
+    def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
         try:
+            from catboost import Pool
             # Create Pool for fold data, if need (for example, when categorical features are present)
             fold_pool = Pool(
                 x,
@@ -695,25 +759,60 @@ class LightGBMWrapper(EstimatorWrapper):
         self.cat_features = None
     def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
-        x, y, groups, params = super()._prepare_to_fit(x, y)
+        x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
+        params["callbacks"] = [lgb.early_stopping(stopping_rounds=20)]
         self.cat_features = _get_cat_features(x)
         x = fill_na_cat_features(x, self.cat_features)
         for feature in self.cat_features:
             x[feature] = x[feature].astype("category").cat.codes
-        if not is_numeric_dtype(y):
-            y = correct_string_target(y)
+        if not is_numeric_dtype(y_numpy):
+            y_numpy = correct_string_target(y_numpy)
-        return x, y, groups, params
+        return x, y_numpy, groups, params
     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
-        x, y, params = super()._prepare_to_calculate(x, y)
+        x, y_numpy, params = super()._prepare_to_calculate(x, y)
         if self.cat_features is not None:
             x = fill_na_cat_features(x, self.cat_features)
             for feature in self.cat_features:
                 x[feature] = x[feature].astype("category").cat.codes
         if not is_numeric_dtype(y):
-            y = correct_string_target(y)
-        return x, y, params
+            y_numpy = correct_string_target(y_numpy)
+        return x, y_numpy, params
+    def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
+        try:
+            # Suppress specific warning from SHAP for LightGBM binary classifier
+            warnings.filterwarnings(
+                "ignore",
+                message=(
+                    "LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray"
+                ),
+            )
+            from shap import TreeExplainer
+            if not isinstance(estimator, (LGBMRegressor, LGBMClassifier)):
+                return None
+            explainer = TreeExplainer(estimator)
+            shap_values = explainer.shap_values(x)
+            # For classification, shap_values is returned as a list for each class
+            # Take values for the positive class
+            if isinstance(shap_values, list):
+                shap_values = shap_values[1]
+            # Calculate mean absolute SHAP value for each feature
+            feature_importance = {}
+            for i, col in enumerate(x.columns):
+                feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
+            return feature_importance
+        except Exception as e:
+            self.logger.warning(f"Failed to calculate SHAP values: {str(e)}")
+            return None
 class OtherEstimatorWrapper(EstimatorWrapper):

upgini/resource_bundle/strings.properties CHANGED Viewed

@@ -80,6 +80,7 @@ email_and_hem_simultanious=EMAIL and HEM search keys cannot be used simultaneous
 postal_code_without_country=COUNTRY search key required if POSTAL_CODE is present
 multiple_search_key=Search key {} passed multiple times
 unregistered_only_personal_keys=Only personal search keys used. Api_key from profile.upgini.com required for EMAIL/HEM, PHONE NUMBER or IPv4/IPv6 search keys\nSee docs https://github.com/upgini/upgini#-open-up-all-capabilities-of-upgini
+only_custom_keys=Only CUSTOM_KEY search keys were provided. At least one of DATE, COUNTRY, POSTAL_CODE, PHONE, EMAIL, HEM, IP should be provided
 search_key_not_found=Column `{}` from search_keys was not found in X dataframe: {}
 numeric_search_key_not_found=Index {} in search_keys is out of bounds for {} columns of X dataframe
 unsupported_search_key_type=Unsupported type of key in search_keys: {}

upgini/utils/deduplicate_utils.py CHANGED Viewed

@@ -74,6 +74,8 @@ def remove_fintech_duplicates(
         # Checking for different dates by the same personal keys
         uniques = grouped_by_personal_cols[date_col].nunique()
         total = len(uniques)
+        if total == 0:
+            return segment_df, None
         diff_dates = len(uniques[uniques > 1])
         if diff_dates / total >= 0.6:
             return segment_df, None

upgini/utils/feature_info.py CHANGED Viewed

@@ -90,7 +90,8 @@ class FeatureInfo:
 def _get_feature_sample(feature_meta: FeaturesMetadataV2, data: Optional[pd.DataFrame]) -> str:
     if data is not None and len(data) > 0 and feature_meta.name in data.columns:
         if len(data) > 3:
-            feature_sample = np.random.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
+            rand = np.random.RandomState(42)
+            feature_sample = rand.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
         else:
             feature_sample = data[feature_meta.name].dropna().unique().tolist()
         if len(feature_sample) > 0 and isinstance(feature_sample[0], float):

upgini/utils/sklearn_ext.py CHANGED Viewed

@@ -9,7 +9,6 @@ from traceback import format_exc
 import numpy as np
 import scipy.sparse as sp
-from catboost import CatBoostClassifier, CatBoostRegressor
 from joblib import Parallel, logger
 from scipy.sparse import issparse
 from sklearn import config_context, get_config
@@ -342,6 +341,22 @@ def cross_validate(
         raise e
+def is_catboost_estimator(estimator):
+    try:
+        from catboost import CatBoostClassifier, CatBoostRegressor
+        return isinstance(estimator, (CatBoostClassifier, CatBoostRegressor))
+    except ImportError:
+        return False
+def is_lightgbm_estimator(estimator):
+    try:
+        from lightgbm import LGBMClassifier, LGBMRegressor
+        return isinstance(estimator, (LGBMClassifier, LGBMRegressor))
+    except ImportError:
+        return False
 def _fit_and_score(
     estimator,
     X,
@@ -497,7 +512,10 @@ def _fit_and_score(
         if y_train is None:
             estimator.fit(X_train, **fit_params)
         else:
-            if isinstance(estimator, (CatBoostClassifier, CatBoostRegressor)):
+            if is_catboost_estimator(estimator):
+                fit_params = fit_params.copy()
+                fit_params["eval_set"] = [(X_test, y_test)]
+            elif is_lightgbm_estimator(estimator):
                 fit_params = fit_params.copy()
                 fit_params["eval_set"] = [(X_test, y_test)]
             estimator.fit(X_train, y_train, **fit_params)

upgini/utils/sort.py CHANGED Viewed

@@ -87,7 +87,7 @@ def get_sort_columns_dict(
     df_with_target = df_with_target.loc[~target.isna()]
     df = df_with_target.iloc[:, :-1]
     target = df_with_target.iloc[:, -1]
-    df = df.fillna(df.mean())
+    df = df.fillna(df.apply(lambda x: int(x.mean()) if pd.api.types.is_integer_dtype(x) else x.mean()))
     omit_nan = False
     hashes = [hash_series(df[col]) for col in columns_for_sort]
     df = np.asarray(df, dtype=np.float32)

{upgini-1.2.71a3810.dev3.dist-info → upgini-1.2.71a3832.dev4.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.71a3810.dev3
+Version: 1.2.71a3832.dev4
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
@@ -22,14 +22,14 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Requires-Python: <3.12,>=3.10
-Requires-Dist: catboost>=1.0.3
 Requires-Dist: fastparquet>=0.8.1
 Requires-Dist: ipywidgets>=8.1.0
 Requires-Dist: jarowinkler>=2.0.0
 Requires-Dist: levenshtein>=0.25.1
-Requires-Dist: numpy<=1.26.4,>=1.19.0
+Requires-Dist: lightgbm>=4.6.0
+Requires-Dist: numpy<3.0.0,>=1.19.0
 Requires-Dist: pandas<3.0.0,>=1.1.0
-Requires-Dist: psutil>=6.0.0
+Requires-Dist: psutil>=5.9.0
 Requires-Dist: pydantic<3.0.0,>1.0.0
 Requires-Dist: pyjwt>=2.8.0
 Requires-Dist: python-bidi==0.4.2
@@ -38,6 +38,7 @@ Requires-Dist: python-json-logger>=3.3.0
 Requires-Dist: requests>=2.8.0
 Requires-Dist: scikit-learn>=1.3.0
 Requires-Dist: scipy>=1.10.0
+Requires-Dist: shap>=0.44.0
 Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
 Description-Content-Type: text/markdown

{upgini-1.2.71a3810.dev3.dist-info → upgini-1.2.71a3832.dev4.dist-info}/RECORD RENAMED Viewed

@@ -1,13 +1,12 @@
-upgini/__about__.py,sha256=HJUMYeAgyipX6d-hLqm0G7l9lH2D5uJGT9KFNz20JM0,33
+upgini/__about__.py,sha256=xZJ4YiYa1ZXgmCQ3SYjASYcXSx3CrMdke97pR0PB16E,33
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
-upgini/dataset.py,sha256=nCPfkQIlAanLgCpcmsDfxFXmg99dRm9m0K_ibdLUr-4,35365
+upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=KqDQ29sU1Aty5Z40DDqO869Y_CClQfmU58nE9rScxRc,204434
-upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
-upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
+upgini/features_enricher.py,sha256=Z6RSjqcqneGwWflsq1Q5rjf83awPNYqKpAgHRh7jils,204680
+upgini/http.py,sha256=RvzcShpDXssLs6ycGN8xilkKi8ZV9XGUrrk8bwdUzbw,43607
 upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
-upgini/metrics.py,sha256=t7uOOnlDYvP6E3DLjPMQcFBjyhJfUQY8aUlx7N0Mh-s,35477
+upgini/metrics.py,sha256=LI0wwTUSnxX62lVSM7J8Pq_RSbruq93QUhbMXilWM30,38301
 upgini/search_task.py,sha256=EuCGp0iCWz2fpuJgN6M47aP_CtIi3Oq9zw78w0mkKiU,17595
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -20,12 +19,12 @@ upgini/autofe/date.py,sha256=C86F7sPiscUGq2a45UtQA9ADWBWg0kt54mePHHzjbLE,10633
 upgini/autofe/feature.py,sha256=y1x3wijhTVBmloayQAHiscqKU9Ll8kLcGm1PdvS357I,14910
 upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
 upgini/autofe/operator.py,sha256=EOffJw6vKXpEh5yymqb1RFNJPxGxmnHdFRo9dB5SCFo,4969
-upgini/autofe/unary.py,sha256=3lvwtWrgIHziypwUTetrUv1iCqwDhabbKH4OySkQDhs,5722
+upgini/autofe/unary.py,sha256=yVgPvtfnPSOhrii0YgezddmgWPwyOBCR0JutaIkdTTc,4658
 upgini/autofe/utils.py,sha256=fK1am2_tQj3fL2vDslblye8lmyfWgGIUOX1beYVBz4k,2420
 upgini/autofe/vector.py,sha256=l0KdKg-txlZxDSE4hPPfCtfGQofYbl7oaABPr830sPI,667
 upgini/autofe/timeseries/__init__.py,sha256=PGwwDAMwvkXl3el12tXVEmZUgDUvlmIPlXtROm6bD18,738
-upgini/autofe/timeseries/base.py,sha256=MYK260n3h9kEbgunbyp0cpR0pgNHml3N2WDLGW5BLDU,3603
-upgini/autofe/timeseries/cross.py,sha256=xpHHVITXYUK20BgEZlqKN1Uy2uxKnHz72gngjt7BxVE,5316
+upgini/autofe/timeseries/base.py,sha256=rWJqRuFAzTZEsUdWG5s1Vhif9zzRRmalASXvarufRxI,3610
+upgini/autofe/timeseries/cross.py,sha256=BTINVwuZSbm_4NKkVm0FGM68SrvZLENZKXN7-UyvhYI,5319
 upgini/autofe/timeseries/delta.py,sha256=h0YhmI1TlPJnjwFpN_GQxLb6r59DQuucnG5tQAXSgjU,3520
 upgini/autofe/timeseries/lag.py,sha256=LfQtg484vuqM0mgY4Wft1swHX_Srq7OKKgZswCXoiXI,1882
 upgini/autofe/timeseries/roll.py,sha256=zADKXU-eYWQnQ5R3am1yEal8uU6Tm0jLAixwPb_aCHg,2794
@@ -39,7 +38,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
 upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
 upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
 upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
-upgini/resource_bundle/strings.properties,sha256=LDT-jtYlrD1IXvWjFSf-dtvapje0qSrqI9W3v7y2zVo,27646
+upgini/resource_bundle/strings.properties,sha256=mwQrerdJj3adzT-fHqvs6Qjf-rqDccsUzELDIXJKAmY,27791
 upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
 upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -53,11 +52,11 @@ upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk
 upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
 upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
 upgini/utils/datetime_utils.py,sha256=_jq-kn_dGNFfs-DGXcWCGzy9bkplfAjrZ8SsmN28zXc,13535
-upgini/utils/deduplicate_utils.py,sha256=SMZx9IKIhWI5HqXepfKiQb3uDJrogQZtG6jcWuMo5Z4,8855
+upgini/utils/deduplicate_utils.py,sha256=AcMLoObMjhOTQ_fMS1LWy0GKp6WXnZ-FNux_8V3nbZU,8914
 upgini/utils/display_utils.py,sha256=DsBjJ8jEYAh8BPgfAbzq5imoGFV6IACP20PQ78BQCX0,11964
 upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
 upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
-upgini/utils/feature_info.py,sha256=m1tQcT3hTChPAiXzpk0WQcEqElj8KgeCifEJFa7-gss,7247
+upgini/utils/feature_info.py,sha256=Q9HN6A-fvfVD-irFWrmOqqZG9RsUSvh5MTY_k0xu-tE,7287
 upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
 upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
 upgini/utils/ip_utils.py,sha256=TSQ_qDsLlVnm09X1HacpabEf_HNqSWpxBF4Sdc2xs08,6580
@@ -65,13 +64,13 @@ upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
 upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
 upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
 upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
-upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
-upgini/utils/sort.py,sha256=H79A17NMoHtLbqLCPFx_MBUloLZcDKjOba_H4gCE3t8,6965
+upgini/utils/sklearn_ext.py,sha256=HpaNQaKJisgNE7IZ71n7uswxTj7kbPglU2G3s1sORAc,45042
+upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
 upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,16579
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.71a3810.dev3.dist-info/METADATA,sha256=tZWeZpg4Bh8rhogD7KDK-Sq7oFXDHzH0ljKi3Q1Z6AQ,49075
-upgini-1.2.71a3810.dev3.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
-upgini-1.2.71a3810.dev3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.71a3810.dev3.dist-info/RECORD,,
+upgini-1.2.71a3832.dev4.dist-info/METADATA,sha256=XWxCzwoYpOeebCAtVb_H4-x-9VeHLDwYc7DkputGaAc,49101
+upgini-1.2.71a3832.dev4.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.71a3832.dev4.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.71a3832.dev4.dist-info/RECORD,,

{upgini-1.2.71a3810.dev3.dist-info → upgini-1.2.71a3832.dev4.dist-info}/WHEEL RENAMED Viewed

@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: hatchling 1.24.2
+Generator: hatchling 1.25.0
 Root-Is-Purelib: true
 Tag: py3-none-any

upgini/lazy_import.py DELETED Viewed

@@ -1,35 +0,0 @@
-import importlib
-import importlib.util
-import importlib.machinery
-class LazyImport:
-    def __init__(self, module_name, class_name):
-        self.module_name = module_name
-        self.class_name = class_name
-        self._module = None
-        self._class = None
-    def _load(self):
-        if self._module is None:
-            # Load module and save link to it
-            spec = importlib.util.find_spec(self.module_name)
-            if spec is None:
-                raise ImportError(f"Module {self.module_name} not found")
-            # Create module
-            self._module = importlib.util.module_from_spec(spec)
-            # Execute module
-            spec.loader.exec_module(self._module)
-            # Get class from module
-            self._class = getattr(self._module, self.class_name)
-    def __call__(self, *args, **kwargs):
-        self._load()
-        return self._class(*args, **kwargs)
-    def __getattr__(self, name):
-        self._load()
-        return getattr(self._class, name)

{upgini-1.2.71a3810.dev3.dist-info → upgini-1.2.71a3832.dev4.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

upgini 1.2.71a3810.dev3__py3-none-any.whl → 1.2.71a3832.dev4__py3-none-any.whl

Potentially problematic release.

upgini 1.2.71a3810.dev3__py3-none-any.whl → 1.2.71a3832.dev4__py3-none-any.whl