upgini 1.2.71a3810.dev4__py3-none-any.whl → 1.2.71a3810.dev6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of upgini has been flagged as potentially problematic.
- upgini/__about__.py +1 -1
- upgini/autofe/feature.py +9 -7
- upgini/autofe/timeseries/base.py +2 -2
- upgini/autofe/timeseries/cross.py +1 -1
- upgini/dataset.py +1 -1
- upgini/features_enricher.py +86 -27
- upgini/http.py +9 -4
- upgini/metrics.py +178 -54
- upgini/resource_bundle/strings.properties +1 -0
- upgini/utils/deduplicate_utils.py +2 -0
- upgini/utils/feature_info.py +2 -1
- upgini/utils/sklearn_ext.py +20 -2
- upgini/utils/sort.py +1 -1
- upgini/utils/target_utils.py +4 -2
- {upgini-1.2.71a3810.dev4.dist-info → upgini-1.2.71a3810.dev6.dist-info}/METADATA +5 -4
- {upgini-1.2.71a3810.dev4.dist-info → upgini-1.2.71a3810.dev6.dist-info}/RECORD +18 -19
- upgini/lazy_import.py +0 -35
- {upgini-1.2.71a3810.dev4.dist-info → upgini-1.2.71a3810.dev6.dist-info}/WHEEL +0 -0
- {upgini-1.2.71a3810.dev4.dist-info → upgini-1.2.71a3810.dev6.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
-__version__ = "1.2.71a3810.dev4"
+__version__ = "1.2.71a3810.dev6"
upgini/autofe/feature.py
CHANGED
@@ -162,16 +162,18 @@ class Feature:
             return self.cached_display_name

         should_stack_op = not isinstance(self.children[0], Column) if self.op.is_unary else False
-
+        components = []

         if self.alias:
-            components
-        elif
-            components
-
-            components = (
-                ["f_" + "_f_".join(self.get_columns(**kwargs))] + ["autofe"] + prev_name + [self.get_op_display_name()]
+            components.extend(["f_autofe", self.alias])
+        elif should_stack_op:
+            components.extend(
+                [self.children[0].get_display_name(cache=cache, shorten=shorten, **kwargs), self.get_op_display_name()]
             )
+        elif shorten:
+            components.extend(["f_autofe", self.get_op_display_name()])
+        else:
+            components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + ["autofe", self.get_op_display_name()]
         components.extend([str(self.display_index)] if self.display_index is not None else [])
         display_name = "_".join(components)

(Several removed lines are truncated in this diff view and are left as-is above.)
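Note: the rewritten branch builds the display name incrementally, adding an explicit `elif shorten` case and stacking the child's display name for unary operators. A minimal sketch of the naming scheme implied by the new branches (the Feature class and its should_stack_op logic are simplified away; names are illustrative):

    def display_name(columns, op_name, alias=None, shorten=False, display_index=None):
        # Mirrors the diff's branch order: alias wins, then the shortened
        # "f_autofe" form, then the fully spelled-out column list.
        if alias:
            components = ["f_autofe", alias]
        elif shorten:
            components = ["f_autofe", op_name]
        else:
            components = ["f_" + "_f_".join(columns), "autofe", op_name]
        if display_index is not None:
            components.append(str(display_index))
        return "_".join(components)

    display_name(["age", "income"], "div")                # 'f_age_f_income_autofe_div'
    display_name(["age", "income"], "div", shorten=True)  # 'f_autofe_div'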
upgini/autofe/timeseries/base.py
CHANGED
@@ -1,5 +1,5 @@
 import abc
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple

 import pandas as pd
 from upgini.autofe.operator import PandasOperator
@@ -64,7 +64,7 @@ class TimeSeriesBase(PandasOperator, abc.ABC):
         return base_formula

     @classmethod
-    def _parse_offset_from_formula(cls, formula: str, base_regex: str) ->
+    def _parse_offset_from_formula(cls, formula: str, base_regex: str) -> Tuple[Optional[Dict], Optional[str]]:
        """
        Parse the offset component from a formula.
upgini/dataset.py
CHANGED
@@ -389,7 +389,7 @@ class Dataset:  # (pd.DataFrame):
         for col in columns_to_validate:
             self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
             if validate_target and target is not None and col == target:
-                self.data.loc[self.data[target] == np.
+                self.data.loc[self.data[target] == np.inf, f"{col}_is_valid"] = False

             if col in mandatory_columns:
                 self.data["valid_mandatory"] = self.data["valid_mandatory"] & self.data[f"{col}_is_valid"]
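Note: the target-validation line now compares against np.inf, the lowercase spelling that NumPy 2.x still accepts (the removed spelling is truncated in this diff view). A minimal sketch of the validity flag, with illustrative column names:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"target": [1.0, np.inf, 3.0]})
    df["target_is_valid"] = ~df["target"].isnull()
    df.loc[df["target"] == np.inf, "target_is_valid"] = False
    print(df["target_is_valid"].tolist())  # [True, False, True]

Note that `== np.inf` only catches positive infinity; `np.isinf` would flag -inf as well.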
upgini/features_enricher.py
CHANGED
@@ -12,6 +12,7 @@ import tempfile
 import time
 import uuid
 from collections import Counter
+from copy import deepcopy
 from dataclasses import dataclass
 from threading import Thread
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
@@ -841,7 +842,7 @@ class FeaturesEnricher(TransformerMixin):
         max_features: Optional[int] = None,
         remove_outliers_calc_metrics: Optional[bool] = None,
         trace_id: Optional[str] = None,
-
+        internal_call: bool = False,
         progress_bar: Optional[ProgressBar] = None,
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
         **kwargs,
@@ -1095,7 +1096,7 @@ class FeaturesEnricher(TransformerMixin):
                 enriched_shaps = enriched_cv_result.shap_values

                 if enriched_shaps is not None:
-                    self._update_shap_values(trace_id, fitting_X, enriched_shaps)
+                    self._update_shap_values(trace_id, fitting_X, enriched_shaps, silent=not internal_call)

                 if enriched_metric is None:
                     self.logger.warning(
@@ -1256,14 +1257,14 @@ class FeaturesEnricher(TransformerMixin):
                 if self.raise_validation_error:
                     raise e
                 else:
-                    if not
+                    if not internal_call:
                         self._dump_python_libs()
                         self.__display_support_link()
                     raise e
         finally:
             self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")

-    def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float]):
+    def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float], silent: bool = False):
         renaming = self.fit_columns_renaming or {}
         new_shaps = {
             renaming.get(feature, feature): _round_shap_value(shap)
@@ -1272,7 +1273,7 @@ class FeaturesEnricher(TransformerMixin):
         }
         self.__prepare_feature_importances(trace_id, df, new_shaps)

-        if self.features_info_display_handle is not None:
+        if not silent and self.features_info_display_handle is not None:
             try:
                 _ = get_ipython()  # type: ignore

@@ -1284,7 +1285,7 @@ class FeaturesEnricher(TransformerMixin):
                 )
             except (ImportError, NameError):
                 pass
-        if self.data_sources_display_handle is not None:
+        if not silent and self.data_sources_display_handle is not None:
             try:
                 _ = get_ipython()  # type: ignore

@@ -1296,7 +1297,7 @@ class FeaturesEnricher(TransformerMixin):
                 )
             except (ImportError, NameError):
                 pass
-        if self.autofe_features_display_handle is not None:
+        if not silent and self.autofe_features_display_handle is not None:
             try:
                 _ = get_ipython()  # type: ignore
                 autofe_descriptions_df = self.get_autofe_features_description()
@@ -1309,7 +1310,7 @@ class FeaturesEnricher(TransformerMixin):
                 )
             except (ImportError, NameError):
                 pass
-        if self.report_button_handle is not None:
+        if not silent and self.report_button_handle is not None:
             try:
                 _ = get_ipython()  # type: ignore

@@ -1512,8 +1513,7 @@ class FeaturesEnricher(TransformerMixin):
         self.logger.info(f"Client features column on prepare data for metrics: {client_features}")

         filtered_enriched_features = self.__filtered_enriched_features(
-            importance_threshold,
-            max_features,
+            importance_threshold, max_features, trace_id, validated_X
         )
         filtered_enriched_features = [c for c in filtered_enriched_features if c not in client_features]

@@ -2541,7 +2541,9 @@ if response.status_code == 200:
             for c in itertools.chain(validated_Xy.columns.tolist(), generated_features)
             if c not in self.dropped_client_feature_names_
         ]
-        filtered_columns = self.__filtered_enriched_features(
+        filtered_columns = self.__filtered_enriched_features(
+            importance_threshold, max_features, trace_id, validated_X
+        )
         selecting_columns.extend(
             c for c in filtered_columns if c in result.columns and c not in validated_X.columns
         )
@@ -3248,8 +3250,7 @@ if response.status_code == 200:
     def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
         if len(eval_pair) != 2:
             raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
-        eval_X = eval_pair
-        eval_y = eval_pair[1]
+        eval_X, eval_y = eval_pair

         if _num_samples(eval_X) == 0:
             raise ValidationError(self.bundle.get("eval_x_is_empty"))
@@ -3805,6 +3806,47 @@ if response.status_code == 200:

         return result_features

+    def __get_features_importance_from_server(self, trace_id: str, df: pd.DataFrame):
+        if self._search_task is None:
+            raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
+        features_meta = self._search_task.get_all_features_metadata_v2()
+        if features_meta is None:
+            raise Exception(self.bundle.get("missing_features_meta"))
+        features_meta = deepcopy(features_meta)
+
+        original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
+        df = df.rename(columns=original_names_dict)
+
+        features_meta.sort(key=lambda m: (-m.shap_value, m.name))
+
+        importances = {}
+
+        for feature_meta in features_meta:
+            if feature_meta.name in original_names_dict.keys():
+                feature_meta.name = original_names_dict[feature_meta.name]
+
+            is_client_feature = feature_meta.name in df.columns
+
+            if feature_meta.shap_value == 0.0:
+                continue
+
+            # Use only important features
+            if (
+                feature_meta.name == COUNTRY
+                # In select_features mode we select also from etalon features and need to show them
+                or (not self.fit_select_features and is_client_feature)
+            ):
+                continue
+
+            # Temporary workaround for duplicate features metadata
+            if feature_meta.name in importances:
+                self.logger.warning(f"WARNING: Duplicate feature metadata: {feature_meta}")
+                continue
+
+            importances[feature_meta.name] = feature_meta.shap_value
+
+        return importances
+
     def __prepare_feature_importances(
         self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
     ):
@@ -3813,6 +3855,7 @@ if response.status_code == 200:
         features_meta = self._search_task.get_all_features_metadata_v2()
         if features_meta is None:
             raise Exception(self.bundle.get("missing_features_meta"))
+        features_meta = deepcopy(features_meta)

         original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
         features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
@@ -3828,15 +3871,23 @@ if response.status_code == 200:

         original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}

-        if updated_shaps is not None:
-            for fm in features_meta:
-                fm.shap_value = updated_shaps.get(fm.name, 0.0)
-
-        features_meta.sort(key=lambda m: (-m.shap_value, m.name))
         for feature_meta in features_meta:
             if feature_meta.name in original_names_dict.keys():
                 feature_meta.name = original_names_dict[feature_meta.name]

+            if updated_shaps is not None:
+                updating_shap = updated_shaps.get(feature_meta.name)
+                if updating_shap is None:
+                    self.logger.warning(
+                        f"WARNING: Shap value for feature {feature_meta.name} not found and will be set to 0.0"
+                    )
+                    updating_shap = 0.0
+                feature_meta.shap_value = updating_shap
+
+        features_meta.sort(key=lambda m: (-m.shap_value, m.name))
+
+        for feature_meta in features_meta:
+
             is_client_feature = feature_meta.name in df.columns

             # TODO make a decision about selected features based on special flag from mlb
@@ -3848,7 +3899,7 @@ if response.status_code == 200:
             # Use only important features
             if (
                 # feature_meta.name in self.fit_generated_features or
-                feature_meta.name == COUNTRY
+                feature_meta.name == COUNTRY  # constant synthetic column
                 # In select_features mode we select also from etalon features and need to show them
                 or (not self.fit_select_features and is_client_feature)
             ):
@@ -3990,16 +4041,19 @@ if response.status_code == 200:
         )

     def __filtered_importance_names(
-        self, importance_threshold: Optional[float], max_features: Optional[int]
+        self, importance_threshold: Optional[float], max_features: Optional[int], trace_id: str, df: pd.DataFrame
     ) -> List[str]:
-
-
+        # get features importance from server
+        filtered_importances = self.__get_features_importance_from_server(trace_id, df)

-        filtered_importances
+        if len(filtered_importances) == 0:
+            return []

         if importance_threshold is not None:
             filtered_importances = [
-                (name, importance)
+                (name, importance)
+                for name, importance in filtered_importances.items()
+                if importance > importance_threshold
             ]
         if max_features is not None:
             filtered_importances = list(filtered_importances)[:max_features]
@@ -4084,7 +4138,10 @@ if response.status_code == 200:
         )

         if all(k == SearchKey.CUSTOM_KEY for k in valid_search_keys.values()):
-
+            if self.__is_registered:
+                msg = self.bundle.get("only_custom_keys")
+            else:
+                msg = self.bundle.get("unregistered_only_personal_keys")
             self.logger.warning(msg + f" Provided search keys: {search_keys}")
             raise ValidationError(msg)

@@ -4135,7 +4192,7 @@ if response.status_code == 200:
             max_features=max_features,
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
             trace_id=trace_id,
-
+            internal_call=True,
             progress_bar=progress_bar,
             progress_callback=progress_callback,
         )
@@ -4209,11 +4266,13 @@ if response.status_code == 200:
         self,
         importance_threshold: Optional[float],
         max_features: Optional[int],
+        trace_id: str,
+        df: pd.DataFrame,
     ) -> List[str]:
         importance_threshold = self.__validate_importance_threshold(importance_threshold)
         max_features = self.__validate_max_features(max_features)

-        return self.__filtered_importance_names(importance_threshold, max_features)
+        return self.__filtered_importance_names(importance_threshold, max_features, trace_id, df)

     def __detect_missing_search_keys(
         self,

(Several removed lines are truncated in this diff view and are left as-is above.)
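Note: the thread running through this file is that trace_id and the validated dataframe are now passed into __filtered_enriched_features, so importances are re-fetched from the server (__get_features_importance_from_server) instead of read from a cached list, while the new internal_call/silent flags keep recursive metric calls from re-rendering the notebook display handles. A minimal sketch of the threshold/max_features filtering applied to the server response, assuming it arrives as a name-to-SHAP dict already sorted by importance:

    def filter_importances(importances: dict, importance_threshold=None, max_features=None) -> list:
        items = list(importances.items())
        if importance_threshold is not None:
            items = [(name, imp) for name, imp in items if imp > importance_threshold]
        if max_features is not None:
            items = items[:max_features]
        return [name for name, _ in items]

    filter_importances({"a": 0.5, "b": 0.05, "c": 0.004}, importance_threshold=0.01, max_features=1)
    # ['a']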
upgini/http.py
CHANGED
@@ -16,6 +16,7 @@ from typing import Any, Dict, List, Optional, Tuple
 from urllib.parse import urljoin

 import jwt
+
 # import pandas as pd
 import requests
 from pydantic import BaseModel
@@ -342,7 +343,9 @@ class _RestClient:
         else:
             return self._syncronized_refresh_access_token()

-    def _with_unauth_retry(
+    def _with_unauth_retry(
+        self, request, try_number: int = 0, need_connection_retry: bool = True, silent: bool = False
+    ):
         try:
             return request()
         except RequestException as e:
@@ -373,8 +376,9 @@ class _RestClient:
             elif "more than one concurrent search request" in e.message.lower():
                 raise ValidationError(bundle.get("concurrent_request"))
             else:
-
-
+                if not silent:
+                    print(e)
+                    show_status_error()
                 raise e

     @staticmethod
@@ -706,6 +710,7 @@ class _RestClient:
                     silent=True,
                 ),
                 need_connection_retry=False,
+                silent=True,
             )
         except Exception:
             self.send_log_event_unauth(log_event)
@@ -716,7 +721,7 @@ class _RestClient:
         try:
             requests.post(
                 url=urljoin(_RestClient.PROD_BACKEND_URL, api_path),
-                json=log_event.
+                json=log_event.model_dump(exclude_none=True),
                 headers=_RestClient._get_base_headers(content_type="application/json"),
             )
         except Exception:

(Several removed lines are truncated in this diff view and are left as-is above.)
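Note: json=log_event.model_dump(exclude_none=True) is the pydantic v2 spelling; the removed call is truncated above, but v1 exposed the same serialization as .dict(). A minimal sketch of the v2 API, with an illustrative model:

    from typing import Optional

    from pydantic import BaseModel

    class LogEvent(BaseModel):
        source: str
        tag: Optional[str] = None

    event = LogEvent(source="python")
    print(event.model_dump(exclude_none=True))  # {'source': 'python'}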
upgini/metrics.py
CHANGED
@@ -1,20 +1,21 @@
 from __future__ import annotations

-from dataclasses import dataclass
 import inspect
 import logging
 import re
 from collections import defaultdict
 from copy import deepcopy
+from dataclasses import dataclass
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union

-import
+import lightgbm as lgb
 import numpy as np
 import pandas as pd
-from
+from lightgbm import LGBMClassifier, LGBMRegressor
 from numpy import log1p
 from pandas.api.types import is_numeric_dtype
 from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
+from sklearn.preprocessing import OrdinalEncoder

 from upgini.utils.features_validator import FeaturesValidator
 from upgini.utils.sklearn_ext import cross_validate
@@ -27,11 +28,8 @@ except ImportError:
     from sklearn.metrics._scorer import SCORERS

     available_scorers = SCORERS
-from sklearn.metrics._regression import (
-    _check_reg_targets,
-    check_consistent_length,
-)
 from sklearn.metrics import mean_squared_error
+from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
 from sklearn.model_selection import BaseCrossValidator

 from upgini.errors import ValidationError
@@ -88,13 +86,73 @@ CATBOOST_MULTICLASS_PARAMS = {

 LIGHTGBM_PARAMS = {
     "random_state": DEFAULT_RANDOM_STATE,
-    "num_leaves": 16,
+    # "num_leaves": 16,
+    # "n_estimators": 150,
+    # "min_child_weight": 1,
     "max_depth": 4,
-    "
+    "max_cat_threshold": 80,
+    "min_data_per_group": 25,
+    "num_boost_round": 150,
+    "cat_l2": 10,
+    "cat_smooth": 12,
+    "learning_rate": 0.05,
+    "feature_fraction": 1.0,
+    "min_sum_hessian_in_leaf": 0.01,
+}
+
+LIGHTGBM_REGRESSION_PARAMS = {
+    "random_state": DEFAULT_RANDOM_STATE,
+    "deterministic": True,
+    "min_gain_to_split": 0.001,
+    "n_estimators": 275,
+    "max_depth": 5,
+    "max_cat_threshold": 80,
+    "min_data_per_group": 25,
+    "cat_l2": 10,
+    "cat_smooth": 12,
+    "learning_rate": 0.05,
+    "feature_fraction": 1.0,
+    "min_sum_hessian_in_leaf": 0.01,
+    "objective": "huber",
+    "verbosity": -1,
+}
+
+LIGHTGBM_MULTICLASS_PARAMS = {
+    "random_state": DEFAULT_RANDOM_STATE,
+    "n_estimators": 275,
+    "max_depth": 5,
     "learning_rate": 0.05,
-    "
+    "min_gain_to_split": 0.001,
+    "max_cat_threshold": 80,
+    "min_data_per_group": 20,
+    "cat_smooth": 18,
+    "cat_l2": 8,
+    "objective": "multiclass",
+    "class_weight": "balanced",
+    "use_quantized_grad": "true",
+    "num_grad_quant_bins": "8",
+    "stochastic_rounding": "true",
+    "verbosity": -1,
 }

+LIGHTGBM_BINARY_PARAMS = {
+    "random_state": DEFAULT_RANDOM_STATE,
+    "min_gain_to_split": 0.001,
+    "n_estimators": 275,
+    "max_depth": 5,
+    "learning_rate": 0.05,
+    "objective": "binary",
+    "class_weight": "balanced",
+    "deterministic": True,
+    "max_cat_threshold": 80,
+    "min_data_per_group": 20,
+    "cat_smooth": 18,
+    "cat_l2": 8,
+    "verbosity": -1,
+}
+
+LIGHTGBM_EARLY_STOPPING_ROUNDS = 20
+
 N_FOLDS = 5
 BLOCKED_TS_TEST_SIZE = 0.2
@@ -211,6 +269,15 @@ SUPPORTED_CATBOOST_METRICS = {
 }


+def is_catboost_estimator(estimator):
+    try:
+        from catboost import CatBoostClassifier, CatBoostRegressor
+
+        return isinstance(estimator, (CatBoostClassifier, CatBoostRegressor))
+    except ImportError:
+        return False
+
+
 @dataclass
 class _CrossValResults:
     metric: Optional[float]
@@ -274,7 +341,7 @@ class EstimatorWrapper:
         for c in x.columns:
             if is_numeric_dtype(x[c]):
                 x[c] = x[c].astype(float)
-
+            elif not x[c].dtype == "category":
                 x[c] = x[c].astype(str)

         if not isinstance(y, pd.Series):
@@ -292,7 +359,7 @@ class EstimatorWrapper:
         self.logger.info(f"After preparing data columns: {x.columns.to_list()}")
         return x, y, groups

-    def _remove_empty_target_rows(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame,
+    def _remove_empty_target_rows(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray]:
         joined = pd.concat([x, y], axis=1)
         joined = joined[joined[y.name].notna()]
         joined = joined.reset_index(drop=True)
@@ -346,12 +413,15 @@ class EstimatorWrapper:
         for estimator, split in zip(self.cv_estimators, splits):
             _, validation_idx = split
             cv_x = x.iloc[validation_idx]
-
+            if isinstance(y, pd.Series):
+                cv_y = y.iloc[validation_idx]
+            else:
+                cv_y = y[validation_idx]
             shaps = self.calculate_shap(cv_x, cv_y, estimator)
             if shaps is not None:
                 for feature, shap_value in shaps.items():
                     # shap_values_all_folds[feature] = shap_values_all_folds.get(feature, []) + shap_value.tolist()
-                    shap_values_all_folds[feature].
+                    shap_values_all_folds[feature].append(shap_value)

         if shap_values_all_folds:
             average_shap_values = {
@@ -427,21 +497,18 @@ class EstimatorWrapper:
         }
         if estimator is None:
             params = {}
-            params["has_time"] = has_date
-            # if metric_name.upper() in SUPPORTED_CATBOOST_METRICS:
-            #     params["eval_metric"] = SUPPORTED_CATBOOST_METRICS[metric_name.upper()]
             if target_type == ModelTaskType.MULTICLASS:
-                params = _get_add_params(params,
+                params = _get_add_params(params, LIGHTGBM_MULTICLASS_PARAMS)
                 params = _get_add_params(params, add_params)
-                estimator =
+                estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
             elif target_type == ModelTaskType.BINARY:
-                params = _get_add_params(params,
+                params = _get_add_params(params, LIGHTGBM_BINARY_PARAMS)
                 params = _get_add_params(params, add_params)
-                estimator =
+                estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
             elif target_type == ModelTaskType.REGRESSION:
-                params = _get_add_params(params,
+                params = _get_add_params(params, LIGHTGBM_REGRESSION_PARAMS)
                 params = _get_add_params(params, add_params)
-                estimator =
+                estimator = LightGBMWrapper(LGBMRegressor(**params), **kwargs)
             else:
                 raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
         else:
@@ -450,31 +517,21 @@ class EstimatorWrapper:
             else:
                 estimator_copy = deepcopy(estimator)
             kwargs["estimator"] = estimator_copy
-            if
+            if is_catboost_estimator(estimator):
                 if cat_features is not None:
                     for cat_feature in cat_features:
                         if cat_feature not in x.columns:
                             logger.error(
                                 f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
                             )
-                    estimator_copy.set_params(
-                        # cat_features=[x.columns.get_loc(cat_feature) for cat_feature in cat_features]
-                        cat_features=cat_features
-                    )
+                    estimator_copy.set_params(cat_features=cat_features, has_time=has_date)
                 estimator = CatBoostWrapper(**kwargs)
             else:
-
-
-
-
-
-                else:
-                    logger.warning(
-                        f"Unexpected estimator is used for metrics: {estimator}. "
-                        "Default strategy for category features will be used"
-                    )
-                    estimator = OtherEstimatorWrapper(**kwargs)
-        except ModuleNotFoundError:
+                if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
+                    estimator = LightGBMWrapper(**kwargs)
+                elif is_catboost_estimator(estimator):
+                    estimator = CatBoostWrapper(**kwargs)
+                else:
                     logger.warning(
                         f"Unexpected estimator is used for metrics: {estimator}. "
                         "Default strategy for category features will be used"
@@ -487,7 +544,7 @@ class EstimatorWrapper:
 class CatBoostWrapper(EstimatorWrapper):
     def __init__(
         self,
-        estimator
+        estimator,
         scorer: Callable,
         metric_name: str,
         multiplier: int,
@@ -517,6 +574,9 @@ class CatBoostWrapper(EstimatorWrapper):
         x, y, groups, params = super()._prepare_to_fit(x, y)

         # Find embeddings
+        import catboost
+        from catboost import CatBoostClassifier
+
         if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
             emb_pattern = r"(.+)_emb\d+"
             self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
@@ -637,8 +697,10 @@ class CatBoostWrapper(EstimatorWrapper):
             else:
                 raise e

-    def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator
+    def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
         try:
+            from catboost import Pool
+
             # Create Pool for fold data, if need (for example, when categorical features are present)
             fold_pool = Pool(
                 x,
@@ -693,27 +755,89 @@ class LightGBMWrapper(EstimatorWrapper):
             logger=logger,
         )
         self.cat_features = None
+        self.cat_encoder = None
+        self.n_classes = None

     def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
-        x,
+        x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
+        if self.target_type in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]:
+            self.n_classes = len(np.unique(y_numpy))
+        if LIGHTGBM_EARLY_STOPPING_ROUNDS is not None:
+            params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
         self.cat_features = _get_cat_features(x)
-
-
-
-        if
-
+        print("prepare to fit")
+        print(x.dtypes.to_dict())
+        print(self.cat_features)
+        if self.cat_features:
+            x = fill_na_cat_features(x, self.cat_features)
+            encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
+            encoded = pd.DataFrame(
+                encoder.fit_transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
+            )
+            x[self.cat_features] = encoded
+            self.cat_encoder = encoder
+        if not is_numeric_dtype(y_numpy):
+            y_numpy = correct_string_target(y_numpy)

-        return x,
+        return x, y_numpy, groups, params

     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
-        x,
+        x, y_numpy, params = super()._prepare_to_calculate(x, y)
+        print("prepare to calculate")
+        print(x.dtypes.to_dict())
+        print(self.cat_features)
         if self.cat_features is not None:
             x = fill_na_cat_features(x, self.cat_features)
-
-            x[
+            if self.cat_encoder is not None:
+                x[self.cat_features] = pd.DataFrame(
+                    self.cat_encoder.transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
+                )
         if not is_numeric_dtype(y):
-
-        return x,
+            y_numpy = correct_string_target(y_numpy)
+        return x, y_numpy, params
+
+    def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
+        try:
+            shap_matrix = estimator.predict(
+                x,
+                predict_disable_shape_check=True,
+                raw_score=True,
+                pred_leaf=False,
+                pred_early_stop=True,
+                pred_contrib=True,
+            )
+
+            if self.target_type == ModelTaskType.MULTICLASS:
+                n_feat = x.shape[1]
+                shap_matrix.shape = (shap_matrix.shape[0], self.n_classes, n_feat + 1)
+                shap_matrix = np.mean(np.abs(shap_matrix), axis=1)
+
+            # exclude base value
+            shap_matrix = shap_matrix[:, :-1]
+
+            feature_importance = {}
+            for i, col in enumerate(x.columns):
+                feature_importance[col] = np.mean(np.abs(shap_matrix[:, i]))
+
+            # # exclude last column (base value)
+            # shap_values_only = shap_values[:, :-1]
+            # mean_abs_shap = np.mean(np.abs(shap_values_only), axis=0)
+
+            # # For classification, shap_values is returned as a list for each class
+            # # Take values for the positive class
+            # if isinstance(shap_values, list):
+            #     shap_values = shap_values[1]
+
+            # # Calculate mean absolute SHAP value for each feature
+            # feature_importance = {}
+            # for i, col in enumerate(x.columns):
+            #     feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
+
+            return feature_importance
+
+        except Exception as e:
+            self.logger.warning(f"Failed to calculate SHAP values: {str(e)}")
+            return None


 class OtherEstimatorWrapper(EstimatorWrapper):

(Several removed lines are truncated in this diff view and are left as-is above.)
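Note: this file carries the core of the release: CatBoost becomes an optional, lazily imported backend (is_catboost_estimator), LightGBM becomes the default estimator with per-task parameter sets and early stopping, categorical columns are ordinal-encoded before fitting, and SHAP-style importances come straight from LightGBM's pred_contrib prediction. Two stray print(...) debug statements remain in _prepare_to_fit and _prepare_to_calculate. A standalone sketch of the pred_contrib technique for the binary case (dataset and parameters are illustrative):

    import lightgbm as lgb
    import numpy as np
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=200, n_features=5, random_state=42)
    model = lgb.LGBMClassifier(n_estimators=20, verbosity=-1).fit(X, y)

    # pred_contrib=True returns (n_samples, n_features + 1); the last column
    # is the base value, so it is dropped before averaging, as in the diff.
    contrib = model.predict(X, pred_contrib=True)
    shap_matrix = contrib[:, :-1]
    importance = {f"f{i}": np.mean(np.abs(shap_matrix[:, i])) for i in range(X.shape[1])}

For multiclass, the flat contribution matrix is reshaped to (n_samples, n_classes, n_features + 1) and averaged over classes first, which is what the n_classes attribute recorded in _prepare_to_fit is for.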
upgini/resource_bundle/strings.properties
CHANGED
@@ -80,6 +80,7 @@ email_and_hem_simultanious=EMAIL and HEM search keys cannot be used simultaneous
 postal_code_without_country=COUNTRY search key required if POSTAL_CODE is present
 multiple_search_key=Search key {} passed multiple times
 unregistered_only_personal_keys=Only personal search keys used. Api_key from profile.upgini.com required for EMAIL/HEM, PHONE NUMBER or IPv4/IPv6 search keys\nSee docs https://github.com/upgini/upgini#-open-up-all-capabilities-of-upgini
+only_custom_keys=Only CUSTOM_KEY search keys were provided. At least one of DATE, COUNTRY, POSTAL_CODE, PHONE, EMAIL, HEM, IP should be provided
 search_key_not_found=Column `{}` from search_keys was not found in X dataframe: {}
 numeric_search_key_not_found=Index {} in search_keys is out of bounds for {} columns of X dataframe
 unsupported_search_key_type=Unsupported type of key in search_keys: {}
upgini/utils/deduplicate_utils.py
CHANGED
@@ -74,6 +74,8 @@ def remove_fintech_duplicates(
     # Checking for different dates by the same personal keys
     uniques = grouped_by_personal_cols[date_col].nunique()
     total = len(uniques)
+    if total == 0:
+        return segment_df, None
     diff_dates = len(uniques[uniques > 1])
     if diff_dates / total >= 0.6:
         return segment_df, None
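Note: the added early return prevents a ZeroDivisionError in the diff_dates / total ratio when the grouped series is empty. The guard in isolation (a sketch; the real function returns the segment dataframe alongside None):

    from typing import Optional

    def diff_dates_ratio(uniques) -> Optional[float]:
        total = len(uniques)
        if total == 0:
            return None  # mirrors the new `return segment_df, None` short-circuit
        return len(uniques[uniques > 1]) / total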
upgini/utils/feature_info.py
CHANGED
@@ -90,7 +90,8 @@ class FeatureInfo:
 def _get_feature_sample(feature_meta: FeaturesMetadataV2, data: Optional[pd.DataFrame]) -> str:
     if data is not None and len(data) > 0 and feature_meta.name in data.columns:
         if len(data) > 3:
-
+            rand = np.random.RandomState(42)
+            feature_sample = rand.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
         else:
             feature_sample = data[feature_meta.name].dropna().unique().tolist()
         if len(feature_sample) > 0 and isinstance(feature_sample[0], float):

(The removed line is truncated in this diff view and is left as-is above.)
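Note: drawing the three example values through a seeded RandomState makes the feature-sample column of the report reproducible across runs. In isolation:

    import numpy as np
    import pandas as pd

    values = pd.Series([10, 20, 30, 40, 50]).dropna().unique()
    rand = np.random.RandomState(42)
    print(rand.choice(values, 3).tolist())  # same three values every run

Since choice samples with replacement by default, the sample can contain duplicates; choice(values, 3, replace=False) would avoid that.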
upgini/utils/sklearn_ext.py
CHANGED
@@ -9,7 +9,6 @@ from traceback import format_exc

 import numpy as np
 import scipy.sparse as sp
-from catboost import CatBoostClassifier, CatBoostRegressor
 from joblib import Parallel, logger
 from scipy.sparse import issparse
 from sklearn import config_context, get_config
@@ -342,6 +341,22 @@ def cross_validate(
         raise e


+def is_catboost_estimator(estimator):
+    try:
+        from catboost import CatBoostClassifier, CatBoostRegressor
+        return isinstance(estimator, (CatBoostClassifier, CatBoostRegressor))
+    except ImportError:
+        return False
+
+
+def is_lightgbm_estimator(estimator):
+    try:
+        from lightgbm import LGBMClassifier, LGBMRegressor
+        return isinstance(estimator, (LGBMClassifier, LGBMRegressor))
+    except ImportError:
+        return False
+
+
 def _fit_and_score(
     estimator,
     X,
@@ -497,7 +512,10 @@ def _fit_and_score(
         if y_train is None:
             estimator.fit(X_train, **fit_params)
         else:
-            if
+            if is_catboost_estimator(estimator):
+                fit_params = fit_params.copy()
+                fit_params["eval_set"] = [(X_test, y_test)]
+            elif is_lightgbm_estimator(estimator):
                 fit_params = fit_params.copy()
                 fit_params["eval_set"] = [(X_test, y_test)]
             estimator.fit(X_train, y_train, **fit_params)

(The removed condition is truncated in this diff view and is left as-is above.)
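Note: dropping the module-level catboost import means sklearn_ext no longer fails to import when catboost is absent; the estimator family is instead detected at call time by the two try/except helpers above. A usage sketch that collapses the two identical branches of _fit_and_score into one (a simplification; the real code keeps them separate):

    def with_eval_set(estimator, fit_params, X_test, y_test):
        # Both backends accept eval_set for validation-based early stopping.
        if is_catboost_estimator(estimator) or is_lightgbm_estimator(estimator):
            fit_params = fit_params.copy()
            fit_params["eval_set"] = [(X_test, y_test)]
        return fit_params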
upgini/utils/sort.py
CHANGED
@@ -87,7 +87,7 @@ def get_sort_columns_dict(
     df_with_target = df_with_target.loc[~target.isna()]
     df = df_with_target.iloc[:, :-1]
     target = df_with_target.iloc[:, -1]
-    df = df.fillna(df.mean())
+    df = df.fillna(df.apply(lambda x: int(x.mean()) if pd.api.types.is_integer_dtype(x) else x.mean()))
     omit_nan = False
     hashes = [hash_series(df[col]) for col in columns_for_sort]
     df = np.asarray(df, dtype=np.float32)
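Note: df.mean() is float-valued, and filling an integer column with a float mean upcasts it, which changes the series hashes computed on the next line; the fix truncates the mean back to int for integer dtypes. In isolation:

    import pandas as pd

    df = pd.DataFrame({"i": pd.array([1, 2, None], dtype="Int64"), "f": [1.5, None, 2.5]})
    fill = df.apply(lambda x: int(x.mean()) if pd.api.types.is_integer_dtype(x) else x.mean())
    print(df.fillna(fill).dtypes.to_dict())  # 'i' keeps its integer dtype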
upgini/utils/target_utils.py
CHANGED
@@ -204,7 +204,7 @@ def balance_undersample(
 def balance_undersample_forced(
     df: pd.DataFrame,
     target_column: str,
-    id_columns: List[str],
+    id_columns: Optional[List[str]],
     date_column: str,
     task_type: ModelTaskType,
     cv_type: Optional[CVType],
@@ -287,7 +287,7 @@ DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)

 def balance_undersample_time_series_trunc(
     df: pd.DataFrame,
-    id_columns: List[str],
+    id_columns: Optional[List[str]],
     date_column: str,
     sample_size: int,
     random_state: int = 42,
@@ -298,6 +298,8 @@ def balance_undersample_time_series_trunc(
     **kwargs,
 ):
     # Convert date column to datetime
+    if id_columns is None:
+        id_columns = [date_column]
     dates_df = df[id_columns + [date_column]].copy()
     dates_df[date_column] = pd.to_datetime(dates_df[date_column], unit="ms")
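Note: both balancing helpers now accept id_columns=None, and balance_undersample_time_series_trunc falls back to grouping by the date column alone. The fallback pattern in isolation:

    from typing import List, Optional

    def resolve_id_columns(id_columns: Optional[List[str]], date_column: str) -> List[str]:
        return id_columns if id_columns is not None else [date_column]

With the fallback applied, the subsequent df[id_columns + [date_column]] selects the date column twice; deduplicating the combined list would avoid the duplicate column.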
{upgini-1.2.71a3810.dev4.dist-info → upgini-1.2.71a3810.dev6.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.71a3810.dev4
+Version: 1.2.71a3810.dev6
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
@@ -22,14 +22,14 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Requires-Python: <3.12,>=3.10
-Requires-Dist: catboost>=1.0.3
 Requires-Dist: fastparquet>=0.8.1
 Requires-Dist: ipywidgets>=8.1.0
 Requires-Dist: jarowinkler>=2.0.0
 Requires-Dist: levenshtein>=0.25.1
-Requires-Dist:
+Requires-Dist: lightgbm>=4.6.0
+Requires-Dist: numpy<3.0.0,>=1.19.0
 Requires-Dist: pandas<3.0.0,>=1.1.0
-Requires-Dist: psutil>=
+Requires-Dist: psutil>=5.9.0
 Requires-Dist: pydantic<3.0.0,>1.0.0
 Requires-Dist: pyjwt>=2.8.0
 Requires-Dist: python-bidi==0.4.2
@@ -38,6 +38,7 @@ Requires-Dist: python-json-logger>=3.3.0
 Requires-Dist: requests>=2.8.0
 Requires-Dist: scikit-learn>=1.3.0
 Requires-Dist: scipy>=1.10.0
+Requires-Dist: shap>=0.44.0
 Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
 Description-Content-Type: text/markdown

(Two removed Requires-Dist lines are truncated in this diff view and are left as-is above.)
{upgini-1.2.71a3810.dev4.dist-info → upgini-1.2.71a3810.dev6.dist-info}/RECORD
CHANGED
@@ -1,13 +1,12 @@
-upgini/__about__.py,sha256=
+upgini/__about__.py,sha256=pP2-JWkoPVosnW6bKUy6ajRXus3pPBdc2hG-HO-Ztao,33
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
-upgini/dataset.py,sha256=
+upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=
-upgini/http.py,sha256=
-upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
+upgini/features_enricher.py,sha256=Li1sPihWVkPUPcma8HRbPFwpCqd9V9d2p5zQUgkpdpU,206998
+upgini/http.py,sha256=RvzcShpDXssLs6ycGN8xilkKi8ZV9XGUrrk8bwdUzbw,43607
 upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
-upgini/metrics.py,sha256=
+upgini/metrics.py,sha256=B4sFcz-OWkVMQ7d_Y8vZwDo-xXkF6H2oAaCIgImSC0k,39410
 upgini/search_task.py,sha256=EuCGp0iCWz2fpuJgN6M47aP_CtIi3Oq9zw78w0mkKiU,17595
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -17,15 +16,15 @@ upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/autofe/all_operators.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
 upgini/autofe/binary.py,sha256=MnQuFiERpocjCPQUjOljlsq5FE-04GPfwtNjzvfNMyU,7671
 upgini/autofe/date.py,sha256=MM1S-6imNSzCDOhbNnmsc_bwSqUWBcS8vWAdHF8j1kY,11134
-upgini/autofe/feature.py,sha256=
+upgini/autofe/feature.py,sha256=md43NwDof0s_nWn_WfOO0l2wYItQ416nEzHm5u29XOA,14945
 upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
 upgini/autofe/operator.py,sha256=EOffJw6vKXpEh5yymqb1RFNJPxGxmnHdFRo9dB5SCFo,4969
 upgini/autofe/unary.py,sha256=Sx11IoHRh5nwyALzjgG9GQOrVNIs8NZ1JzunAJuN66A,5731
 upgini/autofe/utils.py,sha256=fK1am2_tQj3fL2vDslblye8lmyfWgGIUOX1beYVBz4k,2420
 upgini/autofe/vector.py,sha256=zehv1J9ChHdZKWjKlkRf6RpfQMCJduZmqCEePYNUfkQ,943
 upgini/autofe/timeseries/__init__.py,sha256=PGwwDAMwvkXl3el12tXVEmZUgDUvlmIPlXtROm6bD18,738
-upgini/autofe/timeseries/base.py,sha256=
-upgini/autofe/timeseries/cross.py,sha256=
+upgini/autofe/timeseries/base.py,sha256=rWJqRuFAzTZEsUdWG5s1Vhif9zzRRmalASXvarufRxI,3610
+upgini/autofe/timeseries/cross.py,sha256=BTINVwuZSbm_4NKkVm0FGM68SrvZLENZKXN7-UyvhYI,5319
 upgini/autofe/timeseries/delta.py,sha256=h0YhmI1TlPJnjwFpN_GQxLb6r59DQuucnG5tQAXSgjU,3520
 upgini/autofe/timeseries/lag.py,sha256=LfQtg484vuqM0mgY4Wft1swHX_Srq7OKKgZswCXoiXI,1882
 upgini/autofe/timeseries/roll.py,sha256=zADKXU-eYWQnQ5R3am1yEal8uU6Tm0jLAixwPb_aCHg,2794
@@ -39,7 +38,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
 upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
 upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
 upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
-upgini/resource_bundle/strings.properties,sha256=
+upgini/resource_bundle/strings.properties,sha256=mwQrerdJj3adzT-fHqvs6Qjf-rqDccsUzELDIXJKAmY,27791
 upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
 upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -53,11 +52,11 @@ upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk
 upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
 upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
 upgini/utils/datetime_utils.py,sha256=_jq-kn_dGNFfs-DGXcWCGzy9bkplfAjrZ8SsmN28zXc,13535
-upgini/utils/deduplicate_utils.py,sha256=
+upgini/utils/deduplicate_utils.py,sha256=AcMLoObMjhOTQ_fMS1LWy0GKp6WXnZ-FNux_8V3nbZU,8914
 upgini/utils/display_utils.py,sha256=DsBjJ8jEYAh8BPgfAbzq5imoGFV6IACP20PQ78BQCX0,11964
 upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
 upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
-upgini/utils/feature_info.py,sha256=
+upgini/utils/feature_info.py,sha256=Q9HN6A-fvfVD-irFWrmOqqZG9RsUSvh5MTY_k0xu-tE,7287
 upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
 upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
 upgini/utils/ip_utils.py,sha256=TSQ_qDsLlVnm09X1HacpabEf_HNqSWpxBF4Sdc2xs08,6580
@@ -65,13 +64,13 @@ upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
 upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
 upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
 upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
-upgini/utils/sklearn_ext.py,sha256=
-upgini/utils/sort.py,sha256=
-upgini/utils/target_utils.py,sha256=
+upgini/utils/sklearn_ext.py,sha256=HpaNQaKJisgNE7IZ71n7uswxTj7kbPglU2G3s1sORAc,45042
+upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
+upgini/utils/target_utils.py,sha256=KNFzJta1SpGU4sp07dHKSeVJlDs_9qgD2wcw5YuJfOc,16661
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.71a3810.dev4.dist-info/METADATA,sha256=
-upgini-1.2.71a3810.dev4.dist-info/WHEEL,sha256=
-upgini-1.2.71a3810.dev4.dist-info/licenses/LICENSE,sha256=
-upgini-1.2.71a3810.dev4.dist-info/RECORD,,
+upgini-1.2.71a3810.dev6.dist-info/METADATA,sha256=hGSStg6uah4fD-YtMrBOLCF6EPf9Uq59DfbYsspPQkI,49101
+upgini-1.2.71a3810.dev6.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
+upgini-1.2.71a3810.dev6.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.71a3810.dev6.dist-info/RECORD,,

(The sha256 digests of several removed entries are truncated in this diff view and are left as-is above.)
upgini/lazy_import.py
DELETED
@@ -1,35 +0,0 @@
-import importlib
-import importlib.util
-import importlib.machinery
-
-
-class LazyImport:
-    def __init__(self, module_name, class_name):
-        self.module_name = module_name
-        self.class_name = class_name
-        self._module = None
-        self._class = None
-
-    def _load(self):
-        if self._module is None:
-            # Load module and save link to it
-            spec = importlib.util.find_spec(self.module_name)
-            if spec is None:
-                raise ImportError(f"Module {self.module_name} not found")
-
-            # Create module
-            self._module = importlib.util.module_from_spec(spec)
-
-            # Execute module
-            spec.loader.exec_module(self._module)
-
-            # Get class from module
-            self._class = getattr(self._module, self.class_name)
-
-    def __call__(self, *args, **kwargs):
-        self._load()
-        return self._class(*args, **kwargs)
-
-    def __getattr__(self, name):
-        self._load()
-        return getattr(self._class, name)
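Note: LazyImport's spec/loader machinery is gone; the release covers the same need with plain function-local imports that fail soft (see is_catboost_estimator in metrics.py and sklearn_ext.py above). The replacement pattern in isolation:

    def make_catboost_classifier(**params):
        # Resolved only when the function is called, so importing this module
        # never requires catboost to be installed.
        try:
            from catboost import CatBoostClassifier
        except ImportError as e:
            raise ImportError("catboost is required for this estimator") from e
        return CatBoostClassifier(**params)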
{upgini-1.2.71a3810.dev4.dist-info → upgini-1.2.71a3810.dev6.dist-info}/WHEEL
File without changes

{upgini-1.2.71a3810.dev4.dist-info → upgini-1.2.71a3810.dev6.dist-info}/licenses/LICENSE
File without changes