PyPI - upgini - Versions diffs - 1.1.262a3250.post4__py3-none-any.whl → 1.1.280a3418.post2__py3-none-any.whl - Mend

upgini 1.1.262a3250.post4__py3-none-any.whl → 1.1.280a3418.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (49) hide show

upgini/__about__.py +1 -0
upgini/ads.py +6 -2
upgini/ads_management/ads_manager.py +4 -2
upgini/autofe/all_operands.py +16 -4
upgini/autofe/binary.py +2 -1
upgini/autofe/date.py +74 -7
upgini/autofe/feature.py +1 -1
upgini/autofe/groupby.py +3 -1
upgini/autofe/operand.py +4 -3
upgini/autofe/unary.py +20 -1
upgini/autofe/vector.py +2 -0
upgini/data_source/data_source_publisher.py +14 -4
upgini/dataset.py +8 -7
upgini/errors.py +1 -1
upgini/features_enricher.py +156 -63
upgini/http.py +11 -10
upgini/mdc/__init__.py +1 -3
upgini/mdc/context.py +4 -6
upgini/metadata.py +3 -0
upgini/metrics.py +160 -96
upgini/normalizer/phone_normalizer.py +2 -2
upgini/resource_bundle/__init__.py +5 -5
upgini/resource_bundle/strings.properties +9 -4
upgini/sampler/base.py +1 -4
upgini/sampler/random_under_sampler.py +2 -5
upgini/search_task.py +4 -4
upgini/spinner.py +1 -1
upgini/utils/__init__.py +3 -2
upgini/utils/base_search_key_detector.py +2 -2
upgini/utils/blocked_time_series.py +4 -2
upgini/utils/country_utils.py +2 -2
upgini/utils/custom_loss_utils.py +3 -2
upgini/utils/cv_utils.py +2 -2
upgini/utils/datetime_utils.py +75 -18
upgini/utils/deduplicate_utils.py +61 -18
upgini/utils/email_utils.py +3 -3
upgini/utils/fallback_progress_bar.py +1 -1
upgini/utils/features_validator.py +2 -1
upgini/utils/progress_bar.py +1 -1
upgini/utils/sklearn_ext.py +15 -15
upgini/utils/target_utils.py +21 -7
upgini/utils/track_info.py +27 -15
upgini/version_validator.py +2 -2
{upgini-1.1.262a3250.post4.dist-info → upgini-1.1.280a3418.post2.dist-info}/METADATA +21 -23
upgini-1.1.280a3418.post2.dist-info/RECORD +62 -0
{upgini-1.1.262a3250.post4.dist-info → upgini-1.1.280a3418.post2.dist-info}/WHEEL +1 -2
upgini-1.1.262a3250.post4.dist-info/RECORD +0 -62
upgini-1.1.262a3250.post4.dist-info/top_level.txt +0 -1
{upgini-1.1.262a3250.post4.dist-info → upgini-1.1.280a3418.post2.dist-info/licenses}/LICENSE +0 -0

upgini/sampler/random_under_sampler.py CHANGED Viewed

@@ -5,13 +5,10 @@
 # License: MIT
 import numpy as np
-from sklearn.utils import check_random_state
-from sklearn.utils import _safe_indexing
+from sklearn.utils import _safe_indexing, check_random_state
 from .base import BaseUnderSampler
-from .utils import check_target_type
-from .utils import _deprecate_positional_args
+from .utils import _deprecate_positional_args, check_target_type
 class RandomUnderSampler(BaseUnderSampler):

upgini/search_task.py CHANGED Viewed

@@ -8,10 +8,10 @@ import pandas as pd
 from upgini import dataset
 from upgini.http import (
-    _RestClient,
     ProviderTaskSummary,
     SearchProgress,
     SearchTaskSummary,
+    _RestClient,
     get_rest_client,
     is_demo_api_key,
 )
@@ -295,7 +295,7 @@ class SearchTask:
         return self.rest_client.get_search_file_metadata(self.search_task_id, trace_id)
-@lru_cache()
+@lru_cache
 def _get_all_initial_raw_features_cached(
     endpoint: Optional[str],
     api_key: Optional[str],
@@ -328,7 +328,7 @@ def _get_all_initial_raw_features_cached(
     return result_df
-@lru_cache()
+@lru_cache
 def _get_all_validation_raw_features_cached(
     endpoint: Optional[str],
     api_key: Optional[str],
@@ -357,7 +357,7 @@ def _get_all_validation_raw_features_cached(
     return result_df
-@lru_cache()
+@lru_cache
 def _get_target_outliers_cached(
     endpoint: Optional[str],
     api_key: Optional[str],

upgini/spinner.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import threading
-from typing import Optional, List
 import time
+from typing import List, Optional
 class Spinner:

upgini/utils/__init__.py CHANGED Viewed

@@ -2,7 +2,7 @@ import itertools
 from typing import List, Tuple
 import pandas as pd
-from pandas.api.types import is_string_dtype
+from pandas.api.types import is_object_dtype, is_string_dtype
 def combine_search_keys(search_keys: List[str]) -> List[Tuple[str]]:
@@ -20,5 +20,6 @@ def find_numbers_with_decimal_comma(df: pd.DataFrame) -> pd.DataFrame:
     return [
         col
         for col in tmp.columns
-        if is_string_dtype(tmp[col]) and tmp[col].astype("string").str.match("^[0-9]+,[0-9]*$").any()
+        if (is_string_dtype(tmp[col]) or is_object_dtype(tmp[col]))
+        and tmp[col].astype("string").str.match("^[0-9]+,[0-9]*$").any()
     ]

upgini/utils/base_search_key_detector.py CHANGED Viewed

@@ -5,10 +5,10 @@ import pandas as pd
 class BaseSearchKeyDetector:
     def _is_search_key_by_name(self, column_name: str) -> bool:
-        raise NotImplementedError()
+        raise NotImplementedError
     def _is_search_key_by_values(self, column: pd.Series) -> bool:
-        raise NotImplementedError()
+        raise NotImplementedError
     def _get_search_key_by_name(self, column_names: List[str]) -> Optional[str]:
         for column_name in column_names:

upgini/utils/blocked_time_series.py CHANGED Viewed

@@ -1,8 +1,10 @@
-import numpy as np
 import numbers
+import numpy as np
+from sklearn.model_selection import BaseCrossValidator
 from sklearn.utils import indexable
 from sklearn.utils.validation import _num_samples
-from sklearn.model_selection import BaseCrossValidator
 from upgini.resource_bundle import bundle

upgini/utils/country_utils.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import pandas as pd
-from pandas.api.types import is_string_dtype
+from pandas.api.types import is_object_dtype, is_string_dtype
 from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
@@ -9,7 +9,7 @@ class CountrySearchKeyDetector(BaseSearchKeyDetector):
         return "country" in str(column_name).lower()
     def _is_search_key_by_values(self, column: pd.Series) -> bool:
-        if not is_string_dtype(column):
+        if not is_string_dtype(column) and not is_object_dtype(column):
             return False
         all_count = len(column)

upgini/utils/custom_loss_utils.py CHANGED Viewed

@@ -1,6 +1,7 @@
-from upgini.metadata import ModelTaskType, RuntimeParameters
-from typing import Optional, Dict, Any
 import logging
+from typing import Any, Dict, Optional
+from upgini.metadata import ModelTaskType, RuntimeParameters
 from upgini.resource_bundle import bundle

upgini/utils/cv_utils.py CHANGED Viewed

@@ -1,9 +1,9 @@
 from functools import reduce
 from typing import Any, Dict, List, Optional, Tuple, Union
-import numpy as np
+import numpy as np
 import pandas as pd
-from sklearn.model_selection import BaseCrossValidator, KFold, TimeSeriesSplit, GroupKFold, GroupShuffleSplit
+from sklearn.model_selection import BaseCrossValidator, GroupKFold, GroupShuffleSplit, KFold, TimeSeriesSplit
 from upgini.metadata import CVType
 from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit

upgini/utils/datetime_utils.py CHANGED Viewed

@@ -1,15 +1,20 @@
 import datetime
 import logging
 import re
-from typing import List, Optional
+from typing import Dict, List, Optional
 import numpy as np
 import pandas as pd
 from dateutil.relativedelta import relativedelta
-from pandas.api.types import is_numeric_dtype, is_period_dtype, is_string_dtype
+from pandas.api.types import (
+    is_numeric_dtype,
+    is_period_dtype,
+)
 from upgini.errors import ValidationError
+from upgini.metadata import SearchKey
 from upgini.resource_bundle import ResourceBundle, get_custom_bundle
+from upgini.utils.warning_counter import WarningCounter
 DATE_FORMATS = [
     "%Y-%m-%d",
@@ -76,9 +81,6 @@ class DateTimeSearchKeyConverter:
             df[self.date_column] = df[self.date_column].apply(lambda x: x.replace(tzinfo=None))
         elif isinstance(df[self.date_column].values[0], datetime.date):
             df[self.date_column] = pd.to_datetime(df[self.date_column], errors="coerce")
-        elif is_string_dtype(df[self.date_column]):
-            df[self.date_column] = df[self.date_column].apply(self.clean_date)
-            df[self.date_column] = self.parse_date(df)
         elif is_period_dtype(df[self.date_column]):
             df[self.date_column] = pd.to_datetime(df[self.date_column].astype("string"))
         elif is_numeric_dtype(df[self.date_column]):
@@ -98,6 +100,9 @@ class DateTimeSearchKeyConverter:
                 msg = self.bundle.get("unsupported_date_type").format(self.date_column)
                 self.logger.warning(msg)
                 raise ValidationError(msg)
+        else:
+            df[self.date_column] = df[self.date_column].astype("string").apply(self.clean_date)
+            df[self.date_column] = self.parse_date(df)
         # If column with date is datetime then extract seconds of the day and minute of the hour
         # as additional features
@@ -121,9 +126,9 @@ class DateTimeSearchKeyConverter:
         df.drop(columns=seconds, inplace=True)
         if keep_time:
-            df[self.DATETIME_COL] = df[self.date_column].view(np.int64) // 1_000_000
+            df[self.DATETIME_COL] = df[self.date_column].astype(np.int64) // 1_000_000
             df[self.DATETIME_COL] = df[self.DATETIME_COL].apply(self._int_to_opt).astype("Int64")
-        df[self.date_column] = df[self.date_column].dt.floor("D").view(np.int64) // 1_000_000
+        df[self.date_column] = df[self.date_column].dt.floor("D").astype(np.int64) // 1_000_000
         df[self.date_column] = df[self.date_column].apply(self._int_to_opt).astype("Int64")
         self.logger.info(f"Date after convertion to timestamp: {df[self.date_column]}")
@@ -203,18 +208,17 @@ def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[st
     if nunique_dates / days_delta < 0.3:
         return False
-    def check_differences(group):
-        data = group.drop(date_col, axis=1)
-        diffs = data.values[:, None] != data.values
-        diff_counts = diffs.sum(axis=2)
-        max_diff = np.max(diff_counts)
-        return max_diff <= 2
+    accumulated_changing_columns = set()
-    def is_multiple_rows(group):
+    def check_differences(group: pd.DataFrame):
+        changing_columns = group.columns[group.nunique(dropna=False) > 1].to_list()
+        accumulated_changing_columns.update(changing_columns)
+    def is_multiple_rows(group: pd.DataFrame) -> bool:
         return group.shape[0] > 1
-    grouped = df.groupby(date_col)
-    dates_with_multiple_rows = len(grouped.apply(is_multiple_rows))
+    grouped = df.groupby(date_col)[[c for c in df.columns if c != date_col]]
+    dates_with_multiple_rows = grouped.apply(is_multiple_rows).sum()
     # share of dates with more than one record is more than 99%
     if dates_with_multiple_rows / nunique_dates < 0.99:
@@ -223,5 +227,58 @@ def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[st
     if df.shape[1] <= 3:
         return True
-    is_diff_less_than_two_columns = grouped.apply(check_differences)
-    return is_diff_less_than_two_columns.all()
+    grouped.apply(check_differences)
+    return len(accumulated_changing_columns) <= 2
+def validate_dates_distribution(
+    X: pd.DataFrame,
+    search_keys: Dict[str, SearchKey],
+    logger: Optional[logging.Logger] = None,
+    bundle: Optional[ResourceBundle] = None,
+    warning_counter: Optional[WarningCounter] = None,
+):
+    maybe_date_col = None
+    for key, key_type in search_keys.items():
+        if key_type in [SearchKey.DATE, SearchKey.DATETIME]:
+            maybe_date_col = key
+    if maybe_date_col is None:
+        for col in X.columns:
+            if col in search_keys:
+                continue
+            try:
+                if pd.__version__ >= "2.0.0":
+                    # Format mixed to avoid massive warnings
+                    pd.to_datetime(X[col], format="mixed")
+                else:
+                    pd.to_datetime(X[col])
+                maybe_date_col = col
+                break
+            except Exception:
+                pass
+    if maybe_date_col is None:
+        return
+    if pd.__version__ >= "2.0.0":
+        dates = pd.to_datetime(X[maybe_date_col], format="mixed").dt.date
+    else:
+        dates = pd.to_datetime(X[maybe_date_col]).dt.date
+    date_counts = dates.value_counts().sort_index()
+    date_counts_1 = date_counts[: round(len(date_counts) / 2)]
+    date_counts_2 = date_counts[round(len(date_counts) / 2) :]
+    ratio = date_counts_2.mean() / date_counts_1.mean()
+    if ratio > 1.2 or ratio < 0.8:
+        if warning_counter is not None:
+            warning_counter.increment()
+        if logger is None:
+            logger = logging.getLogger("muted_logger")
+            logger.setLevel("FATAL")
+        bundle = bundle or get_custom_bundle()
+        msg = bundle.get("x_unstable_by_date")
+        print(msg)
+        logger.warning(msg)

upgini/utils/deduplicate_utils.py CHANGED Viewed

@@ -3,7 +3,7 @@ from typing import Dict, List, Optional, Union
 import pandas as pd
-from upgini.metadata import SORT_ID, SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
+from upgini.metadata import EVAL_SET_INDEX, SORT_ID, SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
 from upgini.resource_bundle import ResourceBundle
 from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
 from upgini.utils.target_utils import define_task
@@ -78,20 +78,58 @@ def remove_fintech_duplicates(
     rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
     if len(rows_with_diff_target) > 0:
         unique_keys_to_delete = rows_with_diff_target[personal_cols].drop_duplicates()
-        rows_to_remove = pd.merge(df.reset_index(), unique_keys_to_delete, on=personal_cols)
-        rows_to_remove = rows_to_remove.set_index(df.index.name or "index")
-        perc = len(rows_to_remove) * 100 / len(df)
-        msg = bundle.get("dataset_diff_target_duplicates_fintech").format(
-            perc, len(rows_to_remove), rows_to_remove.index.to_list()
-        )
-        if not silent:
-            print(msg)
-        if logger:
-            logger.warning(msg)
-        logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
-        df = df[~df.index.isin(rows_to_remove.index)]
-        logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
+        if EVAL_SET_INDEX not in df.columns:
+            rows_to_remove = pd.merge(df.reset_index(), unique_keys_to_delete, on=personal_cols)
+            rows_to_remove = rows_to_remove.set_index(df.index.name or "index")
+            perc = len(rows_to_remove) * 100 / len(df)
+            msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
+                perc, len(rows_to_remove), rows_to_remove.index.to_list()
+            )
+            if not silent:
+                print(msg)
+            if logger:
+                logger.warning(msg)
+            logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
+            df = df[~df.index.isin(rows_to_remove.index)]
+            logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
+        else:
+            # Indices in train and eval_set can be the same so we remove rows from them separately
+            train = df.query(f"{EVAL_SET_INDEX} == 0")
+            train_rows_to_remove = pd.merge(train.reset_index(), unique_keys_to_delete, on=personal_cols)
+            train_rows_to_remove = train_rows_to_remove.set_index(train.index.name or "index")
+            train_perc = len(train_rows_to_remove) * 100 / len(train)
+            msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
+                train_perc, len(train_rows_to_remove), train_rows_to_remove.index.to_list()
+            )
+            if not silent:
+                print(msg)
+            if logger:
+                logger.warning(msg)
+            logger.info(f"Train dataset shape before clean fintech duplicates: {train.shape}")
+            train = train[~train.index.isin(train_rows_to_remove.index)]
+            logger.info(f"Train dataset shape after clean fintech duplicates: {train.shape}")
+            evals = [df.query(f"{EVAL_SET_INDEX} == {i}") for i in df[EVAL_SET_INDEX].unique() if i != 0]
+            new_evals = []
+            for i, eval in enumerate(evals):
+                eval_rows_to_remove = pd.merge(eval.reset_index(), unique_keys_to_delete, on=personal_cols)
+                eval_rows_to_remove = eval_rows_to_remove.set_index(eval.index.name or "index")
+                eval_perc = len(eval_rows_to_remove) * 100 / len(eval)
+                msg = bundle.get("dataset_eval_diff_target_duplicates_fintech").format(
+                    eval_perc, len(eval_rows_to_remove), i + 1, eval_rows_to_remove.index.to_list()
+                )
+                if not silent:
+                    print(msg)
+                if logger:
+                    logger.warning(msg)
+                logger.info(f"Eval {i + 1} dataset shape before clean fintech duplicates: {eval.shape}")
+                eval = eval[~eval.index.isin(eval_rows_to_remove.index)]
+                logger.info(f"Eval {i + 1} dataset shape after clean fintech duplicates: {eval.shape}")
+                new_evals.append(eval)
+            logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
+            df = pd.concat([train] + new_evals)
+            logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
     return df
@@ -101,14 +139,18 @@ def clean_full_duplicates(
     nrows = len(df)
     if nrows == 0:
         return df
-    # Remove absolute duplicates (exclude system_record_id)
+    # Remove full duplicates (exclude system_record_id, sort_id and eval_set_index)
     unique_columns = df.columns.tolist()
     if SYSTEM_RECORD_ID in unique_columns:
         unique_columns.remove(SYSTEM_RECORD_ID)
     if SORT_ID in unique_columns:
         unique_columns.remove(SORT_ID)
+    if EVAL_SET_INDEX in unique_columns:
+        unique_columns.remove(EVAL_SET_INDEX)
     logger.info(f"Dataset shape before clean duplicates: {df.shape}")
-    df = df.drop_duplicates(subset=unique_columns)
+    # Train segment goes first so if duplicates are found in train and eval set
+    # then we keep unique rows in train segment
+    df = df.drop_duplicates(subset=unique_columns, keep="first")
     logger.info(f"Dataset shape after clean duplicates: {df.shape}")
     nrows_after_full_dedup = len(df)
     share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
@@ -123,7 +165,7 @@ def clean_full_duplicates(
         marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
         if marked_duplicates.sum() > 0:
             dups_indices = df[marked_duplicates].index.to_list()
-            nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns))
+            nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
             num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
             share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
@@ -133,6 +175,7 @@ def clean_full_duplicates(
                 print(msg)
             df = df.drop_duplicates(subset=unique_columns, keep=False)
             logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")
     return df

upgini/utils/email_utils.py CHANGED Viewed

@@ -4,10 +4,10 @@ from hashlib import sha256
 from typing import Dict, List, Optional
 import pandas as pd
-from pandas.api.types import is_string_dtype
-from upgini.resource_bundle import bundle
+from pandas.api.types import is_object_dtype, is_string_dtype
 from upgini.metadata import SearchKey
+from upgini.resource_bundle import bundle
 from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
 EMAIL_REGEX = re.compile(r"^[a-zA-Z0-9.!#$%&’*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*$")
@@ -18,7 +18,7 @@ class EmailSearchKeyDetector(BaseSearchKeyDetector):
         return str(column_name).lower() in ["email", "e_mail", "e-mail"]
     def _is_search_key_by_values(self, column: pd.Series) -> bool:
-        if not is_string_dtype(column):
+        if not is_string_dtype(column) and not is_object_dtype:
             return False
         if not column.astype("string").str.contains("@").any():
             return False

upgini/utils/fallback_progress_bar.py CHANGED Viewed

@@ -22,7 +22,7 @@ class CustomFallbackProgressBar:
         fraction = self.progress / self.total
         filled = "=" * int(fraction * self.text_width)
         rest = " " * (self.text_width - len(filled))
-        return "[{}{}] {}% {} {}".format(filled, rest, self.progress, self._stage, self._eta)
+        return f"[{filled}{rest}] {self.progress}% {self._stage} {self._eta}"
     def display(self):
         print(self)

upgini/utils/features_validator.py CHANGED Viewed

@@ -81,7 +81,8 @@ class FeaturesValidator:
         return [
             i
             for i in df
-            if (is_string_dtype(df[i]) or is_integer_dtype(df[i])) and (df[i].nunique(dropna=False) / row_count >= 0.95)
+            if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or is_integer_dtype(df[i]))
+            and (df[i].nunique(dropna=False) / row_count >= 0.85)
         ]
     @staticmethod

upgini/utils/progress_bar.py CHANGED Viewed

@@ -28,7 +28,7 @@ class CustomProgressBar(DisplayObject):
         fraction = self.progress / self.total
         filled = "=" * int(fraction * self.text_width)
         rest = " " * (self.text_width - len(filled))
-        return "[{}{}] {}% {}".format(filled, rest, self.progress, self._stage)
+        return f"[{filled}{rest}] {self.progress}% {self._stage}"
     def _repr_html_(self):
         return "<progress style='width:{}' max='{}' value='{}'></progress>  {}% {}</br>{}".format(

upgini/utils/sklearn_ext.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import functools
-import logging
 import numbers
 import time
 import warnings
@@ -21,6 +20,7 @@ from sklearn.metrics._scorer import _MultimetricScorer
 from sklearn.model_selection import check_cv
 from sklearn.utils.fixes import np_version, parse_version
 from sklearn.utils.validation import indexable
 # from sklearn.model_selection import cross_validate as original_cross_validate
 _DEFAULT_TAGS = {
@@ -47,7 +47,7 @@ _DEFAULT_TAGS = {
 def cross_validate(
     estimator,
-    X,
+    x,
     y=None,
     *,
     groups=None,
@@ -70,7 +70,7 @@ def cross_validate(
     estimator : estimator object implementing 'fit'
         The object to use to fit the data.
-    X : array-like of shape (n_samples, n_features)
+    x : array-like of shape (n_samples, n_features)
         The data to fit. Can be for example a list, or an array.
     y : array-like of shape (n_samples,) or (n_samples, n_outputs), \
@@ -251,7 +251,7 @@ def cross_validate(
     """
     try:
-        X, y, groups = indexable(X, y, groups)
+        x, y, groups = indexable(x, y, groups)
         cv = check_cv(cv, y, classifier=is_classifier(estimator))
@@ -268,7 +268,7 @@ def cross_validate(
         results = parallel(
             delayed(_fit_and_score)(
                 clone(estimator),
-                X,
+                x,
                 y,
                 scorers,
                 train,
@@ -281,7 +281,7 @@ def cross_validate(
                 return_estimator=return_estimator,
                 error_score=error_score,
             )
-            for train, test in cv.split(X, y, groups)
+            for train, test in cv.split(x, y, groups)
         )
         _warn_about_fit_failures(results, error_score)
@@ -313,7 +313,7 @@ def cross_validate(
         return ret
     except Exception:
-        logging.exception("Failed to execute overriden cross_validate. Fallback to original")
+        # logging.exception("Failed to execute overriden cross_validate. Fallback to original")
         raise
         # fit_params["use_best_model"] = False
         # return original_cross_validate(
@@ -488,7 +488,7 @@ def _fit_and_score(
         if y_train is None:
             estimator.fit(X_train, **fit_params)
         else:
-            if isinstance(estimator, CatBoostClassifier) or isinstance(estimator, CatBoostRegressor):
+            if isinstance(estimator, (CatBoostClassifier, CatBoostRegressor)):
                 fit_params = fit_params.copy()
                 fit_params["eval_set"] = [(X_test, y_test)]
             estimator.fit(X_train, y_train, **fit_params)
@@ -583,9 +583,11 @@ def _aggregate_score_dicts(scores):
     """
     return {
-        key: np.asarray([score[key] for score in scores])
-        if isinstance(scores[0][key], numbers.Number)
-        else [score[key] for score in scores]
+        key: (
+            np.asarray([score[key] for score in scores])
+            if isinstance(scores[0][key], numbers.Number)
+            else [score[key] for score in scores]
+        )
         for key in scores[0]
     }
@@ -970,9 +972,7 @@ def _safe_indexing(X, indices, *, axis=0):
         return X
     if axis not in (0, 1):
-        raise ValueError(
-            "'axis' should be either 0 (to index rows) or 1 (to index " " column). Got {} instead.".format(axis)
-        )
+        raise ValueError("'axis' should be either 0 (to index rows) or 1 (to index " f" column). Got {axis} instead.")
     indices_dtype = _determine_key_type(indices)
@@ -983,7 +983,7 @@ def _safe_indexing(X, indices, *, axis=0):
         raise ValueError(
             "'X' should be a 2D NumPy array, 2D sparse matrix or pandas "
             "dataframe when indexing the columns (i.e. 'axis=1'). "
-            "Got {} instead with {} dimension(s).".format(type(X), X.ndim)
+            f"Got {type(X)} instead with {X.ndim} dimension(s)."
         )
     if axis == 1 and indices_dtype == "str" and not hasattr(X, "loc"):

upgini/utils/target_utils.py CHANGED Viewed

@@ -107,7 +107,7 @@ def balance_undersample(
     min_class_count = vc[min_class_value]
     min_class_percent = imbalance_threshold / target_classes_count
-    min_class_threshold = min_class_percent * count
+    min_class_threshold = int(min_class_percent * count)
     resampled_data = df
     df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
@@ -132,9 +132,7 @@ def balance_undersample(
                 class_value = classes[class_idx]
                 class_count = vc[class_value]
                 sample_strategy[class_value] = min(class_count, quantile25_class_cnt * multiclass_bootstrap_loops)
-            sampler = RandomUnderSampler(
-                sampling_strategy=sample_strategy, random_state=random_state
-            )
+            sampler = RandomUnderSampler(sampling_strategy=sample_strategy, random_state=random_state)
             X = df[SYSTEM_RECORD_ID]
             X = X.to_frame(SYSTEM_RECORD_ID)
             new_x, _ = sampler.fit_resample(X, target)  # type: ignore
@@ -153,9 +151,7 @@ def balance_undersample(
         minority_class = df[df[target_column] == min_class_value]
         majority_class = df[df[target_column] != min_class_value]
         sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
-        sampled_majority_class = majority_class.sample(
-            n=sample_size, random_state=random_state
-        )
+        sampled_majority_class = majority_class.sample(n=sample_size, random_state=random_state)
         resampled_data = df[
             (df[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
             | (df[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
@@ -181,3 +177,21 @@ def balance_undersample(
     logger.info(f"Shape after rebalance resampling: {resampled_data}")
     return resampled_data
+def calculate_psi(expected: pd.Series, actual: pd.Series) -> float:
+    df = pd.concat([expected, actual])
+    # Define the bins for the target variable
+    df_min = df.min()
+    df_max = df.max()
+    bins = [df_min, (df_min + df_max) / 2, df_max]
+    # Calculate the base distribution
+    train_distribution = expected.value_counts(bins=bins, normalize=True).sort_index().values
+    # Calculate the target distribution
+    test_distribution = actual.value_counts(bins=bins, normalize=True).sort_index().values
+    # Calculate the PSI
+    return np.sum((train_distribution - test_distribution) * np.log(train_distribution / test_distribution))

upgini 1.1.262a3250.post4__py3-none-any.whl → 1.1.280a3418.post2__py3-none-any.whl

Potentially problematic release.

upgini 1.1.262a3250.post4__py3-none-any.whl → 1.1.280a3418.post2__py3-none-any.whl