PyPI - upgini - Versions diffs - 1.2.14a3616.dev3__py3-none-any.whl → 1.2.15__py3-none-any.whl - Mend

upgini 1.2.14a3616.dev3__py3-none-any.whl → 1.2.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (10) hide show

upgini/__about__.py +1 -1
upgini/dataset.py +6 -3
upgini/features_enricher.py +21 -26
upgini/normalizer/normalize_utils.py +22 -15
upgini/resource_bundle/strings.properties +8 -1
upgini/utils/target_utils.py +96 -46
{upgini-1.2.14a3616.dev3.dist-info → upgini-1.2.15.dist-info}/METADATA +1 -1
{upgini-1.2.14a3616.dev3.dist-info → upgini-1.2.15.dist-info}/RECORD +10 -10
{upgini-1.2.14a3616.dev3.dist-info → upgini-1.2.15.dist-info}/WHEEL +1 -1
{upgini-1.2.14a3616.dev3.dist-info → upgini-1.2.15.dist-info}/licenses/LICENSE +0 -0

upgini/__about__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "1.2.14a3616.dev3"
1	+ __version__ = "1.2.15"

upgini/dataset.py CHANGED Viewed

@@ -53,7 +53,8 @@ class Dataset:  # (pd.DataFrame):
     FIT_SAMPLE_THRESHOLD = 200_000
     FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
     FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
-    MIN_SAMPLE_THRESHOLD = 5_000
+    BINARY_MIN_SAMPLE_THRESHOLD = 5_000
+    MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
     IMBALANCE_THESHOLD = 0.6
     BINARY_BOOTSTRAP_LOOPS = 5
     MULTICLASS_BOOTSTRAP_LOOPS = 2
@@ -225,7 +226,7 @@ class Dataset:  # (pd.DataFrame):
             train_segment = self.data
         if self.task_type == ModelTaskType.MULTICLASS or (
-            self.task_type == ModelTaskType.BINARY and len(train_segment) > self.MIN_SAMPLE_THRESHOLD
+            self.task_type == ModelTaskType.BINARY and len(train_segment) > self.BINARY_MIN_SAMPLE_THRESHOLD
         ):
             count = len(train_segment)
             target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
@@ -253,6 +254,7 @@ class Dataset:  # (pd.DataFrame):
             min_class_percent = self.IMBALANCE_THESHOLD / target_classes_count
             min_class_threshold = min_class_percent * count
+            # If min class count less than 30% for binary or (60 / classes_count)% for multiclass
             if min_class_count < min_class_threshold:
                 self.imbalanced = True
                 self.data = balance_undersample(
@@ -260,7 +262,8 @@ class Dataset:  # (pd.DataFrame):
                     target_column=target_column,
                     task_type=self.task_type,
                     random_state=self.random_state,
-                    imbalance_threshold=self.IMBALANCE_THESHOLD,
+                    binary_min_sample_threshold=self.BINARY_MIN_SAMPLE_THRESHOLD,
+                    multiclass_min_sample_threshold=self.MULTICLASS_MIN_SAMPLE_THRESHOLD,
                     binary_bootstrap_loops=self.BINARY_BOOTSTRAP_LOOPS,
                     multiclass_bootstrap_loops=self.MULTICLASS_BOOTSTRAP_LOOPS,
                     logger=self.logger,

upgini/features_enricher.py CHANGED Viewed

@@ -1577,8 +1577,8 @@ class FeaturesEnricher(TransformerMixin):
             df = generator.generate(df)
             generated_features.extend(generator.generated_features)
-        normalizer = Normalizer(search_keys, generated_features, self.bundle, self.logger, self.warning_counter)
-        df = normalizer.normalize(df)
+        normalizer = Normalizer(self.bundle, self.logger, self.warning_counter)
+        df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
         columns_renaming = normalizer.columns_renaming
         df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
@@ -2017,10 +2017,8 @@ class FeaturesEnricher(TransformerMixin):
                 df = generator.generate(df)
                 generated_features.extend(generator.generated_features)
-            normalizer = Normalizer(
-                search_keys, generated_features, self.bundle, self.logger, self.warning_counter, silent_mode
-            )
-            df = normalizer.normalize(df)
+            normalizer = Normalizer(self.bundle, self.logger, self.warning_counter, silent_mode)
+            df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
             columns_renaming = normalizer.columns_renaming
             # Don't pass all features in backend on transform
@@ -2449,16 +2447,13 @@ class FeaturesEnricher(TransformerMixin):
         if is_numeric_dtype(df[self.TARGET_NAME]) and has_date:
             self._validate_PSI(df.sort_values(by=maybe_date_column))
-        normalizer = Normalizer(
-            self.fit_search_keys, self.fit_generated_features, self.bundle, self.logger, self.warning_counter
+        normalizer = Normalizer(self.bundle, self.logger, self.warning_counter)
+        df, self.fit_search_keys, self.fit_generated_features = normalizer.normalize(
+            df, self.fit_search_keys, self.fit_generated_features
         )
-        df = normalizer.normalize(df)
-        columns_renaming = normalizer.columns_renaming
-        self.fit_columns_renaming = columns_renaming
+        self.fit_columns_renaming = normalizer.columns_renaming
-        self.__adjust_cv(
-            df, normalizer.search_keys, self.model_task_type
-        )
+        self.__adjust_cv(df)
         df = remove_fintech_duplicates(
             df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
@@ -2472,7 +2467,7 @@ class FeaturesEnricher(TransformerMixin):
         self.df_with_original_index = df.copy()
         # TODO check maybe need to drop _time column from df_with_original_index
-        df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys, columns_renaming)
+        df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys, self.fit_columns_renaming)
         # Convert EMAIL to HEM after unnesting to do it only with one column
         email_column = self._get_email_column(self.fit_search_keys)
@@ -2482,7 +2477,7 @@ class FeaturesEnricher(TransformerMixin):
                 email_column,
                 hem_column,
                 self.fit_search_keys,
-                columns_renaming,
+                self.fit_columns_renaming,
                 list(unnest_search_keys.keys()),
                 self.logger,
             )
@@ -2493,7 +2488,7 @@ class FeaturesEnricher(TransformerMixin):
             converter = IpSearchKeyConverter(
                 ip_column,
                 self.fit_search_keys,
-                columns_renaming,
+                self.fit_columns_renaming,
                 list(unnest_search_keys.keys()),
                 self.bundle,
                 self.logger,
@@ -2524,7 +2519,7 @@ class FeaturesEnricher(TransformerMixin):
         features_columns = [c for c in df.columns if c not in non_feature_columns]
         features_to_drop = FeaturesValidator(self.logger).validate(
-            df, features_columns, self.generate_features, self.warning_counter, columns_renaming
+            df, features_columns, self.generate_features, self.warning_counter, self.fit_columns_renaming
         )
         self.fit_dropped_features.update(features_to_drop)
         df = df.drop(columns=features_to_drop)
@@ -2565,7 +2560,7 @@ class FeaturesEnricher(TransformerMixin):
             rest_client=self.rest_client,
             logger=self.logger,
         )
-        dataset.columns_renaming = columns_renaming
+        dataset.columns_renaming = self.fit_columns_renaming
         self.passed_features = [
             column for column, meaning_type in meaning_types.items() if meaning_type == FileColumnMeaningType.FEATURE
@@ -2712,22 +2707,22 @@ class FeaturesEnricher(TransformerMixin):
         if not self.warning_counter.has_warnings():
             self.__display_support_link(self.bundle.get("all_ok_community_invite"))
-    def __adjust_cv(self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], model_task_type: ModelTaskType):
-        date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
+    def __adjust_cv(self, df: pd.DataFrame):
+        date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
         # Check Multivariate time series
         if (
             self.cv is None
             and date_column
-            and model_task_type == ModelTaskType.REGRESSION
-            and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(search_keys.keys())) == 0
-            and is_blocked_time_series(df, date_column, list(search_keys.keys()) + [TARGET])
+            and self.model_task_type == ModelTaskType.REGRESSION
+            and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys())) == 0
+            and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
         ):
             msg = self.bundle.get("multivariate_timeseries_detected")
             self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
-        elif self.cv is None and model_task_type != ModelTaskType.REGRESSION:
+        elif self.cv is None and self.model_task_type != ModelTaskType.REGRESSION:
             msg = self.bundle.get("group_k_fold_in_classification")
             self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
-            group_columns = self._get_group_columns(df, search_keys)
+            group_columns = self._get_group_columns(df, self.fit_search_keys)
             self.runtime_parameters.properties["cv_params.group_columns"] = ",".join(group_columns)
             self.runtime_parameters.properties["cv_params.shuffle_kfold"] = "True"

upgini/normalizer/normalize_utils.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import hashlib
 from logging import Logger, getLogger
-from typing import Dict, List
+from typing import Dict, List, Tuple
 import numpy as np
 import pandas as pd
@@ -35,22 +35,25 @@ class Normalizer:
     def __init__(
         self,
-        search_keys: Dict[str, SearchKey],
-        generated_features: List[str],
         bundle: ResourceBundle = None,
         logger: Logger = None,
         warnings_counter: WarningCounter = None,
         silent_mode=False,
     ):
-        self.search_keys = search_keys
-        self.generated_features = generated_features
         self.bundle = bundle or get_custom_bundle()
         self.logger = logger or getLogger()
         self.warnings_counter = warnings_counter or WarningCounter()
         self.silent_mode = silent_mode
         self.columns_renaming = {}
+        self.search_keys = {}
+        self.generated_features = []
+    def normalize(
+        self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], generated_features: List[str]
+    ) -> Tuple[pd.DataFrame, Dict[str, SearchKey], List[str]]:
+        self.search_keys = search_keys.copy()
+        self.generated_features = generated_features.copy()
-    def normalize(self, df: pd.DataFrame) -> pd.DataFrame:
         df = df.copy()
         df = self._rename_columns(df)
@@ -68,21 +71,25 @@ class Normalizer:
         df = self.__convert_features_types(df)
-        return df
+        return df, self.search_keys, self.generated_features
     def _rename_columns(self, df: pd.DataFrame):
         # logger.info("Replace restricted symbols in column names")
         new_columns = []
         dup_counter = 0
         for column in df.columns:
-            if column in [
-                TARGET,
-                EVAL_SET_INDEX,
-                SYSTEM_RECORD_ID,
-                ENTITY_SYSTEM_RECORD_ID,
-                SEARCH_KEY_UNNEST,
-                DateTimeSearchKeyConverter.DATETIME_COL,
-            ] + self.generated_features:
+            if (
+                column
+                in [
+                    TARGET,
+                    EVAL_SET_INDEX,
+                    SYSTEM_RECORD_ID,
+                    ENTITY_SYSTEM_RECORD_ID,
+                    SEARCH_KEY_UNNEST,
+                    DateTimeSearchKeyConverter.DATETIME_COL,
+                ]
+                + self.generated_features
+            ):
                 self.columns_renaming[column] = column
                 new_columns.append(column)
                 continue

upgini/resource_bundle/strings.properties CHANGED Viewed

@@ -203,11 +203,18 @@ email_detected=Emails detected in column `{}`. It will be used as a search key\n
 email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
 phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
 phone_detected_not_registered=\nWARNING: Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
-target_type_detected=\nDetected task type: {}\n
+target_type_detected=\nDetected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
+binary_target_reason=only two unique label-values observed
+non_numeric_multiclass_reason=non-numeric label values observed
+few_unique_label_multiclass_reason=few unique label-values observed and can be considered as categorical
+date_search_key_regression_reason=date search key is present, treating as regression
+many_unique_label_regression_reason=many unique label-values or non-integer floating point values observed
+limited_int_multiclass_reason=integer-like values with limited unique values observed
 # all_ok_community_invite=Chat with us in Slack community:
 all_ok_community_invite=❓ Support request
 too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
 imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
+imbalanced_target=\nWARNING: Target is imbalanced and will be undersampled. Frequency of the rarest class `{}` is {}
 loss_selection_info=Using loss `{}` for feature selection
 loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator

upgini/utils/target_utils.py CHANGED Viewed

@@ -24,49 +24,83 @@ def define_task(
 ) -> ModelTaskType:
     if logger is None:
         logger = logging.getLogger()
+    # Replace inf and -inf with NaN to handle extreme values correctly
+    y = y.replace([np.inf, -np.inf], np.nan, inplace=False)
+    # Drop NaN values from the target
     target = y.dropna()
+    # Check if target is numeric and finite
     if is_numeric_dtype(target):
         target = target.loc[np.isfinite(target)]
     else:
+        # If not numeric, drop empty strings as well
         target = target.loc[target != ""]
+    # Raise error if there are no valid values left in the target
     if len(target) == 0:
         raise ValidationError(bundle.get("empty_target"))
+    # Count unique values in the target
     target_items = target.nunique()
+    # Raise error if all target values are the same
     if target_items == 1:
         raise ValidationError(bundle.get("dataset_constant_target"))
+    reason = ""  # Will store the reason for selecting the task type
+    # Binary classification case: exactly two unique values
     if target_items == 2:
         task = ModelTaskType.BINARY
+        reason = bundle.get("binary_target_reason")
     else:
+        # Attempt to convert target to numeric
         try:
             target = pd.to_numeric(target)
             is_numeric = True
         except Exception:
             is_numeric = False
-        # If any value is non numeric - multiclass
+        # If target cannot be converted to numeric, assume multiclass classification
         if not is_numeric:
             task = ModelTaskType.MULTICLASS
+            reason = bundle.get("non_numeric_multiclass_reason")
         else:
+            # Multiclass classification: few unique values and integer encoding
             if target.nunique() <= 50 and is_int_encoding(target.unique()):
                 task = ModelTaskType.MULTICLASS
+                reason = bundle.get("few_unique_label_multiclass_reason")
+            # Regression case: if there is date, assume regression
             elif has_date:
                 task = ModelTaskType.REGRESSION
+                reason = bundle.get("date_search_key_regression_reason")
             else:
+                # Remove zero values and recalculate unique ratio
                 non_zero_target = target[target != 0]
                 target_items = non_zero_target.nunique()
                 target_ratio = target_items / len(non_zero_target)
+                # Use unique_ratio to determine whether to classify as regression or multiclass
                 if (
-                    (target.dtype.kind == "f" and np.any(target != target.astype(int)))  # any non integer
+                    (target.dtype.kind == "f" and np.any(target != target.astype(int)))  # Non-integer float values
                     or target_items > 50
-                    or target_ratio > 0.2
+                    or target_ratio > 0.2  # If non-zero values have high ratio of uniqueness
                 ):
                     task = ModelTaskType.REGRESSION
+                    reason = bundle.get("many_unique_label_regression_reason")
                 else:
                     task = ModelTaskType.MULTICLASS
+                    reason = bundle.get("limited_int_multiclass_reason")
-    logger.info(f"Detected task type: {task}")
+    # Log or print the reason for the selected task type
+    logger.info(f"Detected task type: {task} (Reason: {reason})")
+    # Print task type and reason if silent mode is off
     if not silent:
-        print(bundle.get("target_type_detected").format(task))
+        print(bundle.get("target_type_detected").format(task, reason))
     return task
@@ -81,8 +115,8 @@ def balance_undersample(
     target_column: str,
     task_type: ModelTaskType,
     random_state: int,
-    imbalance_threshold: int = 0.2,
-    min_sample_threshold: int = 5000,
+    binary_min_sample_threshold: int = 5000,
+    multiclass_min_sample_threshold: int = 25000,
     binary_bootstrap_loops: int = 5,
     multiclass_bootstrap_loops: int = 2,
     logger: Optional[logging.Logger] = None,
@@ -96,52 +130,60 @@ def balance_undersample(
     if SYSTEM_RECORD_ID not in df.columns:
         raise Exception("System record id must be presented for undersampling")
-    count = len(df)
+    # count = len(df)
     target = df[target_column].copy()
-    target_classes_count = target.nunique()
+    # target_classes_count = target.nunique()
     vc = target.value_counts()
     max_class_value = vc.index[0]
     min_class_value = vc.index[len(vc) - 1]
     max_class_count = vc[max_class_value]
     min_class_count = vc[min_class_value]
+    num_classes = len(vc)
-    min_class_percent = imbalance_threshold / target_classes_count
-    min_class_threshold = int(min_class_percent * count)
+    # min_class_percent = imbalance_threshold / target_classes_count
+    # min_class_threshold = int(min_class_percent * count)
     resampled_data = df
     df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
     if task_type == ModelTaskType.MULTICLASS:
-        # Sort classes by rows count and find 25% quantile class
-        classes = vc.index
-        quantile25_idx = int(0.75 * len(classes)) - 1
-        quantile25_class = classes[quantile25_idx]
-        quantile25_class_cnt = vc[quantile25_class]
-        if max_class_count > (quantile25_class_cnt * multiclass_bootstrap_loops):
-            msg = bundle.get("imbalance_multiclass").format(quantile25_class, quantile25_class_cnt)
+        if len(df) > multiclass_min_sample_threshold and max_class_count > (
+            min_class_count * multiclass_bootstrap_loops
+        ):
+            # msg = bundle.get("imbalance_multiclass").format(min_class_value, min_class_count)
+            msg = bundle.get("imbalanced_target").format(min_class_value, min_class_count)
             logger.warning(msg)
             print(msg)
             if warning_counter:
                 warning_counter.increment()
-            # 25% and lower classes will stay as is. Higher classes will be downsampled
             sample_strategy = dict()
-            for class_idx in range(quantile25_idx):
-                # compare class count with count_of_quantile25_class * 2
-                class_value = classes[class_idx]
+            for class_value in vc.index:
+                if class_value == min_class_value:
+                    continue
                 class_count = vc[class_value]
-                sample_strategy[class_value] = min(class_count, quantile25_class_cnt * multiclass_bootstrap_loops)
+                sample_size = min(
+                    class_count,
+                    multiclass_bootstrap_loops
+                    * (
+                        min_class_count
+                        + max((multiclass_min_sample_threshold - num_classes * min_class_count) / (num_classes - 1), 0)
+                    ),
+                )
+                sample_strategy[class_value] = int(sample_size)
+            logger.info(f"Rebalance sample strategy: {sample_strategy}. Min class count: {min_class_count}")
             sampler = RandomUnderSampler(sampling_strategy=sample_strategy, random_state=random_state)
             X = df[SYSTEM_RECORD_ID]
             X = X.to_frame(SYSTEM_RECORD_ID)
             new_x, _ = sampler.fit_resample(X, target)  # type: ignore
             resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
-    elif len(df) > min_sample_threshold and min_class_count < min_sample_threshold / 2:
-        msg = bundle.get("dataset_rarest_class_less_threshold").format(
-            min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
-        )
+    elif len(df) > binary_min_sample_threshold:
+        # msg = bundle.get("dataset_rarest_class_less_threshold").format(
+        #     min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
+        # )
+        msg = bundle.get("imbalanced_target").format(min_class_value, min_class_count)
         logger.warning(msg)
         print(msg)
         if warning_counter:
@@ -150,30 +192,38 @@ def balance_undersample(
         # fill up to min_sample_threshold by majority class
         minority_class = df[df[target_column] == min_class_value]
         majority_class = df[df[target_column] != min_class_value]
-        sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
+        # sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
+        sample_size = min(
+            max_class_count,
+            binary_bootstrap_loops * (min_class_count + max(binary_min_sample_threshold - 2 * min_class_count, 0)),
+        )
+        logger.info(
+            f"Min class count: {min_class_count}. Max class count: {max_class_count}."
+            f" Rebalance sample size: {sample_size}"
+        )
         sampled_majority_class = majority_class.sample(n=sample_size, random_state=random_state)
         resampled_data = df[
             (df[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
             | (df[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
         ]
-    elif max_class_count > min_class_count * binary_bootstrap_loops:
-        msg = bundle.get("dataset_rarest_class_less_threshold").format(
-            min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
-        )
-        logger.warning(msg)
-        print(msg)
-        if warning_counter:
-            warning_counter.increment()
-        sampler = RandomUnderSampler(
-            sampling_strategy={max_class_value: binary_bootstrap_loops * min_class_count}, random_state=random_state
-        )
-        X = df[SYSTEM_RECORD_ID]
-        X = X.to_frame(SYSTEM_RECORD_ID)
-        new_x, _ = sampler.fit_resample(X, target)  # type: ignore
-        resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
+    # elif max_class_count > min_class_count * binary_bootstrap_loops:
+    #     msg = bundle.get("dataset_rarest_class_less_threshold").format(
+    #         min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
+    #     )
+    #     logger.warning(msg)
+    #     print(msg)
+    #     if warning_counter:
+    #         warning_counter.increment()
+    #     sampler = RandomUnderSampler(
+    #         sampling_strategy={max_class_value: binary_bootstrap_loops * min_class_count}, random_state=random_state
+    #     )
+    #     X = df[SYSTEM_RECORD_ID]
+    #     X = X.to_frame(SYSTEM_RECORD_ID)
+    #     new_x, _ = sampler.fit_resample(X, target)  # type: ignore
+    #     resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
     logger.info(f"Shape after rebalance resampling: {resampled_data}")
     return resampled_data

{upgini-1.2.14a3616.dev3.dist-info → upgini-1.2.15.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.14a3616.dev3
+Version: 1.2.15
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/

{upgini-1.2.14a3616.dev3.dist-info → upgini-1.2.15.dist-info}/RECORD RENAMED Viewed

@@ -1,9 +1,9 @@
-upgini/__about__.py,sha256=r3lE_a_du_MY_PJ07BeAX4zN5ZZJoiV-YXe1uJzNwTo,33
+upgini/__about__.py,sha256=Q6rDLuL8XHKQggYBtRCtxzpPQJgFYWn4x0gcVlH7H4g,23
 upgini/__init__.py,sha256=M64LwQTBa-5Jz24Zm2h8rWwlKQQ1J8nP7gGgIciS0WU,589
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
-upgini/dataset.py,sha256=olZ-OHSfBNoBSCo7R5t7uCLukI2nO7afpx_A-HCiJLk,31067
+upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=tGdWQdCgilWlG-sDebIBxQ_OMpnOqg8mTzxCj7Xp-yo,188320
+upgini/features_enricher.py,sha256=vRC7g6n6XQxSrvzXk6NJjP0ZytDQhWR4sTAo4Hp7gmA,188319
 upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
 upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
 upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
@@ -27,10 +27,10 @@ upgini/data_source/data_source_publisher.py,sha256=X-8aGtVgzGmxyXkMVBoBLIGDMb4lY
 upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
 upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
 upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-upgini/normalizer/normalize_utils.py,sha256=bHRPWCNrUvt2R9qMX6dZFCJ0i8ENVCQ2Rw3dHH9IJEg,7447
+upgini/normalizer/normalize_utils.py,sha256=Lv75lq7M46z9cAIutwkdKZtPZkWblgoRzToAJ1BwY8A,7709
 upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
 upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
-upgini/resource_bundle/strings.properties,sha256=9kvmcUrsSFUCrzOiN0Ozf-lQ2H8Igz5gATUPoHMOaU4,26456
+upgini/resource_bundle/strings.properties,sha256=eqJP6bGu12zFuQJqMY03QbMhppcdwIfL2bsJWaqmuZ4,27221
 upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
 upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -54,10 +54,10 @@ upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,1
 upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
 upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
 upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
-upgini/utils/target_utils.py,sha256=BVtDmrmFMKerSUWaNOIEdzsYHIFiODdpnWbE50QDPDc,7864
+upgini/utils/target_utils.py,sha256=qHzZRmICFbLNCrmVqGkaBcjm91L2ERRZMppci36acV4,10085
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
-upgini-1.2.14a3616.dev3.dist-info/METADATA,sha256=t3fSIzaoSthUHfOhJmHqz45r_3UpZhF2ur9cFekdcA8,48587
-upgini-1.2.14a3616.dev3.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
-upgini-1.2.14a3616.dev3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.14a3616.dev3.dist-info/RECORD,,
+upgini-1.2.15.dist-info/METADATA,sha256=Hua2FUNftyzzpi9eR090MFJ-5F8S_KS_5SrZhwOUgco,48577
+upgini-1.2.15.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.15.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.15.dist-info/RECORD,,

{upgini-1.2.14a3616.dev3.dist-info → upgini-1.2.15.dist-info}/WHEEL RENAMED Viewed

@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: hatchling 1.24.2
+Generator: hatchling 1.25.0
 Root-Is-Purelib: true
 Tag: py3-none-any

{upgini-1.2.14a3616.dev3.dist-info → upgini-1.2.15.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

upgini 1.2.14a3616.dev3__py3-none-any.whl → 1.2.15__py3-none-any.whl

Potentially problematic release.

upgini 1.2.14a3616.dev3__py3-none-any.whl → 1.2.15__py3-none-any.whl