PyPI - upgini - Versions diffs - 1.2.14a1__tar.gz → 1.2.14a2__tar.gz - Mend

upgini 1.2.14a1.tar.gz → 1.2.14a2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (65) hide show

{upgini-1.2.14a1 → upgini-1.2.14a2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.14a1
+Version: 1.2.14a2
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/

upgini-1.2.14a2/src/upgini/__about__.py ADDED Viewed

@@ -0,0 +1 @@
+__version__ = "1.2.14a2"

{upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/dataset.py RENAMED Viewed

@@ -53,7 +53,8 @@ class Dataset:  # (pd.DataFrame):
     FIT_SAMPLE_THRESHOLD = 200_000
     FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
     FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
-    MIN_SAMPLE_THRESHOLD = 5_000
+    BINARY_MIN_SAMPLE_THRESHOLD = 5_000
+    MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
     IMBALANCE_THESHOLD = 0.6
     BINARY_BOOTSTRAP_LOOPS = 5
     MULTICLASS_BOOTSTRAP_LOOPS = 2
@@ -225,7 +226,7 @@ class Dataset:  # (pd.DataFrame):
             train_segment = self.data
         if self.task_type == ModelTaskType.MULTICLASS or (
-            self.task_type == ModelTaskType.BINARY and len(train_segment) > self.MIN_SAMPLE_THRESHOLD
+            self.task_type == ModelTaskType.BINARY and len(train_segment) > self.BINARY_MIN_SAMPLE_THRESHOLD
         ):
             count = len(train_segment)
             target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
@@ -261,7 +262,8 @@ class Dataset:  # (pd.DataFrame):
                     target_column=target_column,
                     task_type=self.task_type,
                     random_state=self.random_state,
-                    imbalance_threshold=self.IMBALANCE_THESHOLD,
+                    binary_min_sample_threshold=self.BINARY_MIN_SAMPLE_THRESHOLD,
+                    multiclass_min_sample_threshold=self.MULTICLASS_MIN_SAMPLE_THRESHOLD,
                     binary_bootstrap_loops=self.BINARY_BOOTSTRAP_LOOPS,
                     multiclass_bootstrap_loops=self.MULTICLASS_BOOTSTRAP_LOOPS,
                     logger=self.logger,

{upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/resource_bundle/strings.properties RENAMED Viewed

@@ -208,6 +208,7 @@ target_type_detected=\nDetected task type: {}\n
 all_ok_community_invite=❓ Support request
 too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
 imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
+imbalanced_target=\nWARNING: Target is imbalanced and will be undersampled. Frequency of the rarest class `{}` is {}
 loss_selection_info=Using loss `{}` for feature selection
 loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator

{upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/target_utils.py RENAMED Viewed

@@ -81,8 +81,8 @@ def balance_undersample(
     target_column: str,
     task_type: ModelTaskType,
     random_state: int,
-    imbalance_threshold: int = 0.2,
-    min_sample_threshold: int = 5000,
+    binary_min_sample_threshold: int = 5000,
+    multiclass_min_sample_threshold: int = 25000,
     binary_bootstrap_loops: int = 5,
     multiclass_bootstrap_loops: int = 2,
     logger: Optional[logging.Logger] = None,
@@ -96,52 +96,59 @@ def balance_undersample(
     if SYSTEM_RECORD_ID not in df.columns:
         raise Exception("System record id must be presented for undersampling")
-    count = len(df)
+    # count = len(df)
     target = df[target_column].copy()
-    target_classes_count = target.nunique()
+    # target_classes_count = target.nunique()
     vc = target.value_counts()
     max_class_value = vc.index[0]
     min_class_value = vc.index[len(vc) - 1]
     max_class_count = vc[max_class_value]
     min_class_count = vc[min_class_value]
+    num_classes = len(vc)
-    min_class_percent = imbalance_threshold / target_classes_count
-    min_class_threshold = int(min_class_percent * count)
+    # min_class_percent = imbalance_threshold / target_classes_count
+    # min_class_threshold = int(min_class_percent * count)
     resampled_data = df
     df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
     if task_type == ModelTaskType.MULTICLASS:
-        # Sort classes by rows count and find 25% quantile class
-        classes = vc.index
-        quantile25_idx = int(0.75 * len(classes)) - 1
-        quantile25_class = classes[quantile25_idx]
-        quantile25_class_cnt = vc[quantile25_class]
-        if max_class_count > (quantile25_class_cnt * multiclass_bootstrap_loops):
-            msg = bundle.get("imbalance_multiclass").format(quantile25_class, quantile25_class_cnt)
+        if len(df) > multiclass_min_sample_threshold and max_class_count > (
+            min_class_count * multiclass_bootstrap_loops
+        ):
+            # msg = bundle.get("imbalance_multiclass").format(min_class_value, min_class_count)
+            msg = bundle.get("imbalanced_target").format(min_class_value, min_class_count)
             logger.warning(msg)
             print(msg)
             if warning_counter:
                 warning_counter.increment()
-            # 25% and lower classes will stay as is. Higher classes will be downsampled
             sample_strategy = dict()
-            for class_idx in range(quantile25_idx):
-                # compare class count with count_of_quantile25_class * 2
-                class_value = classes[class_idx]
+            for class_value in vc.index:
+                if class_value == min_class_value:
+                    continue
                 class_count = vc[class_value]
-                sample_strategy[class_value] = min(class_count, quantile25_class_cnt * multiclass_bootstrap_loops)
+                sample_size = min(
+                    class_count,
+                    multiclass_bootstrap_loops
+                    * (
+                        min_class_count
+                        + max((multiclass_min_sample_threshold - num_classes * min_class_count) / (num_classes - 1), 0)
+                    ),
+                )
+                sample_strategy[class_value] = int(sample_size)
             sampler = RandomUnderSampler(sampling_strategy=sample_strategy, random_state=random_state)
             X = df[SYSTEM_RECORD_ID]
             X = X.to_frame(SYSTEM_RECORD_ID)
             new_x, _ = sampler.fit_resample(X, target)  # type: ignore
             resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
-    elif len(df) > min_sample_threshold and min_class_count < min_sample_threshold / 2:
-        msg = bundle.get("dataset_rarest_class_less_threshold").format(
-            min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
-        )
+    elif len(df) > binary_min_sample_threshold:
+        # msg = bundle.get("dataset_rarest_class_less_threshold").format(
+        #     min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
+        # )
+        msg = bundle.get("imbalanced_target").format(min_class_value, min_class_count)
         logger.warning(msg)
         print(msg)
         if warning_counter:
@@ -153,7 +160,7 @@ def balance_undersample(
         # sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
         sample_size = min(
             max_class_count,
-            binary_bootstrap_loops * (min_class_count + max(min_sample_threshold - 2 * min_class_count, 0)),
+            binary_bootstrap_loops * (min_class_count + max(binary_min_sample_threshold - 2 * min_class_count, 0)),
         )
         sampled_majority_class = majority_class.sample(n=sample_size, random_state=random_state)
         resampled_data = df[