upgini 1.1.261a3250.post2.tar.gz → 1.1.262.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of upgini has been flagged as potentially problematic.
- {upgini-1.1.261a3250.post2/src/upgini.egg-info → upgini-1.1.262}/PKG-INFO +1 -1
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/setup.py +1 -1
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/autofe/all_operands.py +0 -3
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/data_source/data_source_publisher.py +10 -1
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/dataset.py +21 -58
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/features_enricher.py +1 -1
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/search_task.py +1 -1
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/utils/datetime_utils.py +1 -1
- upgini-1.1.262/src/upgini/utils/target_utils.py +183 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262/src/upgini.egg-info}/PKG-INFO +1 -1
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini.egg-info/SOURCES.txt +0 -2
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/tests/test_etalon_validation.py +5 -3
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/tests/test_features_enricher.py +1 -0
- upgini-1.1.262/tests/test_target_utils.py +134 -0
- upgini-1.1.261a3250.post2/src/upgini/autofe/date.py +0 -42
- upgini-1.1.261a3250.post2/src/upgini/utils/target_utils.py +0 -74
- upgini-1.1.261a3250.post2/tests/test_autofe_operands.py +0 -28
- upgini-1.1.261a3250.post2/tests/test_target_utils.py +0 -74
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/LICENSE +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/README.md +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/pyproject.toml +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/setup.cfg +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/__init__.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/ads.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/errors.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/http.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/mdc/context.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/metadata.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/metrics.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/normalizer/phone_normalizer.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/sampler/base.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/spinner.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/version_validator.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini.egg-info/dependency_links.txt +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini.egg-info/requires.txt +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini.egg-info/top_level.txt +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/tests/test_binary_dataset.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/tests/test_blocked_time_series.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/tests/test_categorical_dataset.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/tests/test_continuous_dataset.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/tests/test_country_utils.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/tests/test_custom_loss_utils.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/tests/test_datetime_utils.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/tests/test_email_utils.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/tests/test_metrics.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/tests/test_phone_utils.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/tests/test_postal_code_utils.py +0 -0
- {upgini-1.1.261a3250.post2 → upgini-1.1.262}/tests/test_widget.py +0 -0
{upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/autofe/all_operands.py
RENAMED

@@ -1,5 +1,4 @@
 from typing import Dict
-from upgini.autofe.date import DateDiff, DateDiffFuture
 from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
 from upgini.autofe.operand import Operand
 from upgini.autofe.unary import Abs, Log, Residual, Sqrt, Square, Sigmoid, Floor, Freq
@@ -36,8 +35,6 @@ ALL_OPERANDS: Dict[str, Operand] = {
         Operand(name="GroupByThenNUnique", output_type="int", is_vectorizable=True, is_grouping=True),
         Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
         Sim(),
-        DateDiff(),
-        DateDiffFuture(),
     ]
 }
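With this change the `DateDiff` and `DateDiffFuture` operands are no longer imported or registered, so the autofe operand registry in 1.1.262 simply does not contain them. A minimal check of that effect (assuming upgini 1.1.262 is installed; the filter below is an illustrative snippet, not an upgini API):

```python
# Minimal sketch: confirm the date-diff operands are gone from the registry.
# Assumes the ALL_OPERANDS mapping is keyed by operand name, as its type hint suggests.
from upgini.autofe.all_operands import ALL_OPERANDS

date_diff_ops = [name for name in ALL_OPERANDS if "date_diff" in name]
print(date_diff_ops)  # expected [] in 1.1.262; non-empty in 1.1.261a3250.post2
```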
{upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/data_source/data_source_publisher.py
RENAMED

@@ -40,7 +40,7 @@ class DataSourcePublisher:
         if logs_enabled:
             self.logger = LoggerFactory().get_logger(endpoint, api_key)
         else:
-            self.logger = logging.getLogger()
+            self.logger = logging.getLogger("muted_logger")
             self.logger.setLevel("FATAL")
 
     def place(
@@ -170,6 +170,7 @@ class DataSourcePublisher:
                 print(msg)
                 self.logger.info(msg)
                 self._rest_client.stop_ads_management_task(task_id, trace_id)
+            raise
         except Exception:
             self.logger.exception("Failed to register data table")
             raise
@@ -289,6 +290,7 @@ class DataSourcePublisher:
             raise ValidationError("One of arguments: bq_table_id or search_keys should be presented")
         if bq_table_id is not None and search_keys is not None:
             raise ValidationError("Only one argument could be presented: bq_table_id or search_keys")
+        task_id = None
         try:
             search_keys = [k.value.value for k in search_keys] if search_keys else None
             request = {"bqTableId": bq_table_id, "searchKeys": search_keys}
@@ -303,6 +305,13 @@ class DataSourcePublisher:
                 raise Exception("Failed to register ADS: " + status_response["errorMessage"])
 
             print("Uploading successfully finished")
+        except KeyboardInterrupt:
+            if task_id is not None:
+                msg = f"Stopping AdsManagementTask {task_id}"
+                print(msg)
+                self.logger.info(msg)
+                self._rest_client.stop_ads_management_task(task_id, trace_id)
+            raise
         except Exception:
             self.logger.exception(f"Failed to upload table {bq_table_id}")
             raise
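Both new `except KeyboardInterrupt` paths follow the same pattern: stop the server-side AdsManagementTask (when one has already been started) and then re-raise, so a Ctrl-C in the client does not leave an orphaned task running remotely. A stripped-down sketch of that pattern outside of upgini (all names below are hypothetical, not upgini APIs):

```python
# Hypothetical sketch of the cancellation pattern introduced above.
def run_remote_task(client, request):
    task_id = None  # set up front so the interrupt handler can always inspect it
    try:
        task_id = client.start_task(request)
        return client.poll_until_done(task_id)
    except KeyboardInterrupt:
        # Stop the remote task before propagating the interrupt to the caller.
        if task_id is not None:
            client.stop_task(task_id)
        raise
```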
{upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/dataset.py
RENAMED

@@ -39,10 +39,10 @@ from upgini.metadata import (
 )
 from upgini.normalizer.phone_normalizer import PhoneNormalizer
 from upgini.resource_bundle import ResourceBundle, get_custom_bundle
-from upgini.sampler.random_under_sampler import RandomUnderSampler
 from upgini.search_task import SearchTask
 from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
 from upgini.utils.email_utils import EmailSearchKeyConverter
+from upgini.utils.target_utils import balance_undersample
 
 try:
     from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
@@ -60,7 +60,9 @@ class Dataset:  # (pd.DataFrame):
     FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
     FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
     MIN_SAMPLE_THRESHOLD = 5_000
-    IMBALANCE_THESHOLD = 0.
+    IMBALANCE_THESHOLD = 0.6
+    BINARY_BOOTSTRAP_LOOPS = 5
+    MULTICLASS_BOOTSTRAP_LOOPS = 2
     MIN_TARGET_CLASS_ROWS = 100
     MAX_MULTICLASS_CLASS_COUNT = 100
     MIN_SUPPORTED_DATE_TS = 946684800000  # 2000-01-01
@@ -460,10 +462,8 @@ class Dataset:  # (pd.DataFrame):
             self.task_type == ModelTaskType.BINARY and len(train_segment) > self.MIN_SAMPLE_THRESHOLD
         ):
             count = len(train_segment)
-
-
-            target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, "")
-            target = train_segment[target_column].copy()
+            target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
+            target = train_segment[target_column]
             target_classes_count = target.nunique()
 
             if target_classes_count > self.MAX_MULTICLASS_CLASS_COUNT:
@@ -473,12 +473,9 @@ class Dataset:  # (pd.DataFrame):
                 self.logger.warning(msg)
                 raise ValidationError(msg)
 
-
-
-            if current_class_count < min_class_count:
-                min_class_count = current_class_count
-                min_class_value = v
+            vc = target.value_counts()
+            min_class_value = vc.index[len(vc) - 1]
+            min_class_count = vc[min_class_value]
 
             if min_class_count < self.MIN_TARGET_CLASS_ROWS:
                 msg = self.bundle.get("dataset_rarest_class_less_min").format(
@@ -491,53 +488,19 @@ class Dataset:  # (pd.DataFrame):
             min_class_threshold = min_class_percent * count
 
             if min_class_count < min_class_threshold:
-                msg = self.bundle.get("dataset_rarest_class_less_threshold").format(
-                    min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
-                )
-                self.logger.warning(msg)
-                print(msg)
-                self.warning_counter.increment()
-
-                train_segment = train_segment.copy().sort_values(by=SYSTEM_RECORD_ID)
-                if self.task_type == ModelTaskType.MULTICLASS:
-                    # Sort classes by rows count and find 25% quantile class
-                    classes = target.value_counts().index
-                    quantile25_idx = int(0.75 * len(classes))
-                    quantile25_class = classes[quantile25_idx]
-                    count_of_quantile25_class = len(target[target == quantile25_class])
-                    msg = self.bundle.get("imbalance_multiclass").format(quantile25_class, count_of_quantile25_class)
-                    self.logger.warning(msg)
-                    print(msg)
-                    # 25% and lower classes will stay as is. Higher classes will be downsampled
-                    parts = []
-                    for class_idx in range(quantile25_idx):
-                        sampled = train_segment[train_segment[target_column] == classes[class_idx]].sample(
-                            n=count_of_quantile25_class, random_state=self.random_state
-                        )
-                        parts.append(sampled)
-                    for class_idx in range(quantile25_idx, len(classes)):
-                        parts.append(train_segment[train_segment[target_column] == classes[class_idx]])
-                    resampled_data = pd.concat(parts)
-                elif self.task_type == ModelTaskType.BINARY and min_class_count < self.MIN_SAMPLE_THRESHOLD / 2:
-                    minority_class = train_segment[train_segment[target_column] == min_class_value]
-                    majority_class = train_segment[train_segment[target_column] != min_class_value]
-                    sampled_majority_class = majority_class.sample(
-                        n=self.MIN_SAMPLE_THRESHOLD - min_class_count, random_state=self.random_state
-                    )
-                    resampled_data = train_segment[
-                        (train_segment[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
-                        | (train_segment[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
-                    ]
-                else:
-                    sampler = RandomUnderSampler(random_state=self.random_state)
-                    X = train_segment[SYSTEM_RECORD_ID]
-                    X = X.to_frame(SYSTEM_RECORD_ID)
-                    new_x, _ = sampler.fit_resample(X, target)  # type: ignore
-                    resampled_data = train_segment[train_segment[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
-
-                self.data = resampled_data
-                self.logger.info(f"Shape after rebalance resampling: {self.data.shape}")
                 self.imbalanced = True
+                self.data = balance_undersample(
+                    df=train_segment,
+                    target_column=target_column,
+                    task_type=self.task_type,
+                    random_state=self.random_state,
+                    imbalance_threshold=self.IMBALANCE_THESHOLD,
+                    binary_bootstrap_loops=self.BINARY_BOOTSTRAP_LOOPS,
+                    multiclass_bootstrap_loops=self.MULTICLASS_BOOTSTRAP_LOOPS,
+                    logger=self.logger,
+                    bundle=self.bundle,
+                    warning_counter=self.warning_counter,
+                )
 
         # Resample over fit threshold
         if not self.imbalanced and EVAL_SET_INDEX in self.data.columns:
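The net effect of this refactor is that `Dataset.__resample` no longer carries its own undersampling logic; it delegates to `balance_undersample` from the new `upgini/utils/target_utils.py` module shown later in this diff. A rough usage sketch of that helper outside of `Dataset` (the data values are made up for illustration):

```python
# Rough sketch of the call Dataset now delegates to; data is illustrative only.
import pandas as pd

from upgini.metadata import SYSTEM_RECORD_ID, ModelTaskType
from upgini.utils.target_utils import balance_undersample

df = pd.DataFrame({
    SYSTEM_RECORD_ID: list(range(10)),
    "target": [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],  # 9 negatives, 1 positive
})

balanced = balance_undersample(
    df=df,
    target_column="target",
    task_type=ModelTaskType.BINARY,
    random_state=42,
    imbalance_threshold=0.6,       # Dataset.IMBALANCE_THESHOLD in 1.1.262
    binary_bootstrap_loops=5,      # Dataset.BINARY_BOOTSTRAP_LOOPS
    multiclass_bootstrap_loops=2,  # Dataset.MULTICLASS_BOOTSTRAP_LOOPS
)
print(balanced["target"].value_counts())  # majority class capped at 5x the minority count
```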
{upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/features_enricher.py
RENAMED

@@ -220,7 +220,7 @@ class FeaturesEnricher(TransformerMixin):
         if logs_enabled:
             self.logger = LoggerFactory().get_logger(endpoint, self._api_key, client_ip, client_visitorid)
         else:
-            self.logger = logging.getLogger()
+            self.logger = logging.getLogger("muted_logger")
             self.logger.setLevel("FATAL")
 
         if len(kwargs) > 0:
{upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/search_task.py
RENAMED

@@ -57,7 +57,7 @@ class SearchTask:
         if logger is not None:
             self.logger = logger
         else:
-            self.logger = logging.getLogger()
+            self.logger = logging.getLogger("muted_logger")
             self.logger.setLevel("FATAL")
         self.provider_metadata_v2: Optional[List[ProviderTaskMetadataV2]] = None
         self.unused_features_for_generation: Optional[List[str]] = None
{upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini/utils/datetime_utils.py
RENAMED

@@ -44,7 +44,7 @@ class DateTimeSearchKeyConverter:
         if logger is not None:
             self.logger = logger
         else:
-            self.logger = logging.getLogger()
+            self.logger = logging.getLogger("muted_logger")
             self.logger.setLevel("FATAL")
         self.generated_features: List[str] = []
         self.bundle = bundle or get_custom_bundle()
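The recurring `logging.getLogger()` → `logging.getLogger("muted_logger")` change (here and in `data_source_publisher.py`, `features_enricher.py`, and `search_task.py`) appears to address a side effect: raising the root logger to FATAL silences logging for the whole host application, whereas raising the level of a dedicated named logger only mutes upgini's own output. A small self-contained illustration of the difference (the `my_app` logger below is hypothetical):

```python
import logging

logging.basicConfig(level=logging.INFO)

# New behaviour: only the dedicated upgini logger is raised to FATAL.
muted = logging.getLogger("muted_logger")
muted.setLevel("FATAL")  # setLevel accepts level names as strings

app_logger = logging.getLogger("my_app")
app_logger.info("still visible")  # unaffected by the muted upgini logger
muted.info("suppressed")          # dropped: below the FATAL threshold

# Old behaviour (shown commented out): muting the root logger would also hide
# "still visible", because loggers without an explicit level inherit root's level.
# logging.getLogger().setLevel("FATAL")
```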
upgini-1.1.262/src/upgini/utils/target_utils.py
ADDED

@@ -0,0 +1,183 @@
+import logging
+from typing import Optional, Union
+
+import numpy as np
+import pandas as pd
+from pandas.api.types import is_numeric_dtype
+
+from upgini.errors import ValidationError
+from upgini.metadata import SYSTEM_RECORD_ID, ModelTaskType
+from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
+from upgini.sampler.random_under_sampler import RandomUnderSampler
+from upgini.utils.warning_counter import WarningCounter
+
+
+def correct_string_target(y: Union[pd.Series, np.ndarray]) -> Union[pd.Series, np.ndarray]:
+    if isinstance(y, pd.Series):
+        return y.astype(str).astype("category").cat.codes
+    elif isinstance(y, np.ndarray):
+        return pd.Series(y).astype(str).astype("category").cat.codes.values
+
+
+def define_task(
+    y: pd.Series, has_date: bool = False, logger: Optional[logging.Logger] = None, silent: bool = False
+) -> ModelTaskType:
+    if logger is None:
+        logger = logging.getLogger()
+    target = y.dropna()
+    if is_numeric_dtype(target):
+        target = target.loc[np.isfinite(target)]
+    else:
+        target = target.loc[target != ""]
+    if len(target) == 0:
+        raise ValidationError(bundle.get("empty_target"))
+    target_items = target.nunique()
+    if target_items == 1:
+        raise ValidationError(bundle.get("dataset_constant_target"))
+    if target_items == 2:
+        task = ModelTaskType.BINARY
+    else:
+        try:
+            target = pd.to_numeric(target)
+            is_numeric = True
+        except Exception:
+            is_numeric = False
+
+        # If any value is non numeric - multiclass
+        if not is_numeric:
+            task = ModelTaskType.MULTICLASS
+        else:
+            if target.nunique() <= 50 and is_int_encoding(target.unique()):
+                task = ModelTaskType.MULTICLASS
+            elif has_date:
+                task = ModelTaskType.REGRESSION
+            else:
+                non_zero_target = target[target != 0]
+                target_items = non_zero_target.nunique()
+                target_ratio = target_items / len(non_zero_target)
+                if (
+                    (target.dtype.kind == "f" and np.any(target != target.astype(int)))  # any non integer
+                    or target_items > 50
+                    or target_ratio > 0.2
+                ):
+                    task = ModelTaskType.REGRESSION
+                else:
+                    task = ModelTaskType.MULTICLASS
+
+    logger.info(f"Detected task type: {task}")
+    if not silent:
+        print(bundle.get("target_type_detected").format(task))
+    return task
+
+
+def is_int_encoding(unique_values):
+    return set(unique_values) == set(range(len(unique_values))) or set(unique_values) == set(
+        range(1, len(unique_values) + 1)
+    )
+
+
+def balance_undersample(
+    df: pd.DataFrame,
+    target_column: str,
+    task_type: ModelTaskType,
+    random_state: int,
+    imbalance_threshold: int = 0.2,
+    min_sample_threshold: int = 5000,
+    binary_bootstrap_loops: int = 5,
+    multiclass_bootstrap_loops: int = 2,
+    logger: Optional[logging.Logger] = None,
+    bundle: Optional[ResourceBundle] = None,
+    warning_counter: Optional[WarningCounter] = None,
+) -> pd.DataFrame:
+    if logger is None:
+        logger = logging.getLogger("muted_logger")
+        logger.setLevel("FATAL")
+    bundle = bundle or get_custom_bundle()
+    if SYSTEM_RECORD_ID not in df.columns:
+        raise Exception("System record id must be presented for undersampling")
+
+    count = len(df)
+    target = df[target_column].copy()
+    target_classes_count = target.nunique()
+
+    vc = target.value_counts()
+    max_class_value = vc.index[0]
+    min_class_value = vc.index[len(vc) - 1]
+    max_class_count = vc[max_class_value]
+    min_class_count = vc[min_class_value]
+
+    min_class_percent = imbalance_threshold / target_classes_count
+    min_class_threshold = min_class_percent * count
+
+    resampled_data = df
+    df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
+    if task_type == ModelTaskType.MULTICLASS:
+        # Sort classes by rows count and find 25% quantile class
+        classes = vc.index
+        quantile25_idx = int(0.75 * len(classes)) - 1
+        quantile25_class = classes[quantile25_idx]
+        quantile25_class_cnt = vc[quantile25_class]
+
+        if max_class_count > (quantile25_class_cnt * multiclass_bootstrap_loops):
+            msg = bundle.get("imbalance_multiclass").format(quantile25_class, quantile25_class_cnt)
+            logger.warning(msg)
+            print(msg)
+            if warning_counter:
+                warning_counter.increment()
+
+            # 25% and lower classes will stay as is. Higher classes will be downsampled
+            sample_strategy = dict()
+            for class_idx in range(quantile25_idx):
+                # compare class count with count_of_quantile25_class * 2
+                class_value = classes[class_idx]
+                class_count = vc[class_value]
+                sample_strategy[class_value] = min(class_count, quantile25_class_cnt * multiclass_bootstrap_loops)
+            sampler = RandomUnderSampler(
+                sampling_strategy=sample_strategy, random_state=random_state
+            )
+            X = df[SYSTEM_RECORD_ID]
+            X = X.to_frame(SYSTEM_RECORD_ID)
+            new_x, _ = sampler.fit_resample(X, target)  # type: ignore
+
+            resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
+    elif len(df) > min_sample_threshold and min_class_count < min_sample_threshold / 2:
+        msg = bundle.get("dataset_rarest_class_less_threshold").format(
+            min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
+        )
+        logger.warning(msg)
+        print(msg)
+        if warning_counter:
+            warning_counter.increment()
+
+        # fill up to min_sample_threshold by majority class
+        minority_class = df[df[target_column] == min_class_value]
+        majority_class = df[df[target_column] != min_class_value]
+        sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
+        sampled_majority_class = majority_class.sample(
+            n=sample_size, random_state=random_state
+        )
+        resampled_data = df[
+            (df[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
+            | (df[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
+        ]
+
+    elif max_class_count > min_class_count * binary_bootstrap_loops:
+        msg = bundle.get("dataset_rarest_class_less_threshold").format(
+            min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
+        )
+        logger.warning(msg)
+        print(msg)
+        if warning_counter:
+            warning_counter.increment()
+
+        sampler = RandomUnderSampler(
+            sampling_strategy={max_class_value: binary_bootstrap_loops * min_class_count}, random_state=random_state
+        )
+        X = df[SYSTEM_RECORD_ID]
+        X = X.to_frame(SYSTEM_RECORD_ID)
+        new_x, _ = sampler.fit_resample(X, target)  # type: ignore
+
+        resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
+
+    logger.info(f"Shape after rebalance resampling: {resampled_data}")
+    return resampled_data
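For orientation, a short usage sketch of the two public helpers added in this module, `define_task` and `balance_undersample`. The data and the expected outputs in the comments are illustrative, following the logic above and the new tests, rather than any documented guarantee:

```python
import pandas as pd

from upgini.metadata import SYSTEM_RECORD_ID, ModelTaskType
from upgini.utils.target_utils import balance_undersample, define_task

# Task type detection from a raw target series.
print(define_task(pd.Series([0, 1, 0, 1]), silent=True))          # ModelTaskType.BINARY
print(define_task(pd.Series([0.1, 2.3, 4.5, 6.7]), silent=True))  # ModelTaskType.REGRESSION

# Undersampling an imbalanced binary target, keyed by the service record id column.
df = pd.DataFrame({
    SYSTEM_RECORD_ID: list(range(22)),
    "target": [1] * 2 + [0] * 20,
})
balanced = balance_undersample(
    df, "target", ModelTaskType.BINARY, random_state=42, min_sample_threshold=4
)
print(balanced["target"].value_counts())  # majority downsampled to binary_bootstrap_loops (5) x 2 = 10 rows
```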
{upgini-1.1.261a3250.post2 → upgini-1.1.262}/src/upgini.egg-info/SOURCES.txt
RENAMED

@@ -23,7 +23,6 @@ src/upgini/ads_management/ads_manager.py
 src/upgini/autofe/__init__.py
 src/upgini/autofe/all_operands.py
 src/upgini/autofe/binary.py
-src/upgini/autofe/date.py
 src/upgini/autofe/feature.py
 src/upgini/autofe/groupby.py
 src/upgini/autofe/operand.py
@@ -64,7 +63,6 @@ src/upgini/utils/sklearn_ext.py
 src/upgini/utils/target_utils.py
 src/upgini/utils/track_info.py
 src/upgini/utils/warning_counter.py
-tests/test_autofe_operands.py
 tests/test_binary_dataset.py
 tests/test_blocked_time_series.py
 tests/test_categorical_dataset.py
{upgini-1.1.261a3250.post2 → upgini-1.1.262}/tests/test_etalon_validation.py
RENAMED

@@ -260,11 +260,13 @@ def test_imbalanced_target():
     }
     dataset.task_type = ModelTaskType.MULTICLASS
     dataset._Dataset__resample()
-    assert len(dataset) ==
+    assert len(dataset) == 1800
     value_counts = dataset.data["target"].value_counts()
     assert len(value_counts) == 4
-
-
+    assert value_counts["a"] == 100
+    assert value_counts["b"] == 400
+    assert value_counts["c"] == 500
+    assert value_counts["d"] == 800
 
 
 def test_fail_on_small_class_observations():
{upgini-1.1.261a3250.post2 → upgini-1.1.262}/tests/test_features_enricher.py
RENAMED

@@ -2163,6 +2163,7 @@ def test_idempotent_order_with_imbalanced_dataset(requests_mock: Mocker):
         pass
 
     actual_result_df = result_wrapper.df.sort_values(by="system_record_id").reset_index(drop=True)
+    # actual_result_df.to_parquet(expected_result_path)
     assert_frame_equal(actual_result_df, expected_result_df)
 
     for i in range(5):
upgini-1.1.262/tests/test_target_utils.py
ADDED

@@ -0,0 +1,134 @@
+import numpy as np
+import pandas as pd
+import pytest
+from pandas.testing import assert_frame_equal
+
+from upgini.errors import ValidationError
+from upgini.metadata import SYSTEM_RECORD_ID, TARGET, ModelTaskType
+from upgini.resource_bundle import bundle
+from upgini.utils.target_utils import balance_undersample, define_task
+
+
+def test_invalid_target():
+    y = pd.Series(["", "", ""])
+    with pytest.raises(ValidationError, match=bundle.get("empty_target")):
+        define_task(y)
+
+    y = pd.Series([np.nan, np.inf, -np.inf])
+    with pytest.raises(ValidationError, match=bundle.get("empty_target")):
+        define_task(y)
+
+    y = pd.Series([1, 1, 1, 1, 1])
+    with pytest.raises(ValidationError, match=bundle.get("dataset_constant_target")):
+        define_task(y)
+
+
+def test_define_binary_task_type():
+    y = pd.Series([0, 1, 0, 1, 0, 1])
+    assert define_task(y, False) == ModelTaskType.BINARY
+    assert define_task(y, True) == ModelTaskType.BINARY
+
+    y = pd.Series(["a", "b", "a", "b", "a"])
+    assert define_task(y, False) == ModelTaskType.BINARY
+    assert define_task(y, True) == ModelTaskType.BINARY
+
+
+def test_define_multiclass_task_type():
+    y = pd.Series(range(1, 51))
+    assert define_task(y, False) == ModelTaskType.MULTICLASS
+    assert define_task(y, True) == ModelTaskType.MULTICLASS
+
+    y = pd.Series([float(x) for x in range(1, 51)])
+    assert define_task(y, False) == ModelTaskType.MULTICLASS
+    assert define_task(y, True) == ModelTaskType.MULTICLASS
+
+    y = pd.Series(range(0, 50))
+    assert define_task(y, False) == ModelTaskType.MULTICLASS
+    assert define_task(y, True) == ModelTaskType.MULTICLASS
+
+    y = pd.Series(["a", "b", "c", "b", "a"])
+    assert define_task(y, False) == ModelTaskType.MULTICLASS
+    assert define_task(y, True) == ModelTaskType.MULTICLASS
+
+    y = pd.Series(["0", "1", "2", "3", "a"])
+    assert define_task(y, False) == ModelTaskType.MULTICLASS
+    assert define_task(y, True) == ModelTaskType.MULTICLASS
+
+    y = pd.Series([0.0, 3.0, 5.0, 0.0, 5.0, 0.0, 3.0, 0.0, 5.0, 0.0, 5.0, 0.0, 3.0, 0.0, 3.0, 5.0, 3.0])
+    assert define_task(y, False) == ModelTaskType.MULTICLASS
+
+
+def test_define_regression_task_type():
+    y = pd.Series([0.0, 3.0, 5.0, 0.0, 5.0, 0.0, 3.0, 0.0, 5.0, 0.0, 5.0, 0.0, 3.0, 0.0, 3.0, 5.0, 3.0])
+    assert define_task(y, True) == ModelTaskType.REGRESSION
+
+    y = pd.Series([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.5])
+    assert define_task(y, False) == ModelTaskType.REGRESSION
+    assert define_task(y, True) == ModelTaskType.REGRESSION
+
+    y = pd.Series([0, 1, 2, 3, 4, 5, 6, 8])
+    assert define_task(y, False) == ModelTaskType.REGRESSION
+    assert define_task(y, True) == ModelTaskType.REGRESSION
+
+    y = pd.Series([0.0, 3.0, 5.0, 0.0, 5.0, 0.0, 3.0])
+    assert define_task(y, False) == ModelTaskType.REGRESSION
+    assert define_task(y, True) == ModelTaskType.REGRESSION
+
+
+def test_balance_undersampling_binary():
+    df = pd.DataFrame({SYSTEM_RECORD_ID: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], TARGET: [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]})
+    balanced_df = balance_undersample(
+        df, TARGET, ModelTaskType.BINARY, 42, imbalance_threshold=0.1, min_sample_threshold=2
+    )
+    # Get all minority class and 5x of majority class if minority class count (1)
+    # more or equal to min_sample_threshold/2 (1)
+    expected_df = pd.DataFrame({
+        SYSTEM_RECORD_ID: [1, 2, 3, 7, 9, 10],
+        TARGET: [0, 1, 0, 0, 0, 0]
+    })
+    assert_frame_equal(balanced_df.sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True), expected_df)
+
+    balanced_df = balance_undersample(
+        df, TARGET, ModelTaskType.BINARY, 42, imbalance_threshold=0.1, min_sample_threshold=8
+    )
+    # Get all minority class and fill up to min_sample_threshold (8) by majority class
+    expected_df = pd.DataFrame({
+        SYSTEM_RECORD_ID: [1, 2, 3, 4, 6, 7, 9, 10],
+        TARGET: [0, 1, 0, 0, 0, 0, 0, 0]
+    })
+    assert_frame_equal(balanced_df.sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True), expected_df)
+
+    df = pd.DataFrame({"system_record_id": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], TARGET: [0, 1, 0, 0, 0, 0, 0, 0, 1, 0]})
+    balanced_df = balance_undersample(
+        df, "target", ModelTaskType.BINARY, 42, imbalance_threshold=0.1, min_sample_threshold=4
+    )
+    # Get full dataset if majority class count (8) less than x5 of minority class count (2)
+    assert_frame_equal(balanced_df, df)
+
+
+def test_balance_undersaampling_multiclass():
+    df = pd.DataFrame({
+        SYSTEM_RECORD_ID: [1, 2, 3, 4, 5, 6],
+        TARGET: ["a", "b", "c", "c", "b", "c"]
+        # a - 1, b - 2, c - 3
+    })
+    balanced_df = balance_undersample(
+        df, TARGET, ModelTaskType.MULTICLASS, 42, imbalance_threshold=0.1, min_sample_threshold=10
+    )
+    # Get full dataset if majority class count (3) less than x2 of 25% class (b) count (2)
+    assert_frame_equal(balanced_df, df)
+
+    df = pd.DataFrame({
+        SYSTEM_RECORD_ID: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
+        TARGET: ["a", "b", "c", "c", "c", "b", "c", "d", "d", "d", "c"]
+        # a - 1, b - 2, c - 5, d - 3
+    })
+    balanced_df = balance_undersample(
+        df, TARGET, ModelTaskType.MULTICLASS, 42, imbalance_threshold=0.1, min_sample_threshold=10
+    )
+    expected_df = pd.DataFrame({
+        SYSTEM_RECORD_ID: [1, 2, 3, 4, 5, 6, 8, 9, 10, 11],
+        TARGET: ["a", "b", "c", "c", "c", "b", "d", "d", "d", "c"]
+    })
+    # Get all of 25% quantile class (b) and minor classes (a) and x2 (or all if less) of major classes
+    assert_frame_equal(balanced_df.sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True), expected_df)
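The new tests only need pandas, numpy, and pytest in addition to upgini, so they can be run in isolation; a programmatic equivalent of `pytest tests/test_target_utils.py -q`:

```python
# Optional convenience: run only the new target_utils tests from Python.
import pytest

raise SystemExit(pytest.main(["tests/test_target_utils.py", "-q"]))
```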
upgini-1.1.261a3250.post2/src/upgini/autofe/date.py
DELETED

@@ -1,42 +0,0 @@
-import numpy as np
-import pandas as pd
-
-from upgini.autofe.operand import PandasOperand, VectorizableMixin
-
-
-class DateDiff(PandasOperand, VectorizableMixin):
-    name = "date_diff"
-    is_binary = True
-    has_symmetry_importance = True
-    is_vectorizable = True
-    unit: str = "D"
-
-    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
-        return self.__replace_negative((left - right) / np.timedelta64(1, self.unit))
-
-    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
-        group_column, value_columns = self.validate_calculation(data.columns, **kwargs)
-        d1 = data[value_columns]
-        d2 = data[group_column]
-
-        return self.__replace_negative(d1.sub(d2, axis=0) / np.timedelta64(1, self.unit))
-
-    def __replace_negative(self, df):
-        df[df < 0] = None
-        return df
-
-
-class DateDiffFuture(PandasOperand):
-    name = "date_diff_future"
-    is_binary = True
-    has_symmetry_importance = True
-    is_vectorizable = False
-    unit: str = "D"
-
-    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
-        future = pd.to_datetime(dict(day=right.dt.day, month=right.dt.month, year=left.dt.year))
-        before = future[future < left]
-        future[future < left] = pd.to_datetime(dict(day=before.dt.day, month=before.dt.month, year=before.dt.year + 1))
-        diff = (future - left) / np.timedelta64(1, self.unit)
-
-        return diff
upgini-1.1.261a3250.post2/src/upgini/utils/target_utils.py
DELETED

@@ -1,74 +0,0 @@
-import logging
-from typing import Optional, Union
-
-import numpy as np
-import pandas as pd
-from pandas.api.types import is_numeric_dtype
-
-from upgini.errors import ValidationError
-from upgini.metadata import ModelTaskType
-from upgini.resource_bundle import bundle
-
-
-def correct_string_target(y: Union[pd.Series, np.ndarray]) -> Union[pd.Series, np.ndarray]:
-    if isinstance(y, pd.Series):
-        return y.astype(str).astype("category").cat.codes
-    elif isinstance(y, np.ndarray):
-        return pd.Series(y).astype(str).astype("category").cat.codes.values
-
-
-def define_task(
-    y: pd.Series, has_date: bool = False, logger: Optional[logging.Logger] = None, silent: bool = False
-) -> ModelTaskType:
-    if logger is None:
-        logger = logging.getLogger()
-    target = y.dropna()
-    if is_numeric_dtype(target):
-        target = target.loc[np.isfinite(target)]
-    else:
-        target = target.loc[target != ""]
-    if len(target) == 0:
-        raise ValidationError(bundle.get("empty_target"))
-    target_items = target.nunique()
-    if target_items == 1:
-        raise ValidationError(bundle.get("dataset_constant_target"))
-    if target_items == 2:
-        task = ModelTaskType.BINARY
-    else:
-        try:
-            target = pd.to_numeric(target)
-            is_numeric = True
-        except Exception:
-            is_numeric = False
-
-        # If any value is non numeric - multiclass
-        if not is_numeric:
-            task = ModelTaskType.MULTICLASS
-        else:
-            if target.nunique() <= 50 and is_int_encoding(target.unique()):
-                task = ModelTaskType.MULTICLASS
-            elif has_date:
-                task = ModelTaskType.REGRESSION
-            else:
-                non_zero_target = target[target != 0]
-                target_items = non_zero_target.nunique()
-                target_ratio = target_items / len(non_zero_target)
-                if (
-                    (target.dtype.kind == "f" and np.any(target != target.astype(int)))  # any non integer
-                    or target_items > 50
-                    or target_ratio > 0.2
-                ):
-                    task = ModelTaskType.REGRESSION
-                else:
-                    task = ModelTaskType.MULTICLASS
-
-    logger.info(f"Detected task type: {task}")
-    if not silent:
-        print(bundle.get("target_type_detected").format(task))
-    return task
-
-
-def is_int_encoding(unique_values):
-    return set(unique_values) == set(range(len(unique_values))) or set(unique_values) == set(
-        range(1, len(unique_values) + 1)
-    )
upgini-1.1.261a3250.post2/tests/test_autofe_operands.py
DELETED

@@ -1,28 +0,0 @@
-import pandas as pd
-from upgini.autofe.date import DateDiff, DateDiffFuture
-
-from datetime import datetime
-from pandas.testing import assert_series_equal
-
-
-def test_date_diff():
-    df = pd.DataFrame(
-        [[datetime(1993, 12, 10), datetime(2022, 10, 10)], [datetime(2023, 10, 10), datetime(2022, 10, 10)]],
-        columns=["date1", "date2"],
-    )
-
-    operand = DateDiff()
-    expected_result = pd.Series([10531, None])
-    assert_series_equal(operand.calculate_binary(df.date2, df.date1), expected_result)
-    assert_series_equal(operand.calculate_group(df, main_column="date1")["date2"].rename(None), expected_result)
-
-
-def test_date_diff_future():
-    df = pd.DataFrame(
-        [[datetime(1993, 12, 10), datetime(2022, 10, 10)], [datetime(1993, 4, 10), datetime(2022, 10, 10)]],
-        columns=["date1", "date2"],
-    )
-
-    operand = DateDiffFuture()
-    expected_result = pd.Series([61.0, 182.0])
-    assert_series_equal(operand.calculate_binary(df.date2, df.date1), expected_result)
upgini-1.1.261a3250.post2/tests/test_target_utils.py
DELETED

@@ -1,74 +0,0 @@
-import numpy as np
-import pandas as pd
-import pytest
-
-from upgini.errors import ValidationError
-from upgini.metadata import ModelTaskType
-from upgini.resource_bundle import bundle
-from upgini.utils.target_utils import define_task
-
-
-def test_invalid_target():
-    y = pd.Series(["", "", ""])
-    with pytest.raises(ValidationError, match=bundle.get("empty_target")):
-        define_task(y)
-
-    y = pd.Series([np.nan, np.inf, -np.inf])
-    with pytest.raises(ValidationError, match=bundle.get("empty_target")):
-        define_task(y)
-
-    y = pd.Series([1, 1, 1, 1, 1])
-    with pytest.raises(ValidationError, match=bundle.get("dataset_constant_target")):
-        define_task(y)
-
-
-def test_define_binary_task_type():
-    y = pd.Series([0, 1, 0, 1, 0, 1])
-    assert define_task(y, False) == ModelTaskType.BINARY
-    assert define_task(y, True) == ModelTaskType.BINARY
-
-    y = pd.Series(["a", "b", "a", "b", "a"])
-    assert define_task(y, False) == ModelTaskType.BINARY
-    assert define_task(y, True) == ModelTaskType.BINARY
-
-
-def test_define_multiclass_task_type():
-    y = pd.Series(range(1, 51))
-    assert define_task(y, False) == ModelTaskType.MULTICLASS
-    assert define_task(y, True) == ModelTaskType.MULTICLASS
-
-    y = pd.Series([float(x) for x in range(1, 51)])
-    assert define_task(y, False) == ModelTaskType.MULTICLASS
-    assert define_task(y, True) == ModelTaskType.MULTICLASS
-
-    y = pd.Series(range(0, 50))
-    assert define_task(y, False) == ModelTaskType.MULTICLASS
-    assert define_task(y, True) == ModelTaskType.MULTICLASS
-
-    y = pd.Series(["a", "b", "c", "b", "a"])
-    assert define_task(y, False) == ModelTaskType.MULTICLASS
-    assert define_task(y, True) == ModelTaskType.MULTICLASS
-
-    y = pd.Series(["0", "1", "2", "3", "a"])
-    assert define_task(y, False) == ModelTaskType.MULTICLASS
-    assert define_task(y, True) == ModelTaskType.MULTICLASS
-
-    y = pd.Series([0.0, 3.0, 5.0, 0.0, 5.0, 0.0, 3.0, 0.0, 5.0, 0.0, 5.0, 0.0, 3.0, 0.0, 3.0, 5.0, 3.0])
-    assert define_task(y, False) == ModelTaskType.MULTICLASS
-
-
-def test_define_regression_task_type():
-    y = pd.Series([0.0, 3.0, 5.0, 0.0, 5.0, 0.0, 3.0, 0.0, 5.0, 0.0, 5.0, 0.0, 3.0, 0.0, 3.0, 5.0, 3.0])
-    assert define_task(y, True) == ModelTaskType.REGRESSION
-
-    y = pd.Series([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.5])
-    assert define_task(y, False) == ModelTaskType.REGRESSION
-    assert define_task(y, True) == ModelTaskType.REGRESSION
-
-    y = pd.Series([0, 1, 2, 3, 4, 5, 6, 8])
-    assert define_task(y, False) == ModelTaskType.REGRESSION
-    assert define_task(y, True) == ModelTaskType.REGRESSION
-
-    y = pd.Series([0.0, 3.0, 5.0, 0.0, 5.0, 0.0, 3.0])
-    assert define_task(y, False) == ModelTaskType.REGRESSION
-    assert define_task(y, True) == ModelTaskType.REGRESSION
All remaining files listed above with +0 -0 were renamed from upgini-1.1.261a3250.post2 to upgini-1.1.262 without content changes.
|