PyPI - upgini - Versions diffs - 1.1.262a3250.post3__py3-none-any.whl → 1.1.274a4__py3-none-any.whl - Mend

upgini 1.1.262a3250.post3__py3-none-any.whl → 1.1.274a4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

upgini/autofe/all_operands.py +12 -2
upgini/autofe/date.py +68 -8
upgini/autofe/feature.py +1 -1
upgini/data_source/data_source_publisher.py +24 -5
upgini/dataset.py +21 -58
upgini/features_enricher.py +114 -40
upgini/fingerprint.js +8 -0
upgini/metrics.py +58 -7
upgini/normalizer/phone_normalizer.py +2 -2
upgini/resource_bundle/strings.properties +8 -3
upgini/search_task.py +1 -1
upgini/utils/datetime_utils.py +53 -2
upgini/utils/deduplicate_utils.py +61 -18
upgini/utils/sklearn_ext.py +1 -2
upgini/utils/target_utils.py +125 -2
{upgini-1.1.262a3250.post3.dist-info → upgini-1.1.274a4.dist-info}/METADATA +2 -2
{upgini-1.1.262a3250.post3.dist-info → upgini-1.1.274a4.dist-info}/RECORD +20 -19
{upgini-1.1.262a3250.post3.dist-info → upgini-1.1.274a4.dist-info}/LICENSE +0 -0
{upgini-1.1.262a3250.post3.dist-info → upgini-1.1.274a4.dist-info}/WHEEL +0 -0
{upgini-1.1.262a3250.post3.dist-info → upgini-1.1.274a4.dist-info}/top_level.txt +0 -0

upgini/autofe/all_operands.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from typing import Dict
-from upgini.autofe.date import DateDiff, DateDiffFuture
+from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded
 from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
 from upgini.autofe.operand import Operand
 from upgini.autofe.unary import Abs, Log, Residual, Sqrt, Square, Sigmoid, Floor, Freq
@@ -37,7 +37,17 @@ ALL_OPERANDS: Dict[str, Operand] = {
         Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
         Sim(),
         DateDiff(),
-        DateDiffFuture(),
+        DateDiffType2(),
+        DateListDiff(aggregation="min"),
+        DateListDiff(aggregation="max"),
+        DateListDiff(aggregation="mean"),
+        DateListDiff(aggregation="nunique"),
+        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=0, upper_bound=18),
+        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=18, upper_bound=23),
+        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=23, upper_bound=30),
+        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=30, upper_bound=45),
+        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
+        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
     ]
 }

upgini/autofe/date.py CHANGED Viewed

@@ -1,11 +1,12 @@
-from typing import Optional, Union
+from typing import Any, Optional, Union
 import numpy as np
 import pandas as pd
+from pydantic import BaseModel
-from upgini.autofe.operand import PandasOperand, VectorizableMixin
+from upgini.autofe.operand import PandasOperand
-class DateDiffMixin:
+class DateDiffMixin(BaseModel):
     diff_unit: str = "D"
     left_unit: Optional[str] = None
     right_unit: Optional[str] = None
@@ -34,18 +35,77 @@ class DateDiff(PandasOperand, DateDiffMixin):
         return x
-class DateDiffFuture(PandasOperand, DateDiffMixin):
-    name = "date_diff_future"
+class DateDiffType2(PandasOperand, DateDiffMixin):
+    name = "date_diff_type2"
     is_binary = True
     has_symmetry_importance = True
-    is_vectorizable = False
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         left = self._convert_to_date(left, self.left_unit)
         right = self._convert_to_date(right, self.right_unit)
-        future = pd.to_datetime(dict(day=right.dt.day, month=right.dt.month, year=left.dt.year))
+        future = right + (left.dt.year - right.dt.year).apply(
+            lambda y: np.datetime64("NaT") if np.isnan(y) else pd.tseries.offsets.DateOffset(years=y)
+        )
+        future = pd.to_datetime(future)
         before = future[future < left]
-        future[future < left] = pd.to_datetime(dict(day=before.dt.day, month=before.dt.month, year=before.dt.year + 1))
+        future[future < left] = before + pd.tseries.offsets.DateOffset(years=1)
         diff = (future - left) / np.timedelta64(1, self.diff_unit)
         return diff
+_ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len, 0)}
+class DateListDiff(PandasOperand, DateDiffMixin):
+    is_binary = True
+    has_symmetry_importance = True
+    aggregation: str
+    def __init__(self, **data: Any) -> None:
+        if "name" not in data:
+            data["name"] = f"date_diff_{data.get('aggregation')}"
+        super().__init__(**data)
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        left = self._convert_to_date(left, self.left_unit)
+        right = right.apply(lambda x: pd.arrays.DatetimeArray(self._convert_to_date(x, self.right_unit)))
+        return pd.Series(left - right.values).apply(lambda x: self._agg(self._diff(x)))
+    def _diff(self, x):
+        x = x / np.timedelta64(1, self.diff_unit)
+        return x[x > 0]
+    def _agg(self, x):
+        method = getattr(np, self.aggregation, None)
+        default = np.nan
+        if method is None and self.aggregation in _ext_aggregations:
+            method, default = _ext_aggregations[self.aggregation]
+        elif not callable(method):
+            raise ValueError(f"Unsupported aggregation: {self.aggregation}")
+        return method(x) if len(x) > 0 else default
+class DateListDiffBounded(DateListDiff):
+    lower_bound: Optional[int]
+    upper_bound: Optional[int]
+    def __init__(self, **data: Any) -> None:
+        if "name" not in data:
+            lower_bound = data.get("lower_bound")
+            upper_bound = data.get("upper_bound")
+            components = [
+                "date_diff",
+                data.get("diff_unit"),
+                str(lower_bound if lower_bound is not None else "minusinf"),
+                str(upper_bound if upper_bound is not None else "plusinf"),
+            ]
+            components.append(data.get("aggregation"))
+            data["name"] = "_".join(components)
+        super().__init__(**data)
+    def _agg(self, x):
+        x = x[(x >= (self.lower_bound or -np.inf)) & (x < (self.upper_bound or np.inf))]
+        return super()._agg(x)

upgini/autofe/feature.py CHANGED Viewed

@@ -305,7 +305,7 @@ class FeatureGroup:
         grouped_features = []
         def groupby_func(f: Feature) -> Tuple[Operand, Union[Column, Feature]]:
-            return (f.op, f.children[f.op.group_index])
+            return (f.op, f.children[0 if not f.op.is_vectorizable else f.op.group_index])
         for op_child, features in itertools.groupby(candidates, groupby_func):
             op, main_child = op_child

upgini/data_source/data_source_publisher.py CHANGED Viewed

@@ -40,7 +40,7 @@ class DataSourcePublisher:
         if logs_enabled:
             self.logger = LoggerFactory().get_logger(endpoint, api_key)
         else:
-            self.logger = logging.getLogger()
+            self.logger = logging.getLogger("muted_logger")
             self.logger.setLevel("FATAL")
     def place(
@@ -48,6 +48,7 @@ class DataSourcePublisher:
         data_table_uri: str,
         search_keys: Dict[str, SearchKey],
         update_frequency: str,
+        exclude_from_autofe_generation: Optional[List[str]],
         secondary_search_keys: Optional[Dict[str, SearchKey]] = None,
         sort_column: Optional[str] = None,
         date_format: Optional[str] = None,
@@ -57,7 +58,6 @@ class DataSourcePublisher:
         join_date_abs_limit_days: Optional[int] = None,
         features_for_embeddings: Optional[List[str]] = DEFAULT_GENERATE_EMBEDDINGS,
         data_table_id_to_replace: Optional[str] = None,
-        exclude_from_autofe_generation: Optional[List[str]] = None,
         _force_generation=False,
         _silent=False,
     ) -> str:
@@ -72,8 +72,8 @@ class DataSourcePublisher:
                     )
                 if search_keys is None or len(search_keys) == 0:
                     raise ValidationError("Empty search keys")
-                if SearchKey.DATE in search_keys.values() and date_format is None:
-                    raise ValidationError("date_format is required for DATE search key")
+                # if SearchKey.DATE in search_keys.values() and date_format is None:
+                #     raise ValidationError("date_format is required for DATE search key")
                 if update_frequency not in self.ACCEPTABLE_UPDATE_FREQUENCIES:
                     raise ValidationError(
                         f"Invalid update frequency: {update_frequency}. "
@@ -85,11 +85,19 @@ class DataSourcePublisher:
                     or set(search_keys.values()) == {SearchKey.MSISDN_RANGE_FROM, SearchKey.MSISDN_RANGE_TO}
                 ) and sort_column is None:
                     raise ValidationError("Sort column is required for passed search keys")
+                if (
+                    set(search_keys.values()) == {SearchKey.PHONE, SearchKey.DATE}
+                    and snapshot_frequency_days is None
+                    and join_date_abs_limit_days is None
+                ):
+                    raise ValidationError(
+                        "With MSISDN and DATE keys one of the snapshot_frequency_days or"
+                        " join_date_abs_limit_days parameters is required"
+                    )
                 request = {
                     "dataTableUri": data_table_uri,
                     "searchKeys": {k: v.value.value for k, v in search_keys.items()},
-                    "dateFormat": date_format,
                     "excludeColumns": exclude_columns,
                     "hashFeatureNames": str(hash_feature_names).lower(),
                     "snapshotFrequencyDays": snapshot_frequency_days,
@@ -98,6 +106,8 @@ class DataSourcePublisher:
                     "featuresForEmbeddings": features_for_embeddings,
                     "forceGeneration": str(_force_generation).lower(),
                 }
+                if date_format is not None:
+                    request["dateFormat"] = date_format
                 if secondary_search_keys is not None:
                     request["secondarySearchKeys"] = {k: v.value.value for k, v in secondary_search_keys.items()}
                 if sort_column is not None:
@@ -170,6 +180,7 @@ class DataSourcePublisher:
                     print(msg)
                     self.logger.info(msg)
                     self._rest_client.stop_ads_management_task(task_id, trace_id)
+                raise
             except Exception:
                 self.logger.exception("Failed to register data table")
                 raise
@@ -289,6 +300,7 @@ class DataSourcePublisher:
                 raise ValidationError("One of arguments: bq_table_id or search_keys should be presented")
             if bq_table_id is not None and search_keys is not None:
                 raise ValidationError("Only one argument could be presented: bq_table_id or search_keys")
+            task_id = None
             try:
                 search_keys = [k.value.value for k in search_keys] if search_keys else None
                 request = {"bqTableId": bq_table_id, "searchKeys": search_keys}
@@ -303,6 +315,13 @@ class DataSourcePublisher:
                     raise Exception("Failed to register ADS: " + status_response["errorMessage"])
                 print("Uploading successfully finished")
+            except KeyboardInterrupt:
+                if task_id is not None:
+                    msg = f"Stopping AdsManagementTask {task_id}"
+                    print(msg)
+                    self.logger.info(msg)
+                    self._rest_client.stop_ads_management_task(task_id, trace_id)
+                raise
             except Exception:
                 self.logger.exception(f"Failed to upload table {bq_table_id}")
                 raise

upgini/dataset.py CHANGED Viewed

@@ -39,10 +39,10 @@ from upgini.metadata import (
 )
 from upgini.normalizer.phone_normalizer import PhoneNormalizer
 from upgini.resource_bundle import ResourceBundle, get_custom_bundle
-from upgini.sampler.random_under_sampler import RandomUnderSampler
 from upgini.search_task import SearchTask
 from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
 from upgini.utils.email_utils import EmailSearchKeyConverter
+from upgini.utils.target_utils import balance_undersample
 try:
     from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
@@ -60,7 +60,9 @@ class Dataset:  # (pd.DataFrame):
     FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
     FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
     MIN_SAMPLE_THRESHOLD = 5_000
-    IMBALANCE_THESHOLD = 0.4
+    IMBALANCE_THESHOLD = 0.6
+    BINARY_BOOTSTRAP_LOOPS = 5
+    MULTICLASS_BOOTSTRAP_LOOPS = 2
     MIN_TARGET_CLASS_ROWS = 100
     MAX_MULTICLASS_CLASS_COUNT = 100
     MIN_SUPPORTED_DATE_TS = 946684800000  # 2000-01-01
@@ -460,10 +462,8 @@ class Dataset:  # (pd.DataFrame):
             self.task_type == ModelTaskType.BINARY and len(train_segment) > self.MIN_SAMPLE_THRESHOLD
         ):
             count = len(train_segment)
-            min_class_count = count
-            min_class_value = None
-            target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, "")
-            target = train_segment[target_column].copy()
+            target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
+            target = train_segment[target_column]
             target_classes_count = target.nunique()
             if target_classes_count > self.MAX_MULTICLASS_CLASS_COUNT:
@@ -473,12 +473,9 @@ class Dataset:  # (pd.DataFrame):
                 self.logger.warning(msg)
                 raise ValidationError(msg)
-            unique_target = target.unique()
-            for v in list(unique_target):  # type: ignore
-                current_class_count = len(train_segment.loc[target == v])
-                if current_class_count < min_class_count:
-                    min_class_count = current_class_count
-                    min_class_value = v
+            vc = target.value_counts()
+            min_class_value = vc.index[len(vc) - 1]
+            min_class_count = vc[min_class_value]
             if min_class_count < self.MIN_TARGET_CLASS_ROWS:
                 msg = self.bundle.get("dataset_rarest_class_less_min").format(
@@ -491,53 +488,19 @@ class Dataset:  # (pd.DataFrame):
             min_class_threshold = min_class_percent * count
             if min_class_count < min_class_threshold:
-                msg = self.bundle.get("dataset_rarest_class_less_threshold").format(
-                    min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
-                )
-                self.logger.warning(msg)
-                print(msg)
-                self.warning_counter.increment()
-                train_segment = train_segment.copy().sort_values(by=SYSTEM_RECORD_ID)
-                if self.task_type == ModelTaskType.MULTICLASS:
-                    # Sort classes by rows count and find 25% quantile class
-                    classes = target.value_counts().index
-                    quantile25_idx = int(0.75 * len(classes))
-                    quantile25_class = classes[quantile25_idx]
-                    count_of_quantile25_class = len(target[target == quantile25_class])
-                    msg = self.bundle.get("imbalance_multiclass").format(quantile25_class, count_of_quantile25_class)
-                    self.logger.warning(msg)
-                    print(msg)
-                    # 25% and lower classes will stay as is. Higher classes will be downsampled
-                    parts = []
-                    for class_idx in range(quantile25_idx):
-                        sampled = train_segment[train_segment[target_column] == classes[class_idx]].sample(
-                            n=count_of_quantile25_class, random_state=self.random_state
-                        )
-                        parts.append(sampled)
-                    for class_idx in range(quantile25_idx, len(classes)):
-                        parts.append(train_segment[train_segment[target_column] == classes[class_idx]])
-                    resampled_data = pd.concat(parts)
-                elif self.task_type == ModelTaskType.BINARY and min_class_count < self.MIN_SAMPLE_THRESHOLD / 2:
-                    minority_class = train_segment[train_segment[target_column] == min_class_value]
-                    majority_class = train_segment[train_segment[target_column] != min_class_value]
-                    sampled_majority_class = majority_class.sample(
-                        n=self.MIN_SAMPLE_THRESHOLD - min_class_count, random_state=self.random_state
-                    )
-                    resampled_data = train_segment[
-                        (train_segment[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
-                        | (train_segment[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
-                    ]
-                else:
-                    sampler = RandomUnderSampler(random_state=self.random_state)
-                    X = train_segment[SYSTEM_RECORD_ID]
-                    X = X.to_frame(SYSTEM_RECORD_ID)
-                    new_x, _ = sampler.fit_resample(X, target)  # type: ignore
-                    resampled_data = train_segment[train_segment[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
-                self.data = resampled_data
-                self.logger.info(f"Shape after rebalance resampling: {self.data.shape}")
                 self.imbalanced = True
+                self.data = balance_undersample(
+                    df=train_segment,
+                    target_column=target_column,
+                    task_type=self.task_type,
+                    random_state=self.random_state,
+                    imbalance_threshold=self.IMBALANCE_THESHOLD,
+                    binary_bootstrap_loops=self.BINARY_BOOTSTRAP_LOOPS,
+                    multiclass_bootstrap_loops=self.MULTICLASS_BOOTSTRAP_LOOPS,
+                    logger=self.logger,
+                    bundle=self.bundle,
+                    warning_counter=self.warning_counter,
+                )
         # Resample over fit threshold
         if not self.imbalanced and EVAL_SET_INDEX in self.data.columns:

upgini 1.1.262a3250.post3__py3-none-any.whl → 1.1.274a4__py3-none-any.whl

upgini 1.1.262a3250.post3__py3-none-any.whl → 1.1.274a4__py3-none-any.whl