PyPI - upgini - Versions diffs - 1.1.280.dev0__py3-none-any.whl → 1.2.31a1__py3-none-any.whl - Mend

upgini 1.1.280.dev0__py3-none-any.whl → 1.2.31a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (43):

upgini/__about__.py +1 -1
upgini/__init__.py +4 -20
upgini/autofe/all_operands.py +39 -9
upgini/autofe/binary.py +148 -45
upgini/autofe/date.py +197 -26
upgini/autofe/feature.py +102 -19
upgini/autofe/groupby.py +22 -22
upgini/autofe/operand.py +9 -6
upgini/autofe/unary.py +83 -41
upgini/autofe/vector.py +8 -8
upgini/data_source/data_source_publisher.py +128 -5
upgini/dataset.py +50 -386
upgini/features_enricher.py +931 -542
upgini/http.py +27 -16
upgini/lazy_import.py +35 -0
upgini/metadata.py +84 -59
upgini/metrics.py +164 -34
upgini/normalizer/normalize_utils.py +197 -0
upgini/resource_bundle/strings.properties +66 -51
upgini/search_task.py +10 -4
upgini/utils/Roboto-Regular.ttf +0 -0
upgini/utils/base_search_key_detector.py +14 -12
upgini/utils/country_utils.py +16 -0
upgini/utils/custom_loss_utils.py +39 -36
upgini/utils/datetime_utils.py +98 -45
upgini/utils/deduplicate_utils.py +135 -112
upgini/utils/display_utils.py +46 -15
upgini/utils/email_utils.py +54 -16
upgini/utils/feature_info.py +172 -0
upgini/utils/features_validator.py +34 -20
upgini/utils/ip_utils.py +100 -1
upgini/utils/phone_utils.py +343 -0
upgini/utils/postal_code_utils.py +34 -0
upgini/utils/sklearn_ext.py +28 -19
upgini/utils/target_utils.py +113 -57
upgini/utils/warning_counter.py +1 -0
upgini/version_validator.py +8 -4
{upgini-1.1.280.dev0.dist-info → upgini-1.2.31a1.dist-info}/METADATA +31 -16
upgini-1.2.31a1.dist-info/RECORD +65 -0
upgini/normalizer/phone_normalizer.py +0 -340
upgini-1.1.280.dev0.dist-info/RECORD +0 -62
{upgini-1.1.280.dev0.dist-info → upgini-1.2.31a1.dist-info}/WHEEL +0 -0
{upgini-1.1.280.dev0.dist-info → upgini-1.2.31a1.dist-info}/licenses/LICENSE +0 -0

upgini/utils/datetime_utils.py CHANGED Viewed

@@ -6,15 +6,11 @@ from typing import Dict, List, Optional
 import numpy as np
 import pandas as pd
 from dateutil.relativedelta import relativedelta
-from pandas.api.types import (
-    is_numeric_dtype,
-    is_period_dtype,
-)
+from pandas.api.types import is_numeric_dtype
 from upgini.errors import ValidationError
-from upgini.metadata import SearchKey
+from upgini.metadata import EVAL_SET_INDEX, SearchKey
 from upgini.resource_bundle import ResourceBundle, get_custom_bundle
-from upgini.utils.warning_counter import WarningCounter
 DATE_FORMATS = [
     "%Y-%m-%d",
@@ -31,18 +27,20 @@ DATE_FORMATS = [
     "%Y-%m-%dT%H:%M:%S.%f",
 ]
-DATETIME_PATTERN = r"^[\d\s\.\-:T]+$"
+DATETIME_PATTERN = r"^[\d\s\.\-:T/+]+$"
 class DateTimeSearchKeyConverter:
     DATETIME_COL = "_date_time"
+    # MIN_SUPPORTED_DATE_TS = datetime.datetime(1999, 12, 31)  # 946684800000  # 2000-01-01
+    MIN_SUPPORTED_DATE_TS = pd.to_datetime(datetime.datetime(1999, 12, 31)).tz_localize(None)
     def __init__(
         self,
         date_column: str,
         date_format: Optional[str] = None,
         logger: Optional[logging.Logger] = None,
-        bundle: ResourceBundle = None,
+        bundle: Optional[ResourceBundle] = None,
     ):
         self.date_column = date_column
         self.date_format = date_format
@@ -53,6 +51,7 @@ class DateTimeSearchKeyConverter:
             self.logger.setLevel("FATAL")
         self.generated_features: List[str] = []
         self.bundle = bundle or get_custom_bundle()
+        self.has_old_dates = False
     @staticmethod
     def _int_to_opt(i: int) -> Optional[int]:
@@ -81,8 +80,8 @@ class DateTimeSearchKeyConverter:
             df[self.date_column] = df[self.date_column].apply(lambda x: x.replace(tzinfo=None))
         elif isinstance(df[self.date_column].values[0], datetime.date):
             df[self.date_column] = pd.to_datetime(df[self.date_column], errors="coerce")
-        elif is_period_dtype(df[self.date_column]):
-            df[self.date_column] = pd.to_datetime(df[self.date_column].astype("string"))
+        elif isinstance(df[self.date_column].dtype, pd.PeriodDtype):
+            df[self.date_column] = df[self.date_column].dt.to_timestamp()
         elif is_numeric_dtype(df[self.date_column]):
             # 315532801 - 2524608001    - seconds
             # 315532801000 - 2524608001000 - milliseconds
@@ -94,11 +93,10 @@ class DateTimeSearchKeyConverter:
                 df[self.date_column] = pd.to_datetime(df[self.date_column], unit="us")
             elif df[self.date_column].apply(lambda x: 10**11 < x < 10**14).all():
                 df[self.date_column] = pd.to_datetime(df[self.date_column], unit="ms")
-            elif df[self.date_column].apply(lambda x: 0 < x < 10 * 11).all():
+            elif df[self.date_column].apply(lambda x: 0 < x < 10**11).all():
                 df[self.date_column] = pd.to_datetime(df[self.date_column], unit="s")
             else:
                 msg = self.bundle.get("unsupported_date_type").format(self.date_column)
-                self.logger.warning(msg)
                 raise ValidationError(msg)
         else:
             df[self.date_column] = df[self.date_column].astype("string").apply(self.clean_date)
@@ -108,20 +106,66 @@ class DateTimeSearchKeyConverter:
         # as additional features
         seconds = "datetime_seconds"
         df[self.date_column] = df[self.date_column].dt.tz_localize(None)
+        df = self.clean_old_dates(df)
+        # Define function to apply sine and cosine transformations
+        def add_cyclical_features(df, column, period):
+            period_suffix = f"_{period}" if column != "day_in_quarter" else ""
+            sin_feature = f"datetime_{column}_sin{period_suffix}"
+            cos_feature = f"datetime_{column}_cos{period_suffix}"
+            if sin_feature not in df.columns:
+                df[sin_feature] = np.sin(2 * np.pi * df[column] / period)
+                self.generated_features.append(sin_feature)
+            if cos_feature not in df.columns:
+                df[cos_feature] = np.cos(2 * np.pi * df[column] / period)
+                self.generated_features.append(cos_feature)
+        df["quarter"] = df[self.date_column].dt.quarter
+        # Calculate the start date of the quarter for each timestamp
+        df["quarter_start"] = df[self.date_column].dt.to_period("Q").dt.start_time
+        # Calculate the day in the quarter
+        df["day_in_quarter"] = (df[self.date_column] - df["quarter_start"]).dt.days + 1
+        # Vectorized calculation of days_in_quarter
+        quarter = df["quarter"]
+        start = df["quarter_start"]
+        year = start.dt.year
+        month = start.dt.month
+        quarter_end_year = np.where(quarter == 4, year + 1, year)
+        quarter_end_month = np.where(quarter == 4, 1, month + 3)
+        end = pd.to_datetime({"year": quarter_end_year, "month": quarter_end_month, "day": 1})
+        end.index = df.index
+        df["days_in_quarter"] = (end - start).dt.days
+        add_cyclical_features(df, "day_in_quarter", df["days_in_quarter"])  # Days in the quarter
+        df.drop(columns=["quarter", "quarter_start", "day_in_quarter", "days_in_quarter"], inplace=True)
         df[seconds] = (df[self.date_column] - df[self.date_column].dt.floor("D")).dt.seconds
         seconds_without_na = df[seconds].dropna()
         if (seconds_without_na != 0).any() and seconds_without_na.nunique() > 1:
             self.logger.info("Time found in date search key. Add extra features based on time")
-            seconds_in_day = 60 * 60 * 24
-            orders = [1, 2, 24, 48]
-            for order in orders:
-                sin_feature = f"datetime_time_sin_{order}"
-                cos_feature = f"datetime_time_cos_{order}"
-                df[sin_feature] = np.round(np.sin(2 * np.pi * order * df[seconds] / seconds_in_day), 10)
-                df[cos_feature] = np.round(np.cos(2 * np.pi * order * df[seconds] / seconds_in_day), 10)
-                self.generated_features.append(sin_feature)
-                self.generated_features.append(cos_feature)
+            # Extract basic components
+            df["second"] = df[self.date_column].dt.second
+            df["minute"] = df[self.date_column].dt.minute
+            df["hour"] = df[self.date_column].dt.hour
+            # Apply cyclical transformations
+            add_cyclical_features(df, "second", 60)  # Seconds in a minute
+            add_cyclical_features(df, "minute", 60)  # Minutes in an hour
+            add_cyclical_features(df, "minute", 30)  # Minutes in half an hour
+            add_cyclical_features(df, "hour", 24)  # Hours in a day
+            # Drop intermediate columns if not needed
+            df.drop(columns=["second", "minute", "hour"], inplace=True)
         df.drop(columns=seconds, inplace=True)
@@ -147,7 +191,19 @@ class DateTimeSearchKeyConverter:
                     return pd.to_datetime(df[self.date_column], format=date_format)
                 except ValueError:
                     pass
-            raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
+            try:
+                return pd.to_datetime(df[self.date_column])
+            except ValueError:
+                raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
+    def clean_old_dates(self, df: pd.DataFrame) -> pd.DataFrame:
+        condition = df[self.date_column] <= self.MIN_SUPPORTED_DATE_TS
+        old_subset = df[condition]
+        if len(old_subset) > 0:
+            self.has_old_dates = True
+            df.loc[condition, self.date_column] = None
+            self.logger.info(f"Set to None: {len(old_subset)} of {len(df)} rows because they are before 2000-01-01")
+        return df
 def is_time_series(df: pd.DataFrame, date_col: str) -> bool:
@@ -185,7 +241,10 @@ def is_time_series(df: pd.DataFrame, date_col: str) -> bool:
 def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[str]) -> bool:
     df = df.copy()
     seconds = "datetime_seconds"
-    df[date_col] = pd.to_datetime(df[date_col])
+    if isinstance(df[date_col].dtype, pd.PeriodDtype):
+        df[date_col] = df[date_col].dt.to_timestamp()
+    else:
+        df[date_col] = pd.to_datetime(df[date_col])
     df[date_col] = df[date_col].dt.tz_localize(None)
     df[seconds] = (df[date_col] - df[date_col].dt.floor("D")).dt.seconds
@@ -231,24 +290,25 @@ def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[st
     return len(accumulated_changing_columns) <= 2
-def validate_dates_distribution(
-    X: pd.DataFrame,
+def is_dates_distribution_valid(
+    df: pd.DataFrame,
     search_keys: Dict[str, SearchKey],
-    logger: Optional[logging.Logger] = None,
-    bundle: Optional[ResourceBundle] = None,
-    warning_counter: Optional[WarningCounter] = None,
-):
-    maybe_date_col = None
-    for key, key_type in search_keys.items():
-        if key_type in [SearchKey.DATE, SearchKey.DATETIME]:
-            maybe_date_col = key
+) -> bool:
+    maybe_date_col = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
+    if EVAL_SET_INDEX in df.columns:
+        X = df.query(f"{EVAL_SET_INDEX} == 0")
+    else:
+        X = df
     if maybe_date_col is None:
         for col in X.columns:
             if col in search_keys:
                 continue
             try:
-                if pd.__version__ >= "2.0.0":
+                if isinstance(X[col].dtype, pd.PeriodDtype):
+                    pass
+                elif pd.__version__ >= "2.0.0":
                     # Format mixed to avoid massive warnings
                     pd.to_datetime(X[col], format="mixed")
                 else:
@@ -261,7 +321,9 @@ def validate_dates_distribution(
     if maybe_date_col is None:
         return
-    if pd.__version__ >= "2.0.0":
+    if isinstance(X[maybe_date_col].dtype, pd.PeriodDtype):
+        dates = X[maybe_date_col].dt.to_timestamp().dt.date
+    elif pd.__version__ >= "2.0.0":
         dates = pd.to_datetime(X[maybe_date_col], format="mixed").dt.date
     else:
         dates = pd.to_datetime(X[maybe_date_col]).dt.date
@@ -272,13 +334,4 @@ def validate_dates_distribution(
     date_counts_2 = date_counts[round(len(date_counts) / 2) :]
     ratio = date_counts_2.mean() / date_counts_1.mean()
-    if ratio > 1.2 or ratio < 0.8:
-        if warning_counter is not None:
-            warning_counter.increment()
-        if logger is None:
-            logger = logging.getLogger("muted_logger")
-            logger.setLevel("FATAL")
-        bundle = bundle or get_custom_bundle()
-        msg = bundle.get("x_unstable_by_date")
-        print(msg)
-        logger.warning(msg)
+    return ratio >= 0.8 and ratio <= 1.2

upgini/utils/deduplicate_utils.py CHANGED Viewed

@@ -1,10 +1,19 @@
+import logging
 from logging import Logger
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Optional, Tuple, Union
 import pandas as pd
-from upgini.metadata import EVAL_SET_INDEX, SORT_ID, SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
-from upgini.resource_bundle import ResourceBundle
+from upgini.metadata import (
+    ENTITY_SYSTEM_RECORD_ID,
+    EVAL_SET_INDEX,
+    SORT_ID,
+    SYSTEM_RECORD_ID,
+    TARGET,
+    ModelTaskType,
+    SearchKey,
+)
+from upgini.resource_bundle import ResourceBundle, get_custom_bundle
 from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
 from upgini.utils.target_utils import define_task
@@ -14,17 +23,19 @@ def remove_fintech_duplicates(
     search_keys: Dict[str, SearchKey],
     date_format: Optional[str] = None,
     logger: Optional[Logger] = None,
-    silent=False,
     bundle: ResourceBundle = None,
-) -> pd.DataFrame:
-    # Base checks
+) -> Tuple[pd.DataFrame, Optional[List[str]]]:
+    # Initial checks for target type and date column
+    bundle = bundle or get_custom_bundle()
+    if logger is None:
+        logger = logging.getLogger()
+        logger.setLevel(logging.FATAL)
     date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
     if define_task(df[TARGET], date_col is not None, silent=True) != ModelTaskType.BINARY:
-        return df
+        return df, []
-    date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
     if date_col is None:
-        return df
+        return df, []
     personal_cols = []
     phone_col = _get_column_by_key(search_keys, SearchKey.PHONE)
@@ -37,116 +48,133 @@ def remove_fintech_duplicates(
     if hem_col:
         personal_cols.append(hem_col)
     if len(personal_cols) == 0:
-        return df
-    sub_df = df[personal_cols + [date_col, TARGET]]
-    # Fast check for duplicates by personal keys
-    if not sub_df[personal_cols].duplicated().any():
-        return df
-    grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
-    # counts of diff dates by set of personal keys
-    uniques = grouped_by_personal_cols[date_col].nunique()
-    total = len(uniques)
-    diff_dates = len(uniques[uniques > 1])
-    if diff_dates / total >= 0.6:
-        return df
-    # Additional checks
-    duplicates = sub_df.duplicated(personal_cols, keep=False)
-    duplicate_rows = sub_df[duplicates]
-    if len(duplicate_rows) == 0:
-        return df
-    # if there is no different target values in personal keys duplicate rows
-    nonunique_target_groups = grouped_by_personal_cols[TARGET].nunique() > 1
-    if nonunique_target_groups.sum() == 0:
-        return df
-    def has_diff_target_within_60_days(rows):
-        rows = rows.sort_values(by=date_col)
-        return len(rows[rows[TARGET].ne(rows[TARGET].shift()) & (rows[date_col].diff() < 60 * 24 * 60 * 60 * 1000)]) > 0
-    nonunique_target_rows = nonunique_target_groups[nonunique_target_groups].reset_index().drop(columns=TARGET)
-    sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)
-    sub_df = DateTimeSearchKeyConverter(date_col, date_format=date_format, logger=logger, bundle=bundle).convert(sub_df)
-    grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
-    rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
-    if len(rows_with_diff_target) > 0:
-        unique_keys_to_delete = rows_with_diff_target[personal_cols].drop_duplicates()
-        if EVAL_SET_INDEX not in df.columns:
-            rows_to_remove = pd.merge(df.reset_index(), unique_keys_to_delete, on=personal_cols)
-            rows_to_remove = rows_to_remove.set_index(df.index.name or "index")
-            perc = len(rows_to_remove) * 100 / len(df)
-            msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
-                perc, len(rows_to_remove), rows_to_remove.index.to_list()
-            )
-            if not silent:
-                print(msg)
-            if logger:
-                logger.warning(msg)
-            logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
-            df = df[~df.index.isin(rows_to_remove.index)]
-            logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
-        else:
-            # Indices in train and eval_set can be the same so we remove rows from them separately
-            train = df.query(f"{EVAL_SET_INDEX} == 0")
-            train_rows_to_remove = pd.merge(train.reset_index(), unique_keys_to_delete, on=personal_cols)
-            train_rows_to_remove = train_rows_to_remove.set_index(train.index.name or "index")
-            train_perc = len(train_rows_to_remove) * 100 / len(train)
-            msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
-                train_perc, len(train_rows_to_remove), train_rows_to_remove.index.to_list()
+        return df, []
+    # Splitting into train and eval_set parts
+    if EVAL_SET_INDEX in df.columns:
+        train_df = df[df[EVAL_SET_INDEX] == 0]
+        eval_dfs = [df[df[EVAL_SET_INDEX] == idx] for idx in df[EVAL_SET_INDEX].unique() if idx != 0]
+    else:
+        train_df = df
+        eval_dfs = []
+    warning_messages = []
+    def process_df(segment_df: pd.DataFrame, eval_index=0) -> Tuple[pd.DataFrame, Optional[str]]:
+        """Process a subset of the dataset to remove duplicates based on personal keys."""
+        # Fast check for duplicates based on personal keys
+        if not segment_df[personal_cols].duplicated().any():
+            return segment_df, None
+        sub_df = segment_df[personal_cols + [date_col, TARGET]].copy()
+        # Group by personal columns to check for unique dates
+        grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
+        # Checking for different dates by the same personal keys
+        uniques = grouped_by_personal_cols[date_col].nunique()
+        total = len(uniques)
+        diff_dates = len(uniques[uniques > 1])
+        if diff_dates / total >= 0.6:
+            return segment_df, None
+        # Check for duplicate rows
+        duplicates = sub_df.duplicated(personal_cols, keep=False)
+        duplicate_rows = sub_df[duplicates]
+        if len(duplicate_rows) == 0:
+            return segment_df, None
+        # Check if there are different target values for the same personal keys
+        nonunique_target_groups = grouped_by_personal_cols[TARGET].nunique() > 1
+        if nonunique_target_groups.sum() == 0:
+            return segment_df, None
+        # Helper function to check if there are different target values within 60 days
+        def has_diff_target_within_60_days(rows: pd.DataFrame):
+            rows = rows.sort_values(by=date_col)
+            return (
+                len(rows[rows[TARGET].ne(rows[TARGET].shift()) & (rows[date_col].diff() < 60 * 24 * 60 * 60 * 1000)])
+                > 0
             )
-            if not silent:
-                print(msg)
-            if logger:
-                logger.warning(msg)
-            logger.info(f"Train dataset shape before clean fintech duplicates: {train.shape}")
-            train = train[~train.index.isin(train_rows_to_remove.index)]
-            logger.info(f"Train dataset shape after clean fintech duplicates: {train.shape}")
-            evals = [df.query(f"{EVAL_SET_INDEX} == {i}") for i in df[EVAL_SET_INDEX].unique() if i != 0]
-            new_evals = []
-            for i, eval in enumerate(evals):
-                eval_rows_to_remove = pd.merge(eval.reset_index(), unique_keys_to_delete, on=personal_cols)
-                eval_rows_to_remove = eval_rows_to_remove.set_index(eval.index.name or "index")
-                eval_perc = len(eval_rows_to_remove) * 100 / len(eval)
+        # Filter rows with different target values within 60 days
+        nonunique_target_rows = nonunique_target_groups[nonunique_target_groups].reset_index().drop(columns=TARGET)
+        sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)
+        # Convert date columns for further checks
+        sub_df = DateTimeSearchKeyConverter(date_col, date_format=date_format, logger=logger, bundle=bundle).convert(
+            sub_df
+        )
+        grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
+        rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
+        if len(rows_with_diff_target) > 0:
+            unique_keys_to_delete = rows_with_diff_target[personal_cols].drop_duplicates()
+            rows_to_remove = pd.merge(segment_df.reset_index(), unique_keys_to_delete, on=personal_cols)
+            rows_to_remove = rows_to_remove.set_index(segment_df.index.name or "index")
+            perc = len(rows_to_remove) * 100 / len(segment_df)
+            if eval_index == 0:
+                msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
+                    perc, len(rows_to_remove), rows_to_remove.index.to_list()
+                )
+            else:
                 msg = bundle.get("dataset_eval_diff_target_duplicates_fintech").format(
-                    eval_perc, len(eval_rows_to_remove), i + 1, eval_rows_to_remove.index.to_list()
+                    perc, len(rows_to_remove), eval_index, rows_to_remove.index.to_list()
                 )
-                if not silent:
-                    print(msg)
-                if logger:
-                    logger.warning(msg)
-                logger.info(f"Eval {i + 1} dataset shape before clean fintech duplicates: {eval.shape}")
-                eval = eval[~eval.index.isin(eval_rows_to_remove.index)]
-                logger.info(f"Eval {i + 1} dataset shape after clean fintech duplicates: {eval.shape}")
-                new_evals.append(eval)
-            logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
-            df = pd.concat([train] + new_evals)
-            logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
-    return df
+            return segment_df[~segment_df.index.isin(rows_to_remove.index)], msg
+        return segment_df, None
+    # Process the train part separately
+    logger.info(f"Train dataset shape before clean fintech duplicates: {train_df.shape}")
+    train_df, train_warning = process_df(train_df)
+    if train_warning:
+        warning_messages.append(train_warning)
+    logger.info(f"Train dataset shape after clean fintech duplicates: {train_df.shape}")
+    # Process each eval_set part separately
+    new_eval_dfs = []
+    for i, eval_df in enumerate(eval_dfs, 1):
+        logger.info(f"Eval {i} dataset shape before clean fintech duplicates: {eval_df.shape}")
+        cleaned_eval_df, eval_warning = process_df(eval_df, i)
+        if eval_warning:
+            warning_messages.append(eval_warning)
+        logger.info(f"Eval {i} dataset shape after clean fintech duplicates: {cleaned_eval_df.shape}")
+        new_eval_dfs.append(cleaned_eval_df)
+    # Combine the processed train and eval parts back into one dataset
+    logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
+    if new_eval_dfs:
+        df = pd.concat([train_df] + new_eval_dfs)
+    else:
+        df = train_df
+    logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
+    return df, warning_messages
 def clean_full_duplicates(
-    df: pd.DataFrame, logger: Optional[Logger] = None, silent=False, bundle: ResourceBundle = None
-) -> pd.DataFrame:
+    df: pd.DataFrame, logger: Optional[Logger] = None, bundle: Optional[ResourceBundle] = None
+) -> Tuple[pd.DataFrame, Optional[str]]:
+    if logger is None:
+        logger = logging.getLogger()
+        logger.setLevel(logging.FATAL)
+    if bundle is None:
+        bundle = get_custom_bundle()
     nrows = len(df)
     if nrows == 0:
-        return df
+        return df, None
     # Remove full duplicates (exclude system_record_id, sort_id and eval_set_index)
     unique_columns = df.columns.tolist()
     if SYSTEM_RECORD_ID in unique_columns:
         unique_columns.remove(SYSTEM_RECORD_ID)
+    if ENTITY_SYSTEM_RECORD_ID in unique_columns:
+        unique_columns.remove(ENTITY_SYSTEM_RECORD_ID)
     if SORT_ID in unique_columns:
         unique_columns.remove(SORT_ID)
     if EVAL_SET_INDEX in unique_columns:
         unique_columns.remove(EVAL_SET_INDEX)
     logger.info(f"Dataset shape before clean duplicates: {df.shape}")
     # Train segment goes first so if duplicates are found in train and eval set
     # then we keep unique rows in train segment
@@ -155,11 +183,9 @@ def clean_full_duplicates(
     nrows_after_full_dedup = len(df)
     share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
     if share_full_dedup > 0:
-        msg = bundle.get("dataset_full_duplicates").format(share_full_dedup)
-        logger.warning(msg)
-        # if not silent_mode:
-        #     print(msg)
-        # self.warning_counter.increment()
+        logger.warning(bundle.get("dataset_full_duplicates").format(share_full_dedup))
+    msg = None
     if TARGET in df.columns:
         unique_columns.remove(TARGET)
         marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
@@ -170,13 +196,10 @@ def clean_full_duplicates(
             share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
             msg = bundle.get("dataset_diff_target_duplicates").format(share_tgt_dedup, num_dup_rows, dups_indices)
-            logger.warning(msg)
-            if not silent:
-                print(msg)
             df = df.drop_duplicates(subset=unique_columns, keep=False)
             logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")
-    return df
+    return df, msg
 def _get_column_by_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[str]:

upgini 1.1.280.dev0__py3-none-any.whl → 1.2.31a1__py3-none-any.whl

Potentially problematic release.

upgini 1.1.280.dev0__py3-none-any.whl → 1.2.31a1__py3-none-any.whl