upgini 1.2.86.dev1__py3-none-any.whl → 1.2.87__py3-none-any.whl

This diff shows the changes between two publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
upgini/metrics.py CHANGED
@@ -6,16 +6,26 @@ import re
  from collections import defaultdict
  from copy import deepcopy
  from dataclasses import dataclass
- from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union
+ from typing import (
+ Any,
+ Callable,
+ Dict,
+ List,
+ Literal,
+ Optional,
+ Protocol,
+ Tuple,
+ Union,
+ runtime_checkable,
+ )

  import lightgbm as lgb
  import numpy as np
  import pandas as pd
  from catboost import CatBoostClassifier, CatBoostRegressor
- from category_encoders.cat_boost import CatBoostEncoder
  from lightgbm import LGBMClassifier, LGBMRegressor
  from numpy import log1p
- from pandas.api.types import is_numeric_dtype, is_integer_dtype, is_float_dtype
+ from pandas.api.types import is_float_dtype, is_integer_dtype, is_numeric_dtype
  from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score

  from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
@@ -32,10 +42,7 @@ except ImportError:
  available_scorers = SCORERS
  from sklearn.metrics import mean_squared_error
  from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
- from sklearn.model_selection import ( # , TimeSeriesSplit
- BaseCrossValidator,
- TimeSeriesSplit,
- )
+ from sklearn.model_selection import BaseCrossValidator, TimeSeriesSplit

  from upgini.errors import ValidationError
  from upgini.metadata import ModelTaskType
@@ -57,6 +64,16 @@ CATBOOST_REGRESSION_PARAMS = {
  "allow_writing_files": False,
  }

+ CATBOOST_TS_PARAMS = {
+ "learning_rate": 0.05,
+ "early_stopping_rounds": 20,
+ "use_best_model": True,
+ "one_hot_max_size": 100,
+ "verbose": False,
+ "random_state": 42,
+ "allow_writing_files": False,
+ }
+
  CATBOOST_BINARY_PARAMS = {
  "iterations": 250,
  "learning_rate": 0.05,
@@ -311,6 +328,7 @@ class EstimatorWrapper:
  self.target_type = target_type
  self.add_params = add_params
  self.cv_estimators = None
+ self.cv_cat_encoders: Optional[List[Optional[HasTransform]]] = None
  self.groups = groups
  self.text_features = text_features
  self.logger = logger or logging.getLogger()
@@ -391,9 +409,7 @@ class EstimatorWrapper:
  self.converted_to_int.append(c)
  self.cat_features.remove(c)
  elif is_float_dtype(x[c]) or (x[c].dtype == "category" and is_float_dtype(x[c].cat.categories)):
- self.logger.info(
- f"Convert float cat feature {c} to string"
- )
+ self.logger.info(f"Convert float cat feature {c} to string")
  x[c] = x[c].astype(str)
  self.converted_to_str.append(c)
  elif x[c].dtype not in ["category", "int64"]:
@@ -439,7 +455,9 @@ class EstimatorWrapper:

  return x, y, {}

- def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
+ def calculate_shap(
+ self, x: pd.DataFrame, y: pd.Series, estimator, cat_encoder: Optional[HasTransform]
+ ) -> Optional[Dict[str, float]]:
  return None

  def cross_val_predict(
@@ -470,9 +488,11 @@ class EstimatorWrapper:
  fit_params=fit_params,
  return_estimator=True,
  error_score="raise",
+ random_state=DEFAULT_RANDOM_STATE,
  )
  metrics_by_fold = cv_results["test_score"]
  self.cv_estimators = cv_results["estimator"]
+ self.cv_cat_encoders = cv_results["cat_encoder"]

  self.check_fold_metrics(metrics_by_fold)

@@ -480,14 +500,14 @@ class EstimatorWrapper:

  splits = self.cv.split(x, y, groups)

- for estimator, split in zip(self.cv_estimators, splits):
+ for estimator, cat_encoder, split in zip(self.cv_estimators, self.cv_cat_encoders, splits):
  _, validation_idx = split
  cv_x = x.iloc[validation_idx]
  if isinstance(y, pd.Series):
  cv_y = y.iloc[validation_idx]
  else:
  cv_y = y[validation_idx]
- shaps = self.calculate_shap(cv_x, cv_y, estimator)
+ shaps = self.calculate_shap(cv_x, cv_y, estimator, cat_encoder)
  if shaps is not None:
  for feature, shap_value in shaps.items():
  shap_values_all_folds[feature].append(shap_value)
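
Note on the cross_val_predict change above: cv_results["cat_encoder"] implies a package-internal cross-validation helper that returns one fitted categorical encoder per fold alongside each fitted estimator (scikit-learn's cross_validate accepts no random_state argument and exposes no "cat_encoder" key). A minimal sketch of that per-fold pattern, with hypothetical names, might look like:

# Minimal sketch (not upgini's implementation) of collecting a fitted estimator and a
# fitted categorical encoder per CV fold, mirroring the cv_estimators/cv_cat_encoders pair.
from copy import deepcopy

def manual_cross_val(estimator, encoder, cv, x, y, cat_features):
    fold_estimators, fold_encoders = [], []
    for train_idx, _ in cv.split(x, y):
        x_train, y_train = x.iloc[train_idx].copy(), y.iloc[train_idx]
        enc = deepcopy(encoder)
        # Fit the encoder only on this fold's training part to avoid target leakage.
        x_train[cat_features] = enc.fit_transform(x_train[cat_features], y_train)
        fold_estimators.append(deepcopy(estimator).fit(x_train, y_train))
        fold_encoders.append(enc)
    return fold_estimators, fold_encoders
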
@@ -527,8 +547,19 @@ class EstimatorWrapper:
  metric, metric_std = roc_auc_score(y, x[baseline_score_column]), None
  else:
  metrics = []
- for est in self.cv_estimators:
- metrics.append(self.scorer(est, x, y))
+ for est, cat_encoder in zip(self.cv_estimators, self.cv_cat_encoders):
+ x_copy = x.copy()
+ if cat_encoder is not None:
+ if hasattr(cat_encoder, "feature_names_in_"):
+ encoded = cat_encoder.transform(x_copy[cat_encoder.feature_names_in_])
+ else:
+ encoded = cat_encoder.transform(x[self.cat_features])
+ if isinstance(self.cv, TimeSeriesSplit) or isinstance(self.cv, BlockedTimeSeriesSplit):
+ encoded = encoded.astype(int)
+ else:
+ encoded = encoded.astype("category")
+ x_copy[self.cat_features] = encoded
+ metrics.append(self.scorer(est, x_copy, y))

  metric, metric_std = self._calculate_metric_from_folds(metrics)
  return _CrossValResults(metric=metric, metric_std=metric_std, shap_values=None)
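
The scoring loop above re-applies each fold's fitted encoder to a copy of the data before calling the scorer, falling back to self.cat_features when the encoder exposes no feature_names_in_, and keeping integer codes only for time-series splits. Isolated as a hypothetical helper, the same logic reads:

# Illustrative helper (hypothetical name) matching the per-fold scoring logic above.
def encode_for_scoring(x, cat_encoder, cat_features, time_series_cv: bool):
    x_copy = x.copy()
    if cat_encoder is not None:
        cols = getattr(cat_encoder, "feature_names_in_", cat_features)
        encoded = cat_encoder.transform(x_copy[list(cols)])
        # Time-series CV keeps ordinal integer codes; otherwise use pandas categories.
        x_copy[cat_features] = encoded.astype(int) if time_series_cv else encoded.astype("category")
    return x_copy
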
@@ -551,7 +582,7 @@ class EstimatorWrapper:
  text_features: Optional[List[str]] = None,
  add_params: Optional[Dict[str, Any]] = None,
  groups: Optional[List[str]] = None,
- has_date: Optional[bool] = None,
+ has_time: bool = False,
  ) -> EstimatorWrapper:
  scorer, metric_name, multiplier = define_scorer(target_type, scoring)
  kwargs = {
@@ -568,7 +599,7 @@ class EstimatorWrapper:
  if estimator is None:
  if EstimatorWrapper.default_estimator == "catboost":
  logger.info("Using CatBoost as default estimator")
- params = {"has_time": has_date}
+ params = {"has_time": has_time}
  if target_type == ModelTaskType.MULTICLASS:
  params = _get_add_params(params, CATBOOST_MULTICLASS_PARAMS)
  params = _get_add_params(params, add_params)
@@ -578,7 +609,10 @@ class EstimatorWrapper:
  params = _get_add_params(params, add_params)
  estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
  elif target_type == ModelTaskType.REGRESSION:
- params = _get_add_params(params, CATBOOST_REGRESSION_PARAMS)
+ if not isinstance(cv, TimeSeriesSplit) and not isinstance(cv, BlockedTimeSeriesSplit):
+ params = _get_add_params(params, CATBOOST_TS_PARAMS)
+ else:
+ params = _get_add_params(params, CATBOOST_REGRESSION_PARAMS)
  params = _get_add_params(params, add_params)
  estimator = CatBoostWrapper(CatBoostRegressor(**params), **kwargs)
  else:
@@ -610,8 +644,8 @@ class EstimatorWrapper:
  estimator_copy = deepcopy(estimator)
  kwargs["estimator"] = estimator_copy
  if is_catboost_estimator(estimator):
- if has_date is not None:
- estimator_copy.set_params(has_time=has_date)
+ if has_time is not None:
+ estimator_copy.set_params(has_time=has_time)
  estimator = CatBoostWrapper(**kwargs)
  else:
  if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
@@ -769,15 +803,26 @@ class CatBoostWrapper(EstimatorWrapper):
  else:
  raise e

- def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
+ def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator, cat_encoder) -> Optional[Dict[str, float]]:
  try:
  from catboost import Pool

+ cat_features = None
+ if cat_encoder is not None:
+ if isinstance(self.cv, TimeSeriesSplit) or isinstance(self.cv, BlockedTimeSeriesSplit):
+ encoded = cat_encoder.transform(x[self.cat_features]).astype(int)
+ else:
+ encoded = cat_encoder.transform(x[self.cat_features])
+ cat_features = encoded.columns.to_list()
+ x[self.cat_features] = encoded
+ else:
+ cat_features = self.cat_features
+
  # Create Pool for fold data, if need (for example, when categorical features are present)
  fold_pool = Pool(
  x,
  y,
- cat_features=self.cat_features,
+ cat_features=cat_features,
  text_features=self.text_features,
  embedding_features=self.grouped_embedding_features,
  )
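
For CatBoost, per-fold SHAP values are computed from a Pool that carries the categorical, text and embedding feature lists. A hedged sketch of the general pattern (the Pool construction mirrors the diff; the aggregation around it is illustrative and assumes a binary or regression model):

# Hedged sketch of per-fold SHAP with CatBoost: build a Pool, then average absolute
# SHAP contributions per feature. get_feature_importance(type="ShapValues") is the
# standard CatBoost API; the surrounding wiring is illustrative.
import numpy as np
from catboost import Pool

def catboost_mean_abs_shap(estimator, x, y, cat_features):
    pool = Pool(x, y, cat_features=cat_features)
    shap = estimator.get_feature_importance(pool, type="ShapValues")
    shap = shap[:, :-1]  # last column is the expected-value (bias) term
    return dict(zip(x.columns, np.abs(shap).mean(axis=0)))
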
@@ -834,7 +879,6 @@ class LightGBMWrapper(EstimatorWrapper):
  text_features=text_features,
  logger=logger,
  )
- self.cat_encoder = None
  self.n_classes = None

  def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
@@ -846,10 +890,10 @@ class LightGBMWrapper(EstimatorWrapper):
  params["eval_metric"] = "auc"
  params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
  if self.cat_features:
- encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, cols=self.cat_features, return_df=True)
- encoded = encoder.fit_transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
- x[self.cat_features] = encoded
- self.cat_encoder = encoder
+ for c in self.cat_features:
+ if x[c].dtype != "category":
+ x[c] = x[c].astype("category")
+
  for c in x.columns:
  if x[c].dtype not in ["category", "int64", "float64", "bool"]:
  self.logger.warning(f"Feature {c} is not numeric and will be dropped")
@@ -859,15 +903,26 @@ class LightGBMWrapper(EstimatorWrapper):

  def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
  x, y_numpy, params = super()._prepare_to_calculate(x, y)
- if self.cat_features is not None and self.cat_encoder is not None:
- encoded = self.cat_encoder.transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
- x[self.cat_features] = encoded
+ if self.cat_features:
+ for c in self.cat_features:
+ if x[c].dtype != "category":
+ x[c] = x[c].astype("category")
  return x, y_numpy, params

- def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
+ def calculate_shap(
+ self, x: pd.DataFrame, y: pd.Series, estimator, cat_encoder: Optional[HasTransform]
+ ) -> Optional[Dict[str, float]]:
  try:
+ x_copy = x.copy()
+ if cat_encoder is not None:
+ if isinstance(self.cv, TimeSeriesSplit) or isinstance(self.cv, BlockedTimeSeriesSplit):
+ encoded = cat_encoder.transform(x_copy[self.cat_features]).astype(int)
+ else:
+ encoded = cat_encoder.transform(x_copy[self.cat_features]).astype("category")
+ x_copy[self.cat_features] = encoded
+
  shap_matrix = estimator.predict(
- x,
+ x_copy,
  predict_disable_shape_check=True,
  raw_score=True,
  pred_leaf=False,
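
For LightGBM, the wrapper obtains SHAP-style attributions through the estimator's predict call on the encoded copy of the fold data. A minimal, hedged sketch of the underlying pred_contrib mechanism (names and aggregation are illustrative):

# Hedged sketch of per-feature attributions with LightGBM's pred_contrib: the last
# column of the returned matrix is the expected value, the rest are per-feature
# contributions. Assumes a binary or regression model (single output block).
import numpy as np

def lightgbm_mean_abs_contrib(estimator, x):
    contrib = np.asarray(estimator.predict(x, pred_contrib=True))[:, :-1]
    return dict(zip(x.columns, np.abs(contrib).mean(axis=0)))
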
@@ -926,10 +981,10 @@ class OtherEstimatorWrapper(EstimatorWrapper):
  num_features = [col for col in x.columns if col not in self.cat_features]
  x[num_features] = x[num_features].fillna(-999)
  if self.cat_features:
- encoder = CatBoostEncoder(random_state=DEFAULT_RANDOM_STATE, return_df=True)
- encoded = encoder.fit_transform(x[self.cat_features].astype("object"), y_numpy).astype("category")
- x[self.cat_features] = encoded
- self.cat_encoder = encoder
+ for c in self.cat_features:
+ if x[c].dtype != "category":
+ x[c] = x[c].astype("category")
+ params["cat_features"] = self.cat_features
  for c in x.columns:
  if x[c].dtype not in ["category", "int64", "float64", "bool"]:
  self.logger.warning(f"Feature {c} is not numeric and will be dropped")
@@ -940,15 +995,22 @@ class OtherEstimatorWrapper(EstimatorWrapper):
  def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
  x, y_numpy, params = super()._prepare_to_calculate(x, y)
  if self.cat_features is not None:
+ for c in self.cat_features:
+ if x[c].dtype != "category":
+ x[c] = x[c].astype("category")
  num_features = [col for col in x.columns if col not in self.cat_features]
- x[num_features] = x[num_features].fillna(-999)
- if self.cat_features and self.cat_encoder is not None:
- x[self.cat_features] = self.cat_encoder.transform(
- x[self.cat_features].astype("object"), y_numpy
- ).astype("category")
+ else:
+ num_features = x.columns
+ x[num_features] = x[num_features].fillna(-999)
+
  return x, y_numpy, params


+ @runtime_checkable
+ class HasTransform(Protocol):
+ def transform(self, X: pd.DataFrame, y: Optional[Union[pd.Series, np.ndarray]] = None) -> pd.DataFrame: ...
+
+
  def validate_scoring_argument(scoring: Union[Callable, str, None]):
  if scoring is None:
  return
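
The HasTransform protocol added above gives the encoder annotations a structural type: any object with a transform method satisfies it, and runtime_checkable makes isinstance checks possible (they verify only that the attribute exists, not its signature). A small self-contained sketch with hypothetical names:

# Minimal sketch of how a runtime_checkable Protocol like HasTransform behaves:
# isinstance() only checks that a .transform attribute exists, not its signature.
from typing import Protocol, runtime_checkable

@runtime_checkable
class SupportsTransform(Protocol):
    def transform(self, X): ...

class DummyEncoder:
    def transform(self, X):
        return X

print(isinstance(DummyEncoder(), SupportsTransform))  # True, purely structural
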
@@ -68,6 +68,8 @@ too_many_generate_features=Too many columns passed in `generate_features` argume
  invalid_round_embeddings=Argument `round_embeddings` should be non negative integer
  no_important_features_for_transform=There are no important features for transform. Return input as transformed
  search_task_not_initial=Passed search_id {} is transform id. Please use search task id of fit call: {}.
+ binary_target_unique_count_not_2=Binary target should contain only 2 unique values, but {} found
+ binary_target_eval_unique_count_not_2=Binary target should contain only 2 unique values, but {} found in eval_set

  # Validation errors
  # params validation
@@ -156,7 +158,7 @@ dataset_too_few_rows=X size should be at least {} rows after validation
  dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample X
  dataset_empty_column_names=Some column names are empty. Add names please
  dataset_full_duplicates={:.5f}% of the rows are fully duplicated
- dataset_diff_target_duplicates={:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
+ dataset_diff_target_duplicates={:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nSample of incorrect row indexes: {}
  dataset_train_diff_target_duplicates_fintech={:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
  dataset_eval_diff_target_duplicates_fintech={:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
  dataset_drop_old_dates=We don't have data before '2000-01-01' and removed all earlier records from the search dataset
@@ -41,6 +41,7 @@ class DateTimeSearchKeyConverter:
  date_format: Optional[str] = None,
  logger: Optional[logging.Logger] = None,
  bundle: Optional[ResourceBundle] = None,
+ generate_cyclical_features: bool = True,
  ):
  self.date_column = date_column
  self.date_format = date_format
@@ -51,6 +52,7 @@ class DateTimeSearchKeyConverter:
  self.logger.setLevel("FATAL")
  self.generated_features: List[str] = []
  self.bundle = bundle or get_custom_bundle()
+ self.generate_cyclical_features = generate_cyclical_features
  self.has_old_dates = False

  @staticmethod
@@ -121,61 +123,63 @@ class DateTimeSearchKeyConverter:
  df[cos_feature] = np.cos(2 * np.pi * df[column] / period)
  self.generated_features.append(cos_feature)

- # df["quarter"] = df[self.date_column].dt.quarter
+ if self.generate_cyclical_features:

- # # Calculate the start date of the quarter for each timestamp
- # df["quarter_start"] = df[self.date_column].dt.to_period("Q").dt.start_time
+ df["quarter"] = df[self.date_column].dt.quarter

- # # Calculate the day in the quarter
- # df["day_in_quarter"] = (df[self.date_column] - df["quarter_start"]).dt.days + 1
+ # Calculate the start date of the quarter for each timestamp
+ df["quarter_start"] = df[self.date_column].dt.to_period("Q").dt.start_time

- # # Vectorized calculation of days_in_quarter
- # quarter = df["quarter"]
- # start = df["quarter_start"]
- # year = start.dt.year
- # month = start.dt.month
+ # Calculate the day in the quarter
+ df["day_in_quarter"] = (df[self.date_column] - df["quarter_start"]).dt.days + 1

- # quarter_end_year = np.where(quarter == 4, year + 1, year)
- # quarter_end_month = np.where(quarter == 4, 1, month + 3)
+ # Vectorized calculation of days_in_quarter
+ quarter = df["quarter"]
+ start = df["quarter_start"]
+ year = start.dt.year
+ month = start.dt.month

- # end = pd.to_datetime({"year": quarter_end_year, "month": quarter_end_month, "day": 1})
- # end.index = df.index
+ quarter_end_year = np.where(quarter == 4, year + 1, year)
+ quarter_end_month = np.where(quarter == 4, 1, month + 3)

- # df["days_in_quarter"] = (end - start).dt.days
+ end = pd.to_datetime({"year": quarter_end_year, "month": quarter_end_month, "day": 1})
+ end.index = df.index

- # add_cyclical_features(df, "day_in_quarter", df["days_in_quarter"]) # Days in the quarter
+ df["days_in_quarter"] = (end - start).dt.days

- # df.drop(columns=["quarter", "quarter_start", "day_in_quarter", "days_in_quarter"], inplace=True)
+ add_cyclical_features(df, "day_in_quarter", df["days_in_quarter"]) # Days in the quarter

- df[seconds] = (df[self.date_column] - df[self.date_column].dt.floor("D")).dt.seconds
+ df.drop(columns=["quarter", "quarter_start", "day_in_quarter", "days_in_quarter"], inplace=True)

- seconds_without_na = df[seconds].dropna()
- if (seconds_without_na != 0).any() and seconds_without_na.nunique() > 1:
- self.logger.info("Time found in date search key. Add extra features based on time")
+ df[seconds] = (df[self.date_column] - df[self.date_column].dt.floor("D")).dt.seconds

- # Extract basic components
- df["second"] = df[self.date_column].dt.second
- df["minute"] = df[self.date_column].dt.minute
- df["hour"] = df[self.date_column].dt.hour
+ seconds_without_na = df[seconds].dropna()
+ if (seconds_without_na != 0).any() and seconds_without_na.nunique() > 1:
+ self.logger.info("Time found in date search key. Add extra features based on time")

- # Apply cyclical transformations
- add_cyclical_features(df, "second", 60) # Seconds in a minute
- add_cyclical_features(df, "minute", 60) # Minutes in an hour
- add_cyclical_features(df, "minute", 30) # Minutes in half an hour
- add_cyclical_features(df, "hour", 24) # Hours in a day
+ # Extract basic components
+ df["second"] = df[self.date_column].dt.second
+ df["minute"] = df[self.date_column].dt.minute
+ df["hour"] = df[self.date_column].dt.hour

- # Drop intermediate columns if not needed
- df.drop(columns=["second", "minute", "hour"], inplace=True)
- else:
- keep_time = False
+ # Apply cyclical transformations
+ add_cyclical_features(df, "second", 60) # Seconds in a minute
+ add_cyclical_features(df, "minute", 60) # Minutes in an hour
+ add_cyclical_features(df, "minute", 30) # Minutes in half an hour
+ add_cyclical_features(df, "hour", 24) # Hours in a day
+
+ # Drop intermediate columns if not needed
+ df.drop(columns=["second", "minute", "hour"], inplace=True)
+ else:
+ keep_time = False

- for generated_feature in self.generated_features[:]:
- if df[generated_feature].dropna().nunique() <= 1:
- self.logger.warning(f"Generated constant feature {generated_feature} will be dropped")
- df.drop(columns=generated_feature, inplace=True)
- self.generated_features.remove(generated_feature)
+ for generated_feature in self.generated_features[:]:
+ if df[generated_feature].dropna().nunique() <= 1:
+ self.logger.warning(f"Generated constant feature {generated_feature} will be dropped")
+ df.drop(columns=generated_feature, inplace=True)
+ self.generated_features.remove(generated_feature)

- df.drop(columns=seconds, inplace=True)
+ df.drop(columns=seconds, inplace=True)

  if keep_time:
  df[self.DATETIME_COL] = df[self.date_column].astype(np.int64) // 1_000_000
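
The block above un-comments the quarter-based features and gates all cyclical date features behind generate_cyclical_features. The encoding itself is the usual sin/cos pair over a known period, as in the np.cos(2 * np.pi * df[column] / period) line at the top of the hunk. A standalone sketch (helper and column names are illustrative):

# Standalone sketch of sin/cos cyclical encoding, matching the pattern visible in the diff.
import numpy as np
import pandas as pd

def add_cyclical_pair(df: pd.DataFrame, column: str, period) -> None:
    df[f"{column}_sin"] = np.sin(2 * np.pi * df[column] / period)
    df[f"{column}_cos"] = np.cos(2 * np.pi * df[column] / period)

df = pd.DataFrame({"hour": [0, 6, 12, 18, 23]})
add_cyclical_pair(df, "hour", 24)  # 23:00 ends up close to 00:00 in (sin, cos) space
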
@@ -247,99 +251,107 @@ def is_time_series(df: pd.DataFrame, date_col: str) -> bool:


  def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[str]) -> bool:
- df = df.copy()
- seconds = "datetime_seconds"
- if isinstance(df[date_col].dtype, pd.PeriodDtype):
- df[date_col] = df[date_col].dt.to_timestamp()
- else:
- df[date_col] = pd.to_datetime(df[date_col])
- df[date_col] = df[date_col].dt.tz_localize(None)
- df[seconds] = (df[date_col] - df[date_col].dt.floor("D")).dt.seconds
-
- seconds_without_na = df[seconds].dropna()
- columns_to_drop = [c for c in search_keys if c != date_col] + [seconds]
- df.drop(columns=columns_to_drop, inplace=True)
- # Date, not datetime
- if (seconds_without_na != 0).any() and seconds_without_na.nunique() > 1:
- return False
+ try:
+ df = df.copy()
+ seconds = "datetime_seconds"
+ if isinstance(df[date_col].dtype, pd.PeriodDtype):
+ df[date_col] = df[date_col].dt.to_timestamp()
+ elif is_numeric_dtype(df[date_col]):
+ df[date_col] = pd.to_datetime(df[date_col], unit="ms")
+ else:
+ df[date_col] = pd.to_datetime(df[date_col])
+ df[date_col] = df[date_col].dt.tz_localize(None)
+ df[seconds] = (df[date_col] - df[date_col].dt.floor("D")).dt.seconds

- nunique_dates = df[date_col].nunique()
- # Unique dates count more than 270
- if nunique_dates < 270:
- return False
+ seconds_without_na = df[seconds].dropna()
+ columns_to_drop = [c for c in search_keys if c != date_col] + [seconds]
+ df.drop(columns=columns_to_drop, inplace=True)
+ # Date, not datetime
+ if (seconds_without_na != 0).any() and seconds_without_na.nunique() > 1:
+ return False

- min_date = df[date_col].min()
- max_date = df[date_col].max()
- days_delta = (max_date - min_date).days + 1
- # Missing dates less than 30% (unique dates count and days delta between earliest and latest dates)
- if nunique_dates / days_delta < 0.3:
- return False
+ nunique_dates = df[date_col].nunique()
+ # Unique dates count more than 270
+ if nunique_dates < 270:
+ return False
+
+ min_date = df[date_col].min()
+ max_date = df[date_col].max()
+ days_delta = (max_date - min_date).days + 1
+ # Missing dates less than 30% (unique dates count and days delta between earliest and latest dates)
+ if nunique_dates / days_delta < 0.3:
+ return False

- accumulated_changing_columns = set()
+ accumulated_changing_columns = set()

- def check_differences(group: pd.DataFrame):
- changing_columns = group.columns[group.nunique(dropna=False) > 1].to_list()
- accumulated_changing_columns.update(changing_columns)
+ def check_differences(group: pd.DataFrame):
+ changing_columns = group.columns[group.nunique(dropna=False) > 1].to_list()
+ accumulated_changing_columns.update(changing_columns)

- def is_multiple_rows(group: pd.DataFrame) -> bool:
- return group.shape[0] > 1
+ def is_multiple_rows(group: pd.DataFrame) -> bool:
+ return group.shape[0] > 1

- grouped = df.groupby(date_col)[[c for c in df.columns if c != date_col]]
- dates_with_multiple_rows = grouped.apply(is_multiple_rows).sum()
+ grouped = df.groupby(date_col)[[c for c in df.columns if c != date_col]]
+ dates_with_multiple_rows = grouped.apply(is_multiple_rows).sum()

- # share of dates with more than one record is more than 99%
- if dates_with_multiple_rows / nunique_dates < 0.99:
- return False
+ # share of dates with more than one record is more than 99%
+ if dates_with_multiple_rows / nunique_dates < 0.99:
+ return False

- if df.shape[1] <= 3:
- return True
+ if df.shape[1] <= 3:
+ return True

- grouped.apply(check_differences)
- return len(accumulated_changing_columns) <= 2
+ grouped.apply(check_differences)
+ return len(accumulated_changing_columns) <= 2
+ except Exception:
+ return False


  def is_dates_distribution_valid(
  df: pd.DataFrame,
  search_keys: Dict[str, SearchKey],
  ) -> bool:
- maybe_date_col = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
+ try:
+ maybe_date_col = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])

- if EVAL_SET_INDEX in df.columns:
- X = df.query(f"{EVAL_SET_INDEX} == 0")
- else:
- X = df
+ if EVAL_SET_INDEX in df.columns:
+ X = df.query(f"{EVAL_SET_INDEX} == 0")
+ else:
+ X = df

- if maybe_date_col is None:
- for col in X.columns:
- if col in search_keys:
- continue
- try:
- if isinstance(X[col].dtype, pd.PeriodDtype):
+ if maybe_date_col is None:
+ for col in X.columns:
+ if col in search_keys:
+ continue
+ try:
+ if isinstance(X[col].dtype, pd.PeriodDtype):
+ pass
+ elif pd.__version__ >= "2.0.0":
+ # Format mixed to avoid massive warnings
+ pd.to_datetime(X[col], format="mixed")
+ else:
+ pd.to_datetime(X[col])
+ maybe_date_col = col
+ break
+ except Exception:
  pass
- elif pd.__version__ >= "2.0.0":
- # Format mixed to avoid massive warnings
- pd.to_datetime(X[col], format="mixed")
- else:
- pd.to_datetime(X[col])
- maybe_date_col = col
- break
- except Exception:
- pass
-
- if maybe_date_col is None:
- return
-
- if isinstance(X[maybe_date_col].dtype, pd.PeriodDtype):
- dates = X[maybe_date_col].dt.to_timestamp().dt.date
- elif pd.__version__ >= "2.0.0":
- dates = pd.to_datetime(X[maybe_date_col], format="mixed").dt.date
- else:
- dates = pd.to_datetime(X[maybe_date_col]).dt.date
-
- date_counts = dates.value_counts().sort_index()
-
- date_counts_1 = date_counts[: round(len(date_counts) / 2)]
- date_counts_2 = date_counts[round(len(date_counts) / 2) :]
- ratio = date_counts_2.mean() / date_counts_1.mean()
-
- return ratio >= 0.8 and ratio <= 1.2
+
+ if maybe_date_col is None:
+ return
+
+ if isinstance(X[maybe_date_col].dtype, pd.PeriodDtype):
+ dates = X[maybe_date_col].dt.to_timestamp().dt.date
+ elif pd.__version__ >= "2.0.0":
+ dates = pd.to_datetime(X[maybe_date_col], format="mixed").dt.date
+ else:
+ dates = pd.to_datetime(X[maybe_date_col]).dt.date
+
+ date_counts = dates.value_counts().sort_index()
+
+ date_counts_1 = date_counts[: round(len(date_counts) / 2)]
+ date_counts_2 = date_counts[round(len(date_counts) / 2) :]
+ ratio = date_counts_2.mean() / date_counts_1.mean()
+
+ return ratio >= 0.8 and ratio <= 1.2
+ except Exception:
+ return False
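
is_dates_distribution_valid (now wrapped in a broad try/except that returns False on any failure) accepts a dataset when the mean daily row count of the second half of the date range stays within 20% of the first half's mean. Condensed into a hypothetical standalone helper:

# Condensed sketch of the date-distribution check above; the helper name is illustrative.
import pandas as pd

def dates_evenly_distributed(dates: pd.Series) -> bool:
    counts = pd.to_datetime(dates).dt.date.value_counts().sort_index()
    half = round(len(counts) / 2)
    ratio = counts.iloc[half:].mean() / counts.iloc[:half].mean()
    return 0.8 <= ratio <= 1.2
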
@@ -104,9 +104,9 @@ def remove_fintech_duplicates(
  sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)

  # Convert date columns for further checks
- sub_df = DateTimeSearchKeyConverter(date_col, date_format=date_format, logger=logger, bundle=bundle).convert(
- sub_df
- )
+ sub_df = DateTimeSearchKeyConverter(
+ date_col, date_format=date_format, logger=logger, bundle=bundle, generate_cyclical_features=False
+ ).convert(sub_df)
  grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
  rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)

@@ -192,7 +192,7 @@ def clean_full_duplicates(
  unique_columns.remove(TARGET)
  marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
  if marked_duplicates.sum() > 0:
- dups_indices = df[marked_duplicates].index.to_list()
+ dups_indices = df[marked_duplicates].index.to_list()[:100]
  nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
  num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
  share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup