upgini 1.1.244a13__py3-none-any.whl → 1.1.244a15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic.

upgini/dataset.py CHANGED
@@ -36,14 +36,12 @@ from upgini.metadata import (
     NumericInterval,
     RuntimeParameters,
     SearchCustomization,
-    SearchKey,
 )
 from upgini.normalizer.phone_normalizer import PhoneNormalizer
 from upgini.resource_bundle import bundle
 from upgini.sampler.random_under_sampler import RandomUnderSampler
 from upgini.search_task import SearchTask
 from upgini.utils import combine_search_keys
-from upgini.utils.deduplicate_utils import remove_fintech_duplicates
 from upgini.utils.email_utils import EmailSearchKeyConverter
 
 try:
@@ -817,19 +815,19 @@ class Dataset:  # (pd.DataFrame):
 
         self.__convert_features_types()
 
-        search_keys = {
-            col: SearchKey.from_meaning_type(key_type)
-            for col, key_type in self.meaning_types.items()
-            if SearchKey.from_meaning_type(key_type) is not None
-        }
+        # search_keys = {
+        #     col: SearchKey.from_meaning_type(key_type)
+        #     for col, key_type in self.meaning_types.items()
+        #     if SearchKey.from_meaning_type(key_type) is not None
+        # }
 
-        if validate_target:
-            need_full_defuplication, self.data = remove_fintech_duplicates(self.data, search_keys, self.logger)
-        else:
-            need_full_defuplication = True
+        # if validate_target:
+        #     need_full_defuplication, self.data = remove_fintech_duplicates(self.data, search_keys, self.logger)
+        # else:
+        #     need_full_defuplication = True
 
-        if need_full_defuplication:
-            self.__clean_duplicates(silent_mode)
+        # if need_full_defuplication:
+        #     self.__clean_duplicates(silent_mode)
 
         self.__validate_dataset(validate_target, silent_mode)
 
upgini/features_enricher.py CHANGED
@@ -65,7 +65,10 @@ from upgini.utils.datetime_utils import (
     is_blocked_time_series,
     is_time_series,
 )
-from upgini.utils.deduplicate_utils import remove_fintech_duplicates
+from upgini.utils.deduplicate_utils import (
+    clean_full_duplicates,
+    remove_fintech_duplicates,
+)
 from upgini.utils.display_utils import (
     display_html_dataframe,
     do_without_pandas_limits,
@@ -1850,6 +1853,8 @@ class FeaturesEnricher(TransformerMixin):
         )
         meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
 
+        df = clean_full_duplicates(df, self.logger, silent=silent_mode)
+
         df = df.reset_index(drop=True)
         system_columns_with_original_index = [SYSTEM_RECORD_ID] + generated_features
         df_with_original_index = df[system_columns_with_original_index].copy()
@@ -2131,6 +2136,10 @@ class FeaturesEnricher(TransformerMixin):
 
         df = self.__add_country_code(df, self.fit_search_keys)
 
+        need_full_defuplication, df = remove_fintech_duplicates(df, self.fit_search_keys, self.logger)
+        if need_full_defuplication:
+            df = clean_full_duplicates(df, self.logger)
+
         date_column = self._get_date_column(self.fit_search_keys)
         self.__adjust_cv(df, date_column, model_task_type)
 
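Taken together, the two calls above form a two-stage cleanup during fit: remove_fintech_duplicates drops rows whose personal search keys repeat with contradictory targets, and signals via its first return value whether a full-row pass is still needed. A minimal sketch of that calling contract, with hypothetical column names ("phone", "date", "target") and toy data not taken from this diff:

import logging
import pandas as pd
from upgini.metadata import SearchKey
from upgini.utils.deduplicate_utils import clean_full_duplicates, remove_fintech_duplicates

logger = logging.getLogger("upgini")
df = pd.DataFrame({
    "phone": ["+15551234567", "+15551234567", "+15557654321"],
    "date": ["2023-01-01", "2023-01-15", "2023-02-01"],
    "target": [0, 1, 0],
})
search_keys = {"phone": SearchKey.PHONE, "date": SearchKey.DATE}

# Stage 1: resolve conflicting targets behind repeated personal keys
need_full_dedup, df = remove_fintech_duplicates(df, search_keys, logger)
# Stage 2: full-row deduplication only when stage 1 asks for it
if need_full_dedup:
    df = clean_full_duplicates(df, logger)
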
upgini/metrics.py CHANGED
@@ -203,6 +203,7 @@ class EstimatorWrapper:
         add_params: Optional[Dict[str, Any]] = None,
         groups: Optional[np.ndarray] = None,
         text_features: Optional[List[str]] = None,
+        logger: Optional[logging.Logger] = None,
     ):
         self.estimator = estimator
         self.scorer = scorer
@@ -216,6 +217,7 @@ class EstimatorWrapper:
         self.cv_estimators = None
         self.groups = groups
         self.text_features = text_features
+        self.logger = logger or logging.getLogger()
 
     def fit(self, X: pd.DataFrame, y: np.ndarray, **kwargs):
         X, y, _, fit_params = self._prepare_to_fit(X, y)
@@ -411,13 +413,14 @@ class CatBoostWrapper(EstimatorWrapper):
         emb_pattern = r"(.+)_emb\d+"
         self.emb_features = [c for c in X.columns if re.match(emb_pattern, c) and is_numeric_dtype(X[c])]
         embedding_features = []
-        if len(self.emb_features) > 1:
+        if len(self.emb_features) > 3:  # There is no reason to reduce embeddings dimension with less than 4
             X, embedding_features = self.group_embeddings(X)
             params["embedding_features"] = embedding_features
 
         # Find text features from passed in generate_features
         if self.text_features is not None:
             self.text_features = [f for f in self.text_features if not is_numeric_dtype(X[f])]
+
             params["text_features"] = self.text_features
 
         # Find rest categorical features
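The raised threshold gates self.group_embeddings, whose body is not part of this diff. A plausible minimal sketch of that packing step, assuming CatBoost's convention of one array-valued column per embedding passed via embedding_features (the function name here is illustrative, not the library's):

import re
import pandas as pd

def pack_embeddings(X: pd.DataFrame):
    # Collect the generated "<name>_emb<N>" scalar columns matched above
    emb_cols = [c for c in X.columns if re.match(r"(.+)_emb\d+", c)]
    X = X.copy()
    # Fold them into a single list-valued column that CatBoost can
    # consume through the embedding_features parameter
    X["embeddings"] = X[emb_cols].to_numpy().tolist()
    return X.drop(columns=emb_cols), ["embeddings"]
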
upgini/utils/deduplicate_utils.py CHANGED
@@ -3,7 +3,7 @@ from typing import Dict, List, Optional, Tuple, Union
 
 import pandas as pd
 
-from upgini.metadata import TARGET, ModelTaskType, SearchKey
+from upgini.metadata import SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
 from upgini.resource_bundle import bundle
 from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
 from upgini.utils.target_utils import define_task
@@ -35,8 +35,15 @@ def remove_fintech_duplicates(
     if len(personal_cols) == 0:
         return need_full_deduplication, df
 
-    grouped_by_personal_cols = df.groupby(personal_cols, group_keys=False)
+    sub_df = df[personal_cols + [date_col, TARGET]]
 
+    # Fast check for duplicates by personal keys
+    if not sub_df[personal_cols].duplicated().any():
+        return need_full_deduplication, df
+
+    grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
+
+    # counts of diff dates by set of personal keys
     uniques = grouped_by_personal_cols[date_col].nunique()
     total = len(uniques)
     diff_dates = len(uniques[uniques > 1])
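The added pre-check avoids a groupby when no personal key repeats at all: pandas duplicated() over the key columns is a single cheap pass, while the per-group nunique() is only computed once duplicates are known to exist. A toy illustration of both steps (data and key names are made up):

import pandas as pd

sub_df = pd.DataFrame({
    "phone": ["p1", "p1", "p2", "p2"],
    "date": ["2023-01-01", "2023-01-09", "2023-03-01", "2023-03-01"],
})
# Cheap early exit: skip grouping entirely when keys never repeat
if sub_df[["phone"]].duplicated().any():
    # Count distinct dates per key, as in the hunk above
    uniques = sub_df.groupby(["phone"])["date"].nunique()
    print(uniques[uniques > 1])  # only p1 spans two distinct dates
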
@@ -47,35 +54,83 @@ def remove_fintech_duplicates(
 
     need_full_deduplication = False
 
-    duplicates = df.duplicated(personal_cols, keep=False)
-    duplicate_rows = df[duplicates]
+    duplicates = sub_df.duplicated(personal_cols, keep=False)
+    duplicate_rows = sub_df[duplicates]
     if len(duplicate_rows) == 0:
         return need_full_deduplication, df
 
-    if grouped_by_personal_cols[TARGET].apply(lambda x: len(x.unique()) == 1).all():
+    # if there is no different target values in personal keys duplicate rows
+    nonunique_target_groups = grouped_by_personal_cols[TARGET].nunique() > 1
+    if nonunique_target_groups.sum() == 0:
         return need_full_deduplication, df
 
     def has_diff_target_within_60_days(rows):
         rows = rows.sort_values(by=date_col)
         return len(rows[rows[TARGET].ne(rows[TARGET].shift()) & (rows[date_col].diff() < 60 * 24 * 60 * 60 * 1000)]) > 0
 
-    df = DateTimeSearchKeyConverter(date_col).convert(df)
-    grouped_by_personal_cols = df.groupby(personal_cols, group_keys=False)
+    nonunique_target_rows = nonunique_target_groups[nonunique_target_groups].reset_index().drop(columns=TARGET)
+    sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)
+
+    sub_df = DateTimeSearchKeyConverter(date_col).convert(sub_df)
+    grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
     rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
     if len(rows_with_diff_target) > 0:
-        perc = len(rows_with_diff_target) * 100 / len(df)
+        unique_keys_to_delete = rows_with_diff_target[personal_cols].drop_duplicates()
+        rows_to_remove = pd.merge(df.reset_index(), unique_keys_to_delete, on=personal_cols)
+        rows_to_remove = rows_to_remove.set_index(df.index.name or "index")
+        perc = len(rows_to_remove) * 100 / len(df)
         msg = bundle.get("dataset_diff_target_duplicates_fintech").format(
-            perc, len(rows_with_diff_target), rows_with_diff_target.index.to_list()
+            perc, len(rows_to_remove), rows_to_remove.index.to_list()
         )
         if not silent:
             print(msg)
         if logger:
             logger.warning(msg)
-        df = df[~df.index.isin(rows_with_diff_target.index)]
+            logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
+        df = df[~df.index.isin(rows_to_remove.index)]
+        logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
 
     return need_full_deduplication, df
 
 
+def clean_full_duplicates(
+    df: pd.DataFrame, logger: Optional[Logger] = None, silent=False
+) -> pd.DataFrame:
+    nrows = len(df)
+    if nrows == 0:
+        return
+    # Remove absolute duplicates (exclude system_record_id)
+    unique_columns = df.columns.tolist()
+    if SYSTEM_RECORD_ID in unique_columns:
+        unique_columns.remove(SYSTEM_RECORD_ID)
+    logger.info(f"Dataset shape before clean duplicates: {df.shape}")
+    df = df.drop_duplicates(subset=unique_columns)
+    logger.info(f"Dataset shape after clean duplicates: {df.shape}")
+    nrows_after_full_dedup = len(df)
+    share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
+    if share_full_dedup > 0:
+        msg = bundle.get("dataset_full_duplicates").format(share_full_dedup)
+        logger.warning(msg)
+        # if not silent_mode:
+        #     print(msg)
+        #     self.warning_counter.increment()
+    if TARGET in df.columns:
+        unique_columns.remove(TARGET)
+        marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
+        if marked_duplicates.sum() > 0:
+            dups_indices = df[marked_duplicates].index.to_list()
+            nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns))
+            num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
+            share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
+
+            msg = bundle.get("dataset_diff_target_duplicates").format(share_tgt_dedup, num_dup_rows, dups_indices)
+            logger.warning(msg)
+            if not silent:
+                print(msg)
+            df = df.drop_duplicates(subset=unique_columns, keep=False)
+            logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")
+
+
 def _get_column_by_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[str]:
     for col, key_type in search_keys.items():
         if (isinstance(keys, list) and key_type in keys) or key_type == keys:
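As a behavior sketch of clean_full_duplicates on toy data (column "a" stands in for all non-system, non-target columns): exact duplicates keep their first occurrence, while rows that agree on features but disagree on the target are dropped entirely via keep=False:

import pandas as pd

df = pd.DataFrame({
    "a": [1, 1, 2, 2, 3],
    "target": [0, 0, 0, 1, 1],
})
# First pass: exact duplicates -> keep the first occurrence
df = df.drop_duplicates()  # removes one (1, 0) row
# Second pass: same features, conflicting target -> drop all of them
df = df.drop_duplicates(subset=["a"], keep=False)  # removes both a == 2 rows
print(df)  # rows (1, 0) and (3, 1) remain
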
upgini-1.1.244a13.dist-info/METADATA → upgini-1.1.244a15.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: upgini
-Version: 1.1.244a13
+Version: 1.1.244a15
 Summary: Intelligent data search & enrichment for Machine Learning
 Home-page: https://upgini.com/
 Author: Upgini Developers
upgini-1.1.244a13.dist-info/RECORD → upgini-1.1.244a15.dist-info/RECORD RENAMED
@@ -1,12 +1,12 @@
 upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
 upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
-upgini/dataset.py,sha256=2oOmBe8_mpwJ8Fw14gw4uZ1GgLU4PtjozkXhvIXhRq0,50022
+upgini/dataset.py,sha256=WGpnmpnmfdyB2DAwaj7mkk2s0e-6Z6bg5BWj1lUE2p0,49960
 upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
-upgini/features_enricher.py,sha256=0dcpk0jFmvDrMmjMD2XlJhiW7la8YCKBbWEGJQSA7Uc,165283
+upgini/features_enricher.py,sha256=a3RBqMMxY3lH6bkvc20I3zyL5oQF3VUDUIvwqgtzxxA,165592
 upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
 upgini/http.py,sha256=eSG4gOpmCGlXmB6KIPNzAG8tRZNUjyYpMeUeHw_2li4,42264
 upgini/metadata.py,sha256=55t0uQI910tzTcnwxZCUL1413BhTiSm8oqiwp-94NyA,9613
-upgini/metrics.py,sha256=3LP_7yo1LYCllxI5E_eorrcTTX2MTkSsQwydQTlenbo,25952
+upgini/metrics.py,sha256=BCEotBr4_PCfUheswZ_FPAj6Lk_P-iyl9Qfi8WqdbqY,26136
 upgini/search_task.py,sha256=5n4qGJmtu48s0-FHAtF3L5qVLMd1JVW3FJlM8dFbh-s,17063
 upgini/spinner.py,sha256=Dm1dQ5F_z_Ua2odLxZX7OypcOX9tSx_vE5MGaKtUmfw,1118
 upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
@@ -40,7 +40,7 @@ upgini/utils/country_utils.py,sha256=1KXhLSNqkNYVL3on8-zK0Arc_SspUH7AMZvGZICysOU
 upgini/utils/custom_loss_utils.py,sha256=DBslpjWGPt7xTeypt78baR59012SYphbPsO_YLKdilo,3972
 upgini/utils/cv_utils.py,sha256=6pSSL_Ft_8C6n6aInJeiyeSBD7McjsMxKZpHqSBV0uY,2491
 upgini/utils/datetime_utils.py,sha256=awsLpnFjBNcrsCDyyiiJLicHgHiGCNAwi0UOwRKGD7s,8645
-upgini/utils/deduplicate_utils.py,sha256=nFRHUanDqCURk1tF7nuLzHqmpo8pJOW-UMEj_3PTBDg,3083
+upgini/utils/deduplicate_utils.py,sha256=OxJ3ygvRQL5H_h2Kn0mwRaj5Ux8FmCQ8ZV4YQvSRZyw,5794
 upgini/utils/display_utils.py,sha256=tiq5sFOfMwkKCjQ7OGdyK_twe0Qdr9F3mzkW1QXSDog,10664
 upgini/utils/email_utils.py,sha256=3CvHXTSzlgLyGsQOXfRYVfFhfPy6OXG4uXOBWRaLfHg,3479
 upgini/utils/fallback_progress_bar.py,sha256=cdbd1XGcWm4Ed4eAqV2_St3z7uC_kkH22gEyrN5ub6M,1090
@@ -54,8 +54,8 @@ upgini/utils/sklearn_ext.py,sha256=fvuTWJ5AnT3ED9KSaQu_yIgW2JR19hFlaGDoVP3k60g,4
 upgini/utils/target_utils.py,sha256=qyj-bGsIEl9X2Vc5gwXtsuRaocvId8bn46F7mZ9dy9A,1707
 upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
 upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
-upgini-1.1.244a13.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.1.244a13.dist-info/METADATA,sha256=8GXzJ6Sos2jMZZar9rcMgjyyIUPly2H0Yxqv4Gup9iw,48265
-upgini-1.1.244a13.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
-upgini-1.1.244a13.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
-upgini-1.1.244a13.dist-info/RECORD,,
+upgini-1.1.244a15.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.1.244a15.dist-info/METADATA,sha256=GslYuWCFvWkrO6G5g88d5yIzv5nqe4OdOqyVHHEKO0k,48265
+upgini-1.1.244a15.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
+upgini-1.1.244a15.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
+upgini-1.1.244a15.dist-info/RECORD,,