PyPI - upgini - Versions diffs - 1.2.12__py3-none-any.whl → 1.2.13__py3-none-any.whl - Mend

upgini 1.2.12__py3-none-any.whl → 1.2.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (10) hide show

upgini/__about__.py +1 -1
upgini/autofe/binary.py +4 -2
upgini/features_enricher.py +1 -0
upgini/metrics.py +1 -1
upgini/resource_bundle/strings.properties +1 -1
upgini/utils/deduplicate_utils.py +93 -88
{upgini-1.2.12.dist-info → upgini-1.2.13.dist-info}/METADATA +1 -1
{upgini-1.2.12.dist-info → upgini-1.2.13.dist-info}/RECORD +10 -10
{upgini-1.2.12.dist-info → upgini-1.2.13.dist-info}/WHEEL +0 -0
{upgini-1.2.12.dist-info → upgini-1.2.13.dist-info}/licenses/LICENSE +0 -0

upgini/__about__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "1.2.12"
1	+ __version__ = "1.2.13"

upgini/autofe/binary.py CHANGED Viewed

@@ -142,9 +142,9 @@ class Distance(PandasOperand):
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         return pd.Series(
             1 - self.__dot(left, right) / (self.__norm(left) * self.__norm(right)), index=left.index
-        )
+        ).astype(np.float64)
-    # row-wise dot product
+    # row-wise dot product, handling None values
     def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
         left = left.apply(lambda x: np.array(x))
         right = right.apply(lambda x: np.array(x))
@@ -152,7 +152,9 @@ class Distance(PandasOperand):
         res = res.reindex(left.index.union(right.index))
         return res
+    # Calculate the norm of a vector, handling None values
     def __norm(self, vector: pd.Series) -> pd.Series:
+        vector = vector.fillna(np.nan)
         return np.sqrt(self.__dot(vector, vector))

upgini/features_enricher.py CHANGED Viewed

@@ -3322,6 +3322,7 @@ class FeaturesEnricher(TransformerMixin):
         # index overrites from result_features
         original_index_name = df_with_original_index.index.name
         df_with_original_index = df_with_original_index.reset_index()
+        # TODO drop system_record_id before merge
         result_features = pd.merge(
             df_with_original_index,
             result_features,

upgini/metrics.py CHANGED Viewed

@@ -526,7 +526,7 @@ class CatBoostWrapper(EstimatorWrapper):
         emb_name = "__grouped_embeddings"
         df = df.copy()
         df[self.emb_features] = df[self.emb_features].fillna(0.0)
-        df[emb_name] = df[self.emb_features].values.tolist()
+        df[emb_name] = pd.Series(df[self.emb_features].values.tolist())
         df = df.drop(columns=self.emb_features)
         return df, [emb_name]

upgini/resource_bundle/strings.properties CHANGED Viewed

@@ -96,7 +96,7 @@ invalid_date_format=Failed to parse date in column `{}`. Try to pass explicit da
 unsupported_date_type=Unsupported type of date column `{}`. Convert to datetime please.
 invalid_postal_code=All values of POSTAL_CODE column `{}` are invalid
 invalid_country=All values of COUNTRY column `{}` are invalid
-invalid_ip=All values of IPv4 column `{}` are invalid
+invalid_ip=All values of IP column `{}` are invalid
     # X and y validation
 unsupported_x_type=Unsupported type of X: {}. Use pandas.DataFrame, pandas.Series or numpy.ndarray or list
 x_contains_dup_columns=X contains duplicate column names. Please rename or drop duplicates

upgini/utils/deduplicate_utils.py CHANGED Viewed

@@ -25,12 +25,11 @@ def remove_fintech_duplicates(
     silent=False,
     bundle: ResourceBundle = None,
 ) -> pd.DataFrame:
-    # Base checks
+    # Initial checks for target type and date column
     date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
     if define_task(df[TARGET], date_col is not None, silent=True) != ModelTaskType.BINARY:
         return df
-    date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
     if date_col is None:
         return df
@@ -47,97 +46,103 @@ def remove_fintech_duplicates(
     if len(personal_cols) == 0:
         return df
-    sub_df = df[personal_cols + [date_col, TARGET]]
-    # Fast check for duplicates by personal keys
-    if not sub_df[personal_cols].duplicated().any():
-        return df
-    grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
-    # counts of diff dates by set of personal keys
-    uniques = grouped_by_personal_cols[date_col].nunique()
-    total = len(uniques)
-    diff_dates = len(uniques[uniques > 1])
-    if diff_dates / total >= 0.6:
-        return df
-    # Additional checks
-    duplicates = sub_df.duplicated(personal_cols, keep=False)
-    duplicate_rows = sub_df[duplicates]
-    if len(duplicate_rows) == 0:
-        return df
-    # if there is no different target values in personal keys duplicate rows
-    nonunique_target_groups = grouped_by_personal_cols[TARGET].nunique() > 1
-    if nonunique_target_groups.sum() == 0:
-        return df
-    def has_diff_target_within_60_days(rows):
-        rows = rows.sort_values(by=date_col)
-        return len(rows[rows[TARGET].ne(rows[TARGET].shift()) & (rows[date_col].diff() < 60 * 24 * 60 * 60 * 1000)]) > 0
-    nonunique_target_rows = nonunique_target_groups[nonunique_target_groups].reset_index().drop(columns=TARGET)
-    sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)
-    sub_df = DateTimeSearchKeyConverter(date_col, date_format=date_format, logger=logger, bundle=bundle).convert(sub_df)
-    grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
-    rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
-    if len(rows_with_diff_target) > 0:
-        unique_keys_to_delete = rows_with_diff_target[personal_cols].drop_duplicates()
-        if EVAL_SET_INDEX not in df.columns:
-            rows_to_remove = pd.merge(df.reset_index(), unique_keys_to_delete, on=personal_cols)
-            rows_to_remove = rows_to_remove.set_index(df.index.name or "index")
-            perc = len(rows_to_remove) * 100 / len(df)
-            msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
-                perc, len(rows_to_remove), rows_to_remove.index.to_list()
-            )
-            if not silent:
-                print(msg)
-            if logger:
-                logger.warning(msg)
-            logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
-            df = df[~df.index.isin(rows_to_remove.index)]
-            logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
-        else:
-            # Indices in train and eval_set can be the same so we remove rows from them separately
-            train = df.query(f"{EVAL_SET_INDEX} == 0")
-            train_rows_to_remove = pd.merge(train.reset_index(), unique_keys_to_delete, on=personal_cols)
-            train_rows_to_remove = train_rows_to_remove.set_index(train.index.name or "index")
-            train_perc = len(train_rows_to_remove) * 100 / len(train)
-            msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
-                train_perc, len(train_rows_to_remove), train_rows_to_remove.index.to_list()
+    # Splitting into train and eval_set parts
+    if EVAL_SET_INDEX in df.columns:
+        train_df = df[df[EVAL_SET_INDEX] == 0]
+        eval_dfs = [df[df[EVAL_SET_INDEX] == idx] for idx in df[EVAL_SET_INDEX].unique() if idx != 0]
+    else:
+        train_df = df
+        eval_dfs = []
+    def process_df(segment_df: pd.DataFrame, eval_index=0) -> pd.DataFrame:
+        """Process a subset of the dataset to remove duplicates based on personal keys."""
+        # Fast check for duplicates based on personal keys
+        if not segment_df[personal_cols].duplicated().any():
+            return segment_df
+        sub_df = segment_df[personal_cols + [date_col, TARGET]].copy()
+        # Group by personal columns to check for unique dates
+        grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
+        # Checking for different dates by the same personal keys
+        uniques = grouped_by_personal_cols[date_col].nunique()
+        total = len(uniques)
+        diff_dates = len(uniques[uniques > 1])
+        if diff_dates / total >= 0.6:
+            return segment_df
+        # Check for duplicate rows
+        duplicates = sub_df.duplicated(personal_cols, keep=False)
+        duplicate_rows = sub_df[duplicates]
+        if len(duplicate_rows) == 0:
+            return segment_df
+        # Check if there are different target values for the same personal keys
+        nonunique_target_groups = grouped_by_personal_cols[TARGET].nunique() > 1
+        if nonunique_target_groups.sum() == 0:
+            return segment_df
+        # Helper function to check if there are different target values within 60 days
+        def has_diff_target_within_60_days(rows: pd.DataFrame):
+            rows = rows.sort_values(by=date_col)
+            return (
+                len(rows[rows[TARGET].ne(rows[TARGET].shift()) & (rows[date_col].diff() < 60 * 24 * 60 * 60 * 1000)])
+                > 0
             )
+        # Filter rows with different target values within 60 days
+        nonunique_target_rows = nonunique_target_groups[nonunique_target_groups].reset_index().drop(columns=TARGET)
+        sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)
+        # Convert date columns for further checks
+        sub_df = DateTimeSearchKeyConverter(date_col, date_format=date_format, logger=logger, bundle=bundle).convert(
+            sub_df
+        )
+        grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
+        rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
+        if len(rows_with_diff_target) > 0:
+            unique_keys_to_delete = rows_with_diff_target[personal_cols].drop_duplicates()
+            rows_to_remove = pd.merge(segment_df.reset_index(), unique_keys_to_delete, on=personal_cols)
+            rows_to_remove = rows_to_remove.set_index(segment_df.index.name or "index")
+            perc = len(rows_to_remove) * 100 / len(segment_df)
+            if eval_index == 0:
+                msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
+                    perc, len(rows_to_remove), rows_to_remove.index.to_list()
+                )
+            else:
+                msg = bundle.get("dataset_eval_diff_target_duplicates_fintech").format(
+                    perc, len(rows_to_remove), eval_index, rows_to_remove.index.to_list()
+                )
             if not silent:
                 print(msg)
             if logger:
                 logger.warning(msg)
-            logger.info(f"Train dataset shape before clean fintech duplicates: {train.shape}")
-            train = train[~train.index.isin(train_rows_to_remove.index)]
-            logger.info(f"Train dataset shape after clean fintech duplicates: {train.shape}")
-            evals = [df.query(f"{EVAL_SET_INDEX} == {i}") for i in df[EVAL_SET_INDEX].unique() if i != 0]
-            new_evals = []
-            for i, eval in enumerate(evals):
-                eval_rows_to_remove = pd.merge(eval.reset_index(), unique_keys_to_delete, on=personal_cols)
-                eval_rows_to_remove = eval_rows_to_remove.set_index(eval.index.name or "index")
-                eval_perc = len(eval_rows_to_remove) * 100 / len(eval)
-                msg = bundle.get("dataset_eval_diff_target_duplicates_fintech").format(
-                    eval_perc, len(eval_rows_to_remove), i + 1, eval_rows_to_remove.index.to_list()
-                )
-                if not silent:
-                    print(msg)
-                if logger:
-                    logger.warning(msg)
-                logger.info(f"Eval {i + 1} dataset shape before clean fintech duplicates: {eval.shape}")
-                eval = eval[~eval.index.isin(eval_rows_to_remove.index)]
-                logger.info(f"Eval {i + 1} dataset shape after clean fintech duplicates: {eval.shape}")
-                new_evals.append(eval)
-            logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
-            df = pd.concat([train] + new_evals)
-            logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
+            return segment_df[~segment_df.index.isin(rows_to_remove.index)]
+        return segment_df
+    # Process the train part separately
+    logger.info(f"Train dataset shape before clean fintech duplicates: {train_df.shape}")
+    train_df = process_df(train_df)
+    logger.info(f"Train dataset shape after clean fintech duplicates: {train_df.shape}")
+    # Process each eval_set part separately
+    new_eval_dfs = []
+    for i, eval_df in enumerate(eval_dfs, 1):
+        logger.info(f"Eval {i} dataset shape before clean fintech duplicates: {eval_df.shape}")
+        cleaned_eval_df = process_df(eval_df, i)
+        logger.info(f"Eval {i} dataset shape after clean fintech duplicates: {cleaned_eval_df.shape}")
+        new_eval_dfs.append(cleaned_eval_df)
+    # Combine the processed train and eval parts back into one dataset
+    logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
+    if new_eval_dfs:
+        df = pd.concat([train_df] + new_eval_dfs)
+    else:
+        df = train_df
+    logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
     return df

{upgini-1.2.12.dist-info → upgini-1.2.13.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.12
+Version: 1.2.13
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/

{upgini-1.2.12.dist-info → upgini-1.2.13.dist-info}/RECORD RENAMED Viewed

@@ -1,13 +1,13 @@
-upgini/__about__.py,sha256=dbW85A2PinQCZabwD2DNDTfOE9315GDtQQKAsJP8IXk,23
+upgini/__about__.py,sha256=rQSlPcfj4yT4krIq6epTVQyBzIX4etVOgfupVkM-RnU,23
 upgini/__init__.py,sha256=M64LwQTBa-5Jz24Zm2h8rWwlKQQ1J8nP7gGgIciS0WU,589
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=olZ-OHSfBNoBSCo7R5t7uCLukI2nO7afpx_A-HCiJLk,31067
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=eRkI2qpV-IprB1dQAMxzto6I6Q3b3SBuDMVR1_OFlyA,188008
+upgini/features_enricher.py,sha256=HJJZbZScVrl6ugDBQE71m7om5-ahvMyEnAqZNw-OEJ0,188058
 upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
 upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
 upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
-upgini/metrics.py,sha256=aKJwAYUGNRdiz9z-bxDxs4jGZQ_VkPXa7sZ52C0VpVI,31243
+upgini/metrics.py,sha256=bgi1rc3vCCeCuwRX1doQSQCzaV5OEiYHv_6XIvapnaw,31254
 upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=ddSKUK_-eGJB3NgrqOMoWJU-OxQ253WsNLp8aqJkaIM,1389
@@ -15,7 +15,7 @@ upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9Jvf
 upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
 upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/autofe/all_operands.py,sha256=cCCB44qvkmuWyiRM5Xykx8tkHPIjQthrWyj67STWN80,2578
-upgini/autofe/binary.py,sha256=TRjEdxsfyPY5E8ksYfdKMmU6GtvALfGFPNVIG7DBhzM,7520
+upgini/autofe/binary.py,sha256=zMhtHVuGUAFLUqem-XiXqJj-GRXxS88tdz8tFuDfSNM,7659
 upgini/autofe/date.py,sha256=OpFc3Al0xO3qlESn2Uokfxw51ArVqmh3xngWwdrsaqE,9762
 upgini/autofe/feature.py,sha256=eL7wABUhDKZzv3E-RPJNcyGwSfB0UptcfU2RbvsOks4,15082
 upgini/autofe/groupby.py,sha256=r-xl_keZZgm_tpiEoDhjYSkT6NHv7a4cRQR4wJ4uCp8,3263
@@ -30,7 +30,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
 upgini/normalizer/normalize_utils.py,sha256=bHRPWCNrUvt2R9qMX6dZFCJ0i8ENVCQ2Rw3dHH9IJEg,7447
 upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
 upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
-upgini/resource_bundle/strings.properties,sha256=hWldMqtv80lwv8HV00Hk2-3tflu4BkD6tiXOfGDZPl8,26458
+upgini/resource_bundle/strings.properties,sha256=9kvmcUrsSFUCrzOiN0Ozf-lQ2H8Igz5gATUPoHMOaU4,26456
 upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
 upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -43,7 +43,7 @@ upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk
 upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
 upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
 upgini/utils/datetime_utils.py,sha256=4tsGeehU0KS6wqNsc9gEEWZ9s6T9E0UReUIO3rSuXNU,12174
-upgini/utils/deduplicate_utils.py,sha256=Zvs7zW4QzaERQmJNPrTVf2ZTVBkBLOycFCzyMwtXuV8,8770
+upgini/utils/deduplicate_utils.py,sha256=NpaPtBYXwUtfKTRHWrtz2uUq6tZN6C_Nd719ydPRF2Q,8484
 upgini/utils/display_utils.py,sha256=A2ouB5eiZ-Kyt9ykYxkLQwyoRPrdYeJymwNTiajtFXs,10990
 upgini/utils/email_utils.py,sha256=j0Ug1R_0AnCg1Y92zIZ4XMwvKo3G5_pcOlBN1OH_gZs,5191
 upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
 upgini/utils/target_utils.py,sha256=BVtDmrmFMKerSUWaNOIEdzsYHIFiODdpnWbE50QDPDc,7864
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
-upgini-1.2.12.dist-info/METADATA,sha256=k_J1xVbmpvm56wJ_hDo17cEK6rXRhhqJp3rSbw233xA,48577
-upgini-1.2.12.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-upgini-1.2.12.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.12.dist-info/RECORD,,
+upgini-1.2.13.dist-info/METADATA,sha256=IRJWMi0M4nUgCqMwp4kffx8QXgR1DJ2VsqH5Y7-nQ2E,48577
+upgini-1.2.13.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.13.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.13.dist-info/RECORD,,

{upgini-1.2.12.dist-info → upgini-1.2.13.dist-info}/WHEEL RENAMED Viewed

File without changes

{upgini-1.2.12.dist-info → upgini-1.2.13.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

upgini 1.2.12__py3-none-any.whl → 1.2.13__py3-none-any.whl

Potentially problematic release.

upgini 1.2.12__py3-none-any.whl → 1.2.13__py3-none-any.whl