PyPI - upgini - Versions diffs - 1.1.262a3250.post3__py3-none-any.whl → 1.1.274a4__py3-none-any.whl - Mend

upgini 1.1.262a3250.post3py3-none-any.whl → 1.1.274a4py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

upgini/autofe/all_operands.py +12 -2
upgini/autofe/date.py +68 -8
upgini/autofe/feature.py +1 -1
upgini/data_source/data_source_publisher.py +24 -5
upgini/dataset.py +21 -58
upgini/features_enricher.py +114 -40
upgini/fingerprint.js +8 -0
upgini/metrics.py +58 -7
upgini/normalizer/phone_normalizer.py +2 -2
upgini/resource_bundle/strings.properties +8 -3
upgini/search_task.py +1 -1
upgini/utils/datetime_utils.py +53 -2
upgini/utils/deduplicate_utils.py +61 -18
upgini/utils/sklearn_ext.py +1 -2
upgini/utils/target_utils.py +125 -2
{upgini-1.1.262a3250.post3.dist-info → upgini-1.1.274a4.dist-info}/METADATA +2 -2
{upgini-1.1.262a3250.post3.dist-info → upgini-1.1.274a4.dist-info}/RECORD +20 -19
{upgini-1.1.262a3250.post3.dist-info → upgini-1.1.274a4.dist-info}/LICENSE +0 -0
{upgini-1.1.262a3250.post3.dist-info → upgini-1.1.274a4.dist-info}/WHEEL +0 -0
{upgini-1.1.262a3250.post3.dist-info → upgini-1.1.274a4.dist-info}/top_level.txt +0 -0

upgini/resource_bundle/strings.properties CHANGED Viewed

@@ -38,6 +38,7 @@ loss_selection_warn=\nWARNING: Loss `{0}` is not supported for feature selection
 loss_calc_metrics_warn=\nWARNING: Loss `{0}` is not supported for metrics calculation with {1}
 multivariate_timeseries_detected=\nWARNING: Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
 group_k_fold_in_classification=\nWARNING: Using group K-fold cross-validation split for classification task.
+current_date_added=\nWARNING: No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
 # Errors
 failed_search_by_task_id=Failed to retrieve the specified search results
@@ -111,6 +112,9 @@ x_is_empty=X is empty
 y_is_empty=y is empty
 x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
 missing_generate_feature=\nWARNING: Feature {} specified in `generate_features` is not present in input columns: {}
+x_unstable_by_date=\nWARNING: Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
+train_unstable_target=\nWARNING: Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
+eval_unstable_target=\nWARNING: Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
     # eval set validation
 unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
 eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
@@ -145,7 +149,8 @@ dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample
 dataset_empty_column_names=Some column names are empty. Add names please
 dataset_full_duplicates=\nWARNING: {:.5f}% of the rows are fully duplicated
 dataset_diff_target_duplicates=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
-dataset_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
+dataset_train_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
+dataset_eval_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
 dataset_drop_old_dates=\nWARNING: We don't have data before '2000-01-01' and removed all earlier records from the search dataset
 dataset_all_dates_old=There is empty train dataset after removing data before '2000-01-01'
 dataset_invalid_target_type=Unexpected dtype of target for binary task type: {}. Expected int or bool
@@ -196,10 +201,10 @@ email_detected=Emails detected in column `{}`. It will be used as a search key\n
 email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
 phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
 phone_detected_not_registered=\nWARNING: Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
-target_type_detected=Detected task type: {}\n
+target_type_detected=\nDetected task type: {}\n
 # all_ok_community_invite=Chat with us in Slack community:
 all_ok_community_invite=❓ Support request
-too_small_for_metrics=Your train dataset contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
+too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
 imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
 loss_selection_info=Using loss `{}` for feature selection
 loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator

upgini/search_task.py CHANGED Viewed

@@ -57,7 +57,7 @@ class SearchTask:
         if logger is not None:
             self.logger = logger
         else:
-            self.logger = logging.getLogger()
+            self.logger = logging.getLogger("muted_logger")
             self.logger.setLevel("FATAL")
         self.provider_metadata_v2: Optional[List[ProviderTaskMetadataV2]] = None
         self.unused_features_for_generation: Optional[List[str]] = None

upgini/utils/datetime_utils.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import datetime
 import logging
 import re
-from typing import List, Optional
+from typing import Dict, List, Optional
 import numpy as np
 import pandas as pd
@@ -9,7 +9,9 @@ from dateutil.relativedelta import relativedelta
 from pandas.api.types import is_numeric_dtype, is_period_dtype, is_string_dtype
 from upgini.errors import ValidationError
+from upgini.metadata import SearchKey
 from upgini.resource_bundle import ResourceBundle, get_custom_bundle
+from upgini.utils.warning_counter import WarningCounter
 DATE_FORMATS = [
     "%Y-%m-%d",
@@ -44,7 +46,7 @@ class DateTimeSearchKeyConverter:
         if logger is not None:
             self.logger = logger
         else:
-            self.logger = logging.getLogger()
+            self.logger = logging.getLogger("muted_logger")
             self.logger.setLevel("FATAL")
         self.generated_features: List[str] = []
         self.bundle = bundle or get_custom_bundle()
@@ -98,6 +100,9 @@ class DateTimeSearchKeyConverter:
                 msg = self.bundle.get("unsupported_date_type").format(self.date_column)
                 self.logger.warning(msg)
                 raise ValidationError(msg)
+        else:
+            df[self.date_column] = df[self.date_column].astype("string").apply(self.clean_date)
+            df[self.date_column] = self.parse_date(df)
         # If column with date is datetime then extract seconds of the day and minute of the hour
         # as additional features
@@ -225,3 +230,49 @@ def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[st
     is_diff_less_than_two_columns = grouped.apply(check_differences)
     return is_diff_less_than_two_columns.all()
def validate_dates_distribution(
    X: pd.DataFrame,
    search_keys: Dict[str, SearchKey],
    logger: Optional[logging.Logger] = None,
    bundle: Optional[ResourceBundle] = None,
    warning_counter: Optional[WarningCounter] = None,
):
    """Warn when the number of rows per date is unstable across the date range.

    The date column is the (last) entry of ``search_keys`` typed as
    ``SearchKey.DATE`` or ``SearchKey.DATETIME``. When there is none, the first
    non-key, non-numeric column of ``X`` that ``pd.to_datetime`` can parse is
    used instead. Rows are counted per calendar date, the dates are split into
    an earlier and a later half, and the ``x_unstable_by_date`` message is
    printed and logged when the ratio of the halves' mean counts is outside
    [0.8, 1.2].

    Args:
        X: Training features.
        search_keys: Mapping of column name to search key type.
        logger: Logger for the warning; a muted logger is used when omitted.
        bundle: Message bundle; the custom bundle is used when omitted.
        warning_counter: Incremented once when the warning is issued.

    Returns:
        None. The function only emits a warning.
    """
    maybe_date_col = None
    for key, key_type in search_keys.items():
        if key_type in [SearchKey.DATE, SearchKey.DATETIME]:
            maybe_date_col = key

    parsed = None
    if maybe_date_col is None:
        for col in X.columns:
            if col in search_keys:
                continue
            # pd.to_datetime happily converts any numeric column (as epoch
            # offsets), so a plain feature column would be mistaken for a date
            # and would hide a real date column further to the right.
            if is_numeric_dtype(X[col]):
                continue
            try:
                parsed = pd.to_datetime(X[col])
            except Exception:
                # Best-effort detection: an unparseable column is simply not a date.
                continue
            maybe_date_col = col
            break

    if maybe_date_col is None:
        return

    if parsed is None:
        parsed = pd.to_datetime(X[maybe_date_col])
    dates = parsed.dt.date

    date_counts = dates.value_counts().sort_index()
    # With fewer than two distinct dates there is nothing to compare
    # (one of the halves would be empty and the ratio would be NaN).
    if len(date_counts) < 2:
        return

    half = round(len(date_counts) / 2)
    date_counts_1 = date_counts.iloc[:half]
    date_counts_2 = date_counts.iloc[half:]
    ratio = date_counts_2.mean() / date_counts_1.mean()

    if ratio > 1.2 or ratio < 0.8:
        if warning_counter is not None:
            warning_counter.increment()
        if logger is None:
            logger = logging.getLogger("muted_logger")
            logger.setLevel("FATAL")
        bundle = bundle or get_custom_bundle()
        msg = bundle.get("x_unstable_by_date")
        print(msg)
        logger.warning(msg)

upgini/utils/deduplicate_utils.py CHANGED Viewed

@@ -3,7 +3,7 @@ from typing import Dict, List, Optional, Union
 import pandas as pd
-from upgini.metadata import SORT_ID, SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
+from upgini.metadata import EVAL_SET_INDEX, SORT_ID, SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
 from upgini.resource_bundle import ResourceBundle
 from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
 from upgini.utils.target_utils import define_task
@@ -78,20 +78,58 @@ def remove_fintech_duplicates(
     rows_with_diff_target = grouped_by_personal_cols.filter(has_diff_target_within_60_days)
     if len(rows_with_diff_target) > 0:
         unique_keys_to_delete = rows_with_diff_target[personal_cols].drop_duplicates()
-        rows_to_remove = pd.merge(df.reset_index(), unique_keys_to_delete, on=personal_cols)
-        rows_to_remove = rows_to_remove.set_index(df.index.name or "index")
-        perc = len(rows_to_remove) * 100 / len(df)
-        msg = bundle.get("dataset_diff_target_duplicates_fintech").format(
-            perc, len(rows_to_remove), rows_to_remove.index.to_list()
-        )
-        if not silent:
-            print(msg)
-        if logger:
-            logger.warning(msg)
-        logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
-        df = df[~df.index.isin(rows_to_remove.index)]
-        logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
+        if EVAL_SET_INDEX not in df.columns:
+            rows_to_remove = pd.merge(df.reset_index(), unique_keys_to_delete, on=personal_cols)
+            rows_to_remove = rows_to_remove.set_index(df.index.name or "index")
+            perc = len(rows_to_remove) * 100 / len(df)
+            msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
+                perc, len(rows_to_remove), rows_to_remove.index.to_list()
+            )
+            if not silent:
+                print(msg)
+            if logger:
+                logger.warning(msg)
+            logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
+            df = df[~df.index.isin(rows_to_remove.index)]
+            logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
+        else:
+            # Indices in train and eval_set can be the same so we remove rows from them separately
+            train = df.query(f"{EVAL_SET_INDEX} == 0")
+            train_rows_to_remove = pd.merge(train.reset_index(), unique_keys_to_delete, on=personal_cols)
+            train_rows_to_remove = train_rows_to_remove.set_index(train.index.name or "index")
+            train_perc = len(train_rows_to_remove) * 100 / len(train)
+            msg = bundle.get("dataset_train_diff_target_duplicates_fintech").format(
+                train_perc, len(train_rows_to_remove), train_rows_to_remove.index.to_list()
+            )
+            if not silent:
+                print(msg)
+            if logger:
+                logger.warning(msg)
+            logger.info(f"Train dataset shape before clean fintech duplicates: {train.shape}")
+            train = train[~train.index.isin(train_rows_to_remove.index)]
+            logger.info(f"Train dataset shape after clean fintech duplicates: {train.shape}")
+            evals = [df.query(f"{EVAL_SET_INDEX} == {i}") for i in df[EVAL_SET_INDEX].unique() if i != 0]
+            new_evals = []
+            for i, eval in enumerate(evals):
+                eval_rows_to_remove = pd.merge(eval.reset_index(), unique_keys_to_delete, on=personal_cols)
+                eval_rows_to_remove = eval_rows_to_remove.set_index(eval.index.name or "index")
+                eval_perc = len(eval_rows_to_remove) * 100 / len(eval)
+                msg = bundle.get("dataset_eval_diff_target_duplicates_fintech").format(
+                    eval_perc, len(eval_rows_to_remove), i + 1, eval_rows_to_remove.index.to_list()
+                )
+                if not silent:
+                    print(msg)
+                if logger:
+                    logger.warning(msg)
+                logger.info(f"Eval {i + 1} dataset shape before clean fintech duplicates: {eval.shape}")
+                eval = eval[~eval.index.isin(eval_rows_to_remove.index)]
+                logger.info(f"Eval {i + 1} dataset shape after clean fintech duplicates: {eval.shape}")
+                new_evals.append(eval)
+            logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
+            df = pd.concat([train] + new_evals)
+            logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
     return df
@@ -101,14 +139,18 @@ def clean_full_duplicates(
     nrows = len(df)
     if nrows == 0:
         return df
-    # Remove absolute duplicates (exclude system_record_id)
+    # Remove full duplicates (exclude system_record_id, sort_id and eval_set_index)
     unique_columns = df.columns.tolist()
     if SYSTEM_RECORD_ID in unique_columns:
         unique_columns.remove(SYSTEM_RECORD_ID)
     if SORT_ID in unique_columns:
         unique_columns.remove(SORT_ID)
+    if EVAL_SET_INDEX in unique_columns:
+        unique_columns.remove(EVAL_SET_INDEX)
     logger.info(f"Dataset shape before clean duplicates: {df.shape}")
-    df = df.drop_duplicates(subset=unique_columns)
+    # Train segment goes first so if duplicates are found in train and eval set
+    # then we keep unique rows in train segment
+    df = df.drop_duplicates(subset=unique_columns, keep="first")
     logger.info(f"Dataset shape after clean duplicates: {df.shape}")
     nrows_after_full_dedup = len(df)
     share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
@@ -123,7 +165,7 @@ def clean_full_duplicates(
         marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
         if marked_duplicates.sum() > 0:
             dups_indices = df[marked_duplicates].index.to_list()
-            nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns))
+            nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
             num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
             share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
@@ -133,6 +175,7 @@ def clean_full_duplicates(
                 print(msg)
             df = df.drop_duplicates(subset=unique_columns, keep=False)
             logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")
     return df

upgini/utils/sklearn_ext.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import functools
-import logging
 import numbers
 import time
 import warnings
@@ -313,7 +312,7 @@ def cross_validate(
         return ret
     except Exception:
-        logging.exception("Failed to execute overriden cross_validate. Fallback to original")
+        # logging.exception("Failed to execute overriden cross_validate. Fallback to original")
         raise
         # fit_params["use_best_model"] = False
         # return original_cross_validate(

upgini/utils/target_utils.py CHANGED Viewed

@@ -6,8 +6,10 @@ import pandas as pd
 from pandas.api.types import is_numeric_dtype
 from upgini.errors import ValidationError
-from upgini.metadata import ModelTaskType
-from upgini.resource_bundle import bundle
+from upgini.metadata import SYSTEM_RECORD_ID, ModelTaskType
+from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
+from upgini.sampler.random_under_sampler import RandomUnderSampler
+from upgini.utils.warning_counter import WarningCounter
 def correct_string_target(y: Union[pd.Series, np.ndarray]) -> Union[pd.Series, np.ndarray]:
@@ -72,3 +74,124 @@ def is_int_encoding(unique_values):
     return set(unique_values) == set(range(len(unique_values))) or set(unique_values) == set(
         range(1, len(unique_values) + 1)
     )
def balance_undersample(
    df: pd.DataFrame,
    target_column: str,
    task_type: ModelTaskType,
    random_state: int,
    imbalance_threshold: float = 0.2,
    min_sample_threshold: int = 5000,
    binary_bootstrap_loops: int = 5,
    multiclass_bootstrap_loops: int = 2,
    logger: Optional[logging.Logger] = None,
    bundle: Optional[ResourceBundle] = None,
    warning_counter: Optional[WarningCounter] = None,
) -> pd.DataFrame:
    """Undersample over-represented target classes of ``df``.

    Three strategies are tried in order:

    * multiclass task: classes larger than ``multiclass_bootstrap_loops`` times
      the 25%-quantile class are cut down to that size;
    * more than ``min_sample_threshold`` rows and a rarest class smaller than
      half of ``min_sample_threshold``: the rarest class is kept and topped up
      to ``min_sample_threshold`` rows with a random sample of the other rows;
    * largest class more than ``binary_bootstrap_loops`` times the rarest one:
      the largest class is cut down to that multiple.

    Each applied strategy prints and logs a warning and increments
    ``warning_counter``. When none applies, ``df`` is returned as passed in.

    Args:
        df: Dataset containing ``target_column`` and the system record id column.
        target_column: Name of the target column.
        task_type: Task type; only ``ModelTaskType.MULTICLASS`` is special-cased.
        random_state: Seed for the samplers.
        imbalance_threshold: Share used (divided by the number of classes) to
            compute the rarest-class threshold reported in the warning message.
        min_sample_threshold: Row count used by the second strategy.
        binary_bootstrap_loops: Allowed largest/rarest class size ratio.
        multiclass_bootstrap_loops: Allowed ratio to the 25%-quantile class.
        logger: Logger to use; a muted logger is used when omitted.
        bundle: Message bundle; the custom bundle is used when omitted.
        warning_counter: Optional counter incremented per issued warning.

    Returns:
        The resampled rows of ``df`` (sorted by system record id), or ``df``
        itself when no resampling was needed.

    Raises:
        Exception: if the system record id column is missing.
    """
    if logger is None:
        logger = logging.getLogger("muted_logger")
        logger.setLevel("FATAL")
    bundle = bundle or get_custom_bundle()
    if SYSTEM_RECORD_ID not in df.columns:
        raise Exception("System record id must be presented for undersampling")

    count = len(df)
    target = df[target_column].copy()
    target_classes_count = target.nunique()

    # value_counts orders classes from most to least frequent
    vc = target.value_counts()
    max_class_value = vc.index[0]
    min_class_value = vc.index[len(vc) - 1]
    max_class_count = vc[max_class_value]
    min_class_count = vc[min_class_value]

    min_class_percent = imbalance_threshold / target_classes_count
    min_class_threshold = min_class_percent * count

    resampled_data = df
    df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
    # Labels must be taken from the sorted frame: the sampler is handed the ids
    # and the labels as two separate objects, and presumably pairs them by
    # position (TODO confirm against RandomUnderSampler) - labels captured
    # before sorting would be attached to the wrong ids for unsorted input.
    sorted_target = df[target_column]

    def warn(msg: str) -> None:
        """Log and print the warning and count it."""
        logger.warning(msg)
        print(msg)
        if warning_counter:
            warning_counter.increment()

    def undersample(sampling_strategy: dict) -> pd.DataFrame:
        """Run the sampler over record ids only and select the surviving rows."""
        sampler = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=random_state)
        ids = df[SYSTEM_RECORD_ID].to_frame(SYSTEM_RECORD_ID)
        kept_ids, _ = sampler.fit_resample(ids, sorted_target)  # type: ignore
        return df[df[SYSTEM_RECORD_ID].isin(kept_ids[SYSTEM_RECORD_ID])]

    if task_type == ModelTaskType.MULTICLASS:
        # Classes are sorted by rows count; find the 25% quantile class
        classes = vc.index
        quantile25_idx = int(0.75 * len(classes)) - 1
        quantile25_class = classes[quantile25_idx]
        quantile25_class_cnt = vc[quantile25_class]

        if max_class_count > (quantile25_class_cnt * multiclass_bootstrap_loops):
            warn(bundle.get("imbalance_multiclass").format(quantile25_class, quantile25_class_cnt))

            # 25% and lower classes will stay as is. Higher classes will be downsampled
            class_cap = quantile25_class_cnt * multiclass_bootstrap_loops
            sample_strategy = {
                classes[class_idx]: min(vc[classes[class_idx]], class_cap)
                for class_idx in range(quantile25_idx)
            }
            resampled_data = undersample(sample_strategy)
    elif len(df) > min_sample_threshold and min_class_count < min_sample_threshold / 2:
        warn(
            bundle.get("dataset_rarest_class_less_threshold").format(
                min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
            )
        )

        # fill up to min_sample_threshold by majority class
        minority_class = df[df[target_column] == min_class_value]
        majority_class = df[df[target_column] != min_class_value]
        sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
        sampled_majority_class = majority_class.sample(n=sample_size, random_state=random_state)
        resampled_data = df[
            (df[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
            | (df[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
        ]
    elif max_class_count > min_class_count * binary_bootstrap_loops:
        warn(
            bundle.get("dataset_rarest_class_less_threshold").format(
                min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
            )
        )
        resampled_data = undersample({max_class_value: binary_bootstrap_loops * min_class_count})

    # Log only the shape: interpolating the frame itself dumps its data into the log
    logger.info(f"Shape after rebalance resampling: {resampled_data.shape}")
    return resampled_data
def calculate_psi(expected: pd.Series, actual: pd.Series, epsilon: float = 1e-6) -> float:
    """Return the Population Stability Index between two target samples.

    Both samples are bucketed into the same two bins, split at the midpoint
    between the minimum and maximum of their combined values, and the PSI is
    ``sum((p_expected - p_actual) * ln(p_expected / p_actual))`` over the bins.

    Args:
        expected: Reference (e.g. train) target values.
        actual: Target values to compare against the reference.
        epsilon: Proportion substituted for an empty bin so that the logarithm
            and the division stay finite.

    Returns:
        The PSI as a float; 0.0 when all combined values are equal.
    """
    combined = pd.concat([expected, actual])

    # Define the bins for the target variable
    lowest = combined.min()
    highest = combined.max()
    if lowest == highest:
        # A constant target cannot be binned (the edges would not increase),
        # and both samples trivially share the same distribution.
        return 0.0
    bins = [lowest, (lowest + highest) / 2, highest]

    # Calculate the base distribution
    train_distribution = expected.value_counts(bins=bins, normalize=True).sort_index().to_numpy(dtype=float)

    # Calculate the target distribution
    test_distribution = actual.value_counts(bins=bins, normalize=True).sort_index().to_numpy(dtype=float)

    # An empty bin would give log(0) or a division by zero and turn the
    # result into inf/nan, so only the zero proportions are replaced.
    train_distribution = np.where(train_distribution == 0, epsilon, train_distribution)
    test_distribution = np.where(test_distribution == 0, epsilon, test_distribution)

    # Calculate the PSI
    return float(np.sum((train_distribution - test_distribution) * np.log(train_distribution / test_distribution)))

{upgini-1.1.262a3250.post3.dist-info → upgini-1.1.274a4.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: upgini
-Version: 1.1.262a3250.post3
+Version: 1.1.274a4
 Summary: Intelligent data search & enrichment for Machine Learning
 Home-page: https://upgini.com/
 Author: Upgini Developers
@@ -28,7 +28,7 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: python-dateutil >=2.8.0
 Requires-Dist: requests >=2.8.0
-Requires-Dist: pandas <2.0.0,>=1.1.0
+Requires-Dist: pandas <2.1.0,>=1.1.0
 Requires-Dist: numpy >=1.19.0
 Requires-Dist: scikit-learn >=1.3.0
 Requires-Dist: pydantic <2.0.0,>=1.8.2

{upgini-1.1.262a3250.post3.dist-info → upgini-1.1.274a4.dist-info}/RECORD RENAMED Viewed

@@ -1,34 +1,35 @@
 upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
 upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
-upgini/dataset.py,sha256=ywBwf93d0IH39ZGfmNDlAwe1ILQtt1WzJ87WfIOMI2g,48149
+upgini/dataset.py,sha256=xb4gIANyGbdcuM8Awyq2pJPiH_3k_LEbETApJgAoRBA,45529
 upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
-upgini/features_enricher.py,sha256=fFSLW6aAzVq5YYaVcl-xbjSd3qYt8dW9hYAIestylSk,172118
+upgini/features_enricher.py,sha256=WDj4DO5lqANBdihEcRmwox4w1kqWVOorlIKY4dbsqrU,175376
+upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
 upgini/http.py,sha256=zaO86LBBLmkieGbgYifk29eVoPCxXimZQ8YkQtKcM0I,42244
 upgini/metadata.py,sha256=fwVxtkR6Mn4iRoOqV6BfMJvJrx65I3YwZUMbZjhPyOI,9673
-upgini/metrics.py,sha256=3VvSZW1cCOIPHImXuqcnWzD3fWcpPzVa9k8eulLbUmY,27426
-upgini/search_task.py,sha256=5n4qGJmtu48s0-FHAtF3L5qVLMd1JVW3FJlM8dFbh-s,17063
+upgini/metrics.py,sha256=U3VJKbKmuWACqI4jTcszXo0WqeXFtV8bWyY9VLBL-rw,29129
+upgini/search_task.py,sha256=tmJ17WUxv3J5NWrYUJB_NKdZ792Ifz8Z8UnDXeQnpss,17077
 upgini/spinner.py,sha256=Dm1dQ5F_z_Ua2odLxZX7OypcOX9tSx_vE5MGaKtUmfw,1118
 upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
 upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
 upgini/ads_management/ads_manager.py,sha256=fP4Yqx3h2Snw5X335TbXEwFoupq1RYsE7y0PAduvetU,2646
 upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-upgini/autofe/all_operands.py,sha256=KWAdcYv6cToc6NZPcCmz6P3N8Nwjp8UqojKuz-f2BZY,1589
+upgini/autofe/all_operands.py,sha256=H66wqVLD-H9k8A4-q2wslhV9QaNxlb49f8YiT0Xfkps,2356
 upgini/autofe/binary.py,sha256=f8LQqZi9zyaMUAv-jASMmWNA_vT05ncYCjZq0qx3USs,3972
-upgini/autofe/date.py,sha256=_VqhFMkzItbWxQjNgxqerx0sWbcV9yxq0q5kI33LvHk,1807
-upgini/autofe/feature.py,sha256=y5UMU8_cSrP9-3xmrmVlGXwIX2_bwTmzgQy4ShwEjMk,11812
+upgini/autofe/date.py,sha256=_6RoEJZ5Kf-Q_aMOFucS6YSIZpCcelgpw-edV4qmRIM,3935
+upgini/autofe/feature.py,sha256=2FQRGtIumNz60hFAjfLReaY18SI7HxzYZOoC5avzSjQ,11847
 upgini/autofe/groupby.py,sha256=iXRfOmOc84ooSzRhsh9GmmG7rTafX0-ekXko8s9Qs68,3089
 upgini/autofe/operand.py,sha256=dhtToPDGWtP_0u_RjayUpezJJZAgq_TzNbPH0bI9OXI,2805
 upgini/autofe/unary.py,sha256=YRTzQLttbDdOnkogWBPnBexpu7uHWSLSFAxSCu3iFdY,3145
 upgini/autofe/vector.py,sha256=5qhI_bdwaWM1l7fgCkx1tMt9R9gxWzoYCl-7WO4KiOs,604
 upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-upgini/data_source/data_source_publisher.py,sha256=yCMyYwFTfv0e7h-kAdtiQCF42J1DbqmJ1Wi0xt_ZzeM,15578
+upgini/data_source/data_source_publisher.py,sha256=taRzyGgrPrTTSGw4Y-Ca5k4bf30aiTa68rxqT9zfqeI,16478
 upgini/mdc/__init__.py,sha256=ETDh3JKbrDdPMOECiYLAa8lvKYe68mv4IY6fZa9FimA,1126
 upgini/mdc/context.py,sha256=Sl1S_InKlzzRxYqwJ2k24lawJdCKWgGJ-RIRfvzWJrk,1468
 upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-upgini/normalizer/phone_normalizer.py,sha256=lhwsPEnfyjeIsndW2EcQGZksXYsfxaQ1ghAzVYoDRKM,9927
+upgini/normalizer/phone_normalizer.py,sha256=_SYMX4GTgwzRXArK54Jp3vUBE5d4jZxSVyze-0tqzg0,9996
 upgini/resource_bundle/__init__.py,sha256=hdvbqL0b0xMWbY6-kiYGsW1ro2GMiWpxxsO9uCv-h9Q,8379
 upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
-upgini/resource_bundle/strings.properties,sha256=MGU_oBc15VAmbPZdThCpm3B4xERAKwbCIUTIG66dvUo,25228
+upgini/resource_bundle/strings.properties,sha256=x-2fXtGc5Z2n7eUg9b6I4yhok56TTXDvzwU1JUaKcj4,26285
 upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
 upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/sampler/base.py,sha256=CC-DvPbrN7zp5--SVFuUqkVmdWM_5F7R0Do98ETV82U,6421
@@ -40,8 +41,8 @@ upgini/utils/blocked_time_series.py,sha256=dMz5ewk3PsoeOrc3lDzInCVPS9u_2XQkV0W6P
 upgini/utils/country_utils.py,sha256=1KXhLSNqkNYVL3on8-zK0Arc_SspUH7AMZvGZICysOU,6462
 upgini/utils/custom_loss_utils.py,sha256=DBslpjWGPt7xTeypt78baR59012SYphbPsO_YLKdilo,3972
 upgini/utils/cv_utils.py,sha256=Tn01RJvpZGZh0PUQUimlBkV-AXwe7s6yjCNFtw352Uc,3525
-upgini/utils/datetime_utils.py,sha256=5wvEz9DWL_RS4EST5FFIidfD36MSL-wij4P9AAJpMl0,8822
-upgini/utils/deduplicate_utils.py,sha256=ckJrpU8Ruc_vcwIPTopbUjyJuNiseLHNAbQlLfhUCxo,5888
+upgini/utils/datetime_utils.py,sha256=XciFOIYI4Zi7PqQS8dHxuPDEtdtwXbOrWsiAa04v2J4,10511
+upgini/utils/deduplicate_utils.py,sha256=6AbARehUCghJZ4PppFtrej2s3gFRruh41MEm6mzakHs,8607
 upgini/utils/display_utils.py,sha256=LKoSwjrE0xgS5_cqVhc2og2CQ1UCZ1nTI2VKboIhoQA,10858
 upgini/utils/email_utils.py,sha256=3CvHXTSzlgLyGsQOXfRYVfFhfPy6OXG4uXOBWRaLfHg,3479
 upgini/utils/fallback_progress_bar.py,sha256=cdbd1XGcWm4Ed4eAqV2_St3z7uC_kkH22gEyrN5ub6M,1090
@@ -51,12 +52,12 @@ upgini/utils/ip_utils.py,sha256=Zf3F2cnQmOCH09QLQHetpjMFu1PnD0cTmDymn0SnSy8,1672
 upgini/utils/phone_utils.py,sha256=JNSkF8G6mgsN8Czy11pamaJdsY6rBINEMpi7jbVt_RA,408
 upgini/utils/postal_code_utils.py,sha256=_8CR9tBqsPptQsmMUvnrCAmBaMIQSWH3JfJ4ly3x_zs,409
 upgini/utils/progress_bar.py,sha256=iNXyqT3vKCeHpfiG5HHwr7Lk2cTtKViM93Fl8iZnjGc,1564
-upgini/utils/sklearn_ext.py,sha256=fvuTWJ5AnT3ED9KSaQu_yIgW2JR19hFlaGDoVP3k60g,44027
-upgini/utils/target_utils.py,sha256=DH812qcZ7Pvf9WVVb33fbwQjb1W9h1hXRNCCiG7Y6tI,2563
+upgini/utils/sklearn_ext.py,sha256=e1aMNXk1zUt7uFnl0FcUF0zOnaXSE7z5xBHmJPknUVs,44014
+upgini/utils/target_utils.py,sha256=9K67tkY7LWhQMO-vbbPqBaO-KriAmg_6fVz5RQRaLQc,7802
 upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
 upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
-upgini-1.1.262a3250.post3.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.1.262a3250.post3.dist-info/METADATA,sha256=3IXK7QAB6WSAAiUgvdnudgEkXMCAz5e9tJQ4L35mOvE,48167
-upgini-1.1.262a3250.post3.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-upgini-1.1.262a3250.post3.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
-upgini-1.1.262a3250.post3.dist-info/RECORD,,
+upgini-1.1.274a4.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.1.274a4.dist-info/METADATA,sha256=xng0cJvEGeFT2zSBqLDy-qf9I6ONKxdKtXsFWokPpPs,48158
+upgini-1.1.274a4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+upgini-1.1.274a4.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
+upgini-1.1.274a4.dist-info/RECORD,,

{upgini-1.1.262a3250.post3.dist-info → upgini-1.1.274a4.dist-info}/LICENSE RENAMED Viewed

File without changes

{upgini-1.1.262a3250.post3.dist-info → upgini-1.1.274a4.dist-info}/WHEEL RENAMED Viewed

File without changes

{upgini-1.1.262a3250.post3.dist-info → upgini-1.1.274a4.dist-info}/top_level.txt RENAMED Viewed

File without changes

upgini 1.1.262a3250.post3__py3-none-any.whl → 1.1.274a4__py3-none-any.whl

upgini 1.1.262a3250.post3py3-none-any.whl → 1.1.274a4py3-none-any.whl