PyPI - upgini - Versions diffs - 1.1.280a3418.post2__py3-none-any.whl → 1.2.31__py3-none-any.whl - Mend

upgini 1.1.280a3418.post2__py3-none-any.whl → 1.2.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (43)

upgini/__about__.py +1 -1
upgini/__init__.py +4 -20
upgini/autofe/all_operands.py +39 -10
upgini/autofe/binary.py +148 -45
upgini/autofe/date.py +197 -26
upgini/autofe/feature.py +102 -19
upgini/autofe/groupby.py +22 -22
upgini/autofe/operand.py +9 -6
upgini/autofe/unary.py +78 -54
upgini/autofe/vector.py +8 -8
upgini/data_source/data_source_publisher.py +128 -5
upgini/dataset.py +50 -386
upgini/features_enricher.py +936 -541
upgini/http.py +27 -16
upgini/lazy_import.py +35 -0
upgini/metadata.py +84 -59
upgini/metrics.py +164 -34
upgini/normalizer/normalize_utils.py +197 -0
upgini/resource_bundle/strings.properties +66 -51
upgini/search_task.py +10 -4
upgini/utils/Roboto-Regular.ttf +0 -0
upgini/utils/base_search_key_detector.py +14 -12
upgini/utils/country_utils.py +16 -0
upgini/utils/custom_loss_utils.py +39 -36
upgini/utils/datetime_utils.py +98 -45
upgini/utils/deduplicate_utils.py +135 -112
upgini/utils/display_utils.py +46 -15
upgini/utils/email_utils.py +54 -16
upgini/utils/feature_info.py +172 -0
upgini/utils/features_validator.py +34 -20
upgini/utils/ip_utils.py +100 -1
upgini/utils/phone_utils.py +343 -0
upgini/utils/postal_code_utils.py +34 -0
upgini/utils/sklearn_ext.py +28 -19
upgini/utils/target_utils.py +113 -57
upgini/utils/warning_counter.py +1 -0
upgini/version_validator.py +8 -4
{upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31.dist-info}/METADATA +31 -16
upgini-1.2.31.dist-info/RECORD +65 -0
upgini/normalizer/phone_normalizer.py +0 -340
upgini-1.1.280a3418.post2.dist-info/RECORD +0 -62
{upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31.dist-info}/WHEEL +0 -0
{upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31.dist-info}/licenses/LICENSE +0 -0

upgini/utils/phone_utils.py CHANGED Viewed

@@ -1,5 +1,10 @@
+from typing import Optional
+import numpy as np
 import pandas as pd
+from pandas.api.types import is_float_dtype, is_object_dtype, is_string_dtype
+from upgini.errors import ValidationError
 from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
@@ -9,3 +14,341 @@ class PhoneSearchKeyDetector(BaseSearchKeyDetector):
     def _is_search_key_by_values(self, column: pd.Series) -> bool:
         return False
+class PhoneSearchKeyConverter:
+    def __init__(self, phone_column: str, country_column: Optional[str] = None):
+        self.phone_column = phone_column
+        self.country_column = country_column
+    def convert(self, df: pd.DataFrame) -> pd.DataFrame:
+        df = self.phone_to_int(df)
+        if self.country_column is not None:
+            df[self.phone_column] = df.apply(self.add_prefix, axis=1)
+        df[self.phone_column] = df[self.phone_column].astype("Int64")
+        return df
+    def add_prefix(self, row):
+        phone = row[self.phone_column]
+        if pd.isna(phone):
+            return phone
+        country = row[self.country_column]
+        country_prefix_tuple = self.COUNTRIES_PREFIXES.get(country)
+        if country_prefix_tuple is not None:
+            country_prefix, number_of_digits = country_prefix_tuple
+            if len(str(phone)) == number_of_digits:
+                return int(country_prefix + str(phone))
+        return phone
+    def phone_to_int(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Convention: phone number is always presented as int number.
+        phone_number = Country code + National Destination Code + Subscriber Number.
+        Examples:
+        41793834315     for Switzerland
+        46767040672     for Sweden
+        861065529988    for China
+        18143008198     for the USA
+        Inplace conversion of phone to int.
+        Method will remove all non numeric chars from string and convert it to int.
+        None will be set for phone numbers that couldn"t be converted to int
+        """
+        if is_string_dtype(df[self.phone_column]) or is_object_dtype(df[self.phone_column]):
+            convert_func = self.phone_str_to_int_safe
+        elif is_float_dtype(df[self.phone_column]):
+            convert_func = self.phone_float_to_int_safe
+        elif df[self.phone_column].dtype == np.int64 or isinstance(
+            df[self.phone_column].dtype, pd.Int64Dtype
+        ):
+            convert_func = self.phone_int_to_int_safe
+        else:
+            raise ValidationError(
+                f"phone_column_name {self.phone_column} doesn't have supported dtype. "
+                f"Dataset dtypes: {df.dtypes}. "
+                f"Contact developer and request to implement conversion of {self.phone_column} to int"
+            )
+        df[self.phone_column] = df[self.phone_column].apply(convert_func).astype("Int64")
+        return df
+    @staticmethod
+    def phone_float_to_int_safe(value: float) -> Optional[int]:
+        try:
+            return PhoneSearchKeyConverter.validate_length(int(value))
+        except Exception:
+            return None
+    @staticmethod
+    def phone_int_to_int_safe(value: int) -> Optional[int]:
+        try:
+            return PhoneSearchKeyConverter.validate_length(int(value))
+        except Exception:
+            return None
+    @staticmethod
+    def phone_str_to_int_safe(value: str) -> Optional[int]:
+        try:
+            value = str(value)
+            if value.endswith(".0"):
+                value = value[: len(value) - 2]
+            numeric_filter = filter(str.isdigit, value)
+            numeric_string = "".join(numeric_filter)
+            return PhoneSearchKeyConverter.validate_length(int(numeric_string))
+        except Exception:
+            return None
+    @staticmethod
+    def validate_length(value: int) -> Optional[int]:
+        if value < 10000000 or value > 999999999999999:
+            return None
+        else:
+            return value
+    COUNTRIES_PREFIXES = {
+        "US": ("1", 10),
+        "CA": ("1", 10),
+        "AI": ("1", 10),
+        "AG": ("1", 10),
+        "AS": ("1", 10),
+        "BB": ("1", 10),
+        "BS": ("1", 10),
+        "VG": ("1", 10),
+        "VI": ("1", 10),
+        "KY": ("1", 10),
+        "BM": ("1", 10),
+        "GD": ("1", 10),
+        "TC": ("1", 10),
+        "MS": ("1", 10),
+        "MP": ("1", 10),
+        "GU": ("1", 10),
+        "SX": ("1", 10),
+        "LC": ("1", 10),
+        "DM": ("1", 10),
+        "VC": ("1", 10),
+        "PR": ("1", 10),
+        "TT": ("1", 10),
+        "KN": ("1", 10),
+        "JM": ("1", 10),
+        "EG": ("20", 9),
+        "SS": ("211", 9),
+        "MA": ("212", 9),
+        "EH": ("212", 4),
+        "DZ": ("213", 8),
+        "TN": ("216", 8),
+        "LY": ("218", 9),
+        "GM": ("220", 6),
+        "SN": ("221", 9),
+        "MR": ("222", 7),
+        "ML": ("223", 8),
+        "GN": ("224", 9),
+        "CI": ("225", 7),
+        "BF": ("226", 8),
+        "NE": ("227", 8),
+        "TG": ("228", 8),
+        "BJ": ("229", 8),
+        "MU": ("230", 7),
+        "LR": ("231", 9),
+        "SL": ("232", 8),
+        "GH": ("233", 9),
+        "NG": ("234", 9),
+        "TD": ("235", 8),
+        "CF": ("236", 7),
+        "CM": ("237", 9),
+        "CV": ("238", 7),
+        "ST": ("239", 7),
+        "GQ": ("240", 9),
+        "GA": ("241", 8),
+        "CG": ("242", 7),
+        "CD": ("243", 9),
+        "AO": ("244", 9),
+        "GW": ("245", 6),
+        "IO": ("246", 7),
+        "AC": ("247", 5),
+        "SC": ("248", 7),
+        "SD": ("249", 9),
+        "RW": ("250", 9),
+        "ET": ("251", 9),
+        "SO": ("252", 9),
+        "DJ": ("253", 8),
+        "KE": ("254", 9),
+        "TZ": ("255", 9),
+        "UG": ("256", 9),
+        "BI": ("257", 8),
+        "MZ": ("258", 8),
+        "ZM": ("260", 9),
+        "MG": ("261", 9),
+        "RE": ("262", 9),
+        "YT": ("262", 9),
+        "TF": ("262", 9),
+        "ZW": ("263", 9),
+        "NA": ("264", 9),
+        "MW": ("265", 7),
+        "LS": ("266", 8),
+        "BW": ("267", 7),
+        "SZ": ("268", 8),
+        "KM": ("269", 7),
+        "ZA": ("27", 10),
+        "SH": ("290", 5),
+        "TA": ("290", 5),
+        "ER": ("291", 7),
+        "AT": ("43", 10),
+        "AW": ("297", 7),
+        "FO": ("298", 6),
+        "GL": ("299", 6),
+        "GR": ("30", 10),
+        "BE": ("32", 8),
+        "FR": ("33", 9),
+        "ES": ("34", 9),
+        "GI": ("350", 8),
+        "PE": ("51", 8),
+        "MX": ("52", 10),
+        "CU": ("53", 8),
+        "AR": ("54", 10),
+        "BR": ("55", 10),
+        "CL": ("56", 9),
+        "CO": ("57", 8),
+        "VE": ("58", 10),
+        "PT": ("351", 9),
+        "LU": ("352", 8),
+        "IE": ("353", 8),
+        "IS": ("354", 7),
+        "AL": ("355", 8),
+        "MT": ("356", 8),
+        "CY": ("357", 8),
+        "FI": ("358", 9),
+        "BG": ("359", 8),
+        "HU": ("36", 8),
+        "LT": ("370", 8),
+        "LV": ("371", 8),
+        "EE": ("372", 7),
+        "MD": ("373", 8),
+        "AM": ("374", 8),
+        "BY": ("375", 9),
+        "AD": ("376", 6),
+        "MC": ("377", 8),
+        "SM": ("378", 9),
+        "VA": ("3906698", 5),
+        "UA": ("380", 9),
+        "RS": ("381", 9),
+        "ME": ("382", 8),
+        "HR": ("385", 8),
+        "SI": ("386", 8),
+        "BA": ("387", 8),
+        "MK": ("389", 8),
+        "MY": ("60", 9),
+        "AU": ("61", 9),
+        "CX": ("61", 9),
+        "CC": ("61", 9),
+        "ID": ("62", 9),
+        "PH": ("632", 7),
+        "NZ": ("64", 8),
+        "PN": ("64", 8),
+        "SG": ("65", 8),
+        "TH": ("66", 8),
+        "IT": ("39", 10),
+        "RO": ("40", 9),
+        "CH": ("41", 9),
+        "CZ": ("420", 9),
+        "SK": ("421", 9),
+        "GB": ("44", 10),
+        "LI": ("423", 7),
+        "GG": ("44", 10),
+        "IM": ("44", 10),
+        "JE": ("44", 10),
+        "DK": ("45", 8),
+        "SE": ("46", 8),
+        "BD": ("880", 8),
+        "TW": ("886", 9),
+        "JP": ("81", 9),
+        "KR": ("82", 9),
+        "VN": ("84", 10),
+        "KP": ("850", 8),
+        "HK": ("852", 8),
+        "MO": ("853", 8),
+        "KH": ("855", 8),
+        "LA": ("856", 8),
+        "NO": ("47", 8),
+        "SJ": ("47", 8),
+        "BV": ("47", 8),
+        "PL": ("48", 9),
+        "DE": ("49", 10),
+        "TR": ("90", 10),
+        "IN": ("91", 10),
+        "PK": ("92", 9),
+        "AF": ("93", 9),
+        "LK": ("94", 9),
+        "MM": ("95", 7),
+        "IR": ("98", 10),
+        "MV": ("960", 7),
+        "LB": ("961", 7),
+        "JO": ("962", 9),
+        "SY": ("963", 10),
+        "IQ": ("964", 10),
+        "KW": ("965", 7),
+        "SA": ("966", 9),
+        "YE": ("967", 7),
+        "OM": ("968", 8),
+        "PS": ("970", 8),
+        "AE": ("971", 8),
+        "IL": ("972", 9),
+        "BH": ("973", 8),
+        "QA": ("974", 8),
+        "BT": ("975", 7),
+        "MN": ("976", 8),
+        "NP": ("977", 8),
+        "TJ": ("992", 9),
+        "TM": ("993", 8),
+        "AZ": ("994", 9),
+        "GE": ("995", 9),
+        "KG": ("996", 9),
+        "UZ": ("998", 9),
+        "FK": ("500", 5),
+        "BZ": ("501", 7),
+        "GT": ("502", 8),
+        "SV": ("503", 8),
+        "HN": ("504", 8),
+        "NI": ("505", 8),
+        "CR": ("506", 8),
+        "PA": ("507", 7),
+        "PM": ("508", 6),
+        "HT": ("509", 8),
+        "GS": ("500", 5),
+        "MF": ("590", 9),
+        "BL": ("590", 9),
+        "GP": ("590", 9),
+        "BO": ("591", 9),
+        "GY": ("592", 9),
+        "EC": ("593", 9),
+        "GF": ("594", 9),
+        "PY": ("595", 9),
+        "MQ": ("596", 9),
+        "SR": ("597", 9),
+        "UY": ("598", 9),
+        "CW": ("599", 9),
+        "BQ": ("599", 9),
+        "RU": ("7", 10),
+        "KZ": ("7", 10),
+        "TL": ("670", 7),
+        "NF": ("672", 7),
+        "HM": ("672", 7),
+        "BN": ("673", 7),
+        "NR": ("674", 7),
+        "PG": ("675", 7),
+        "TO": ("676", 7),
+        "SB": ("677", 7),
+        "VU": ("678", 7),
+        "FJ": ("679", 7),
+        "PW": ("680", 7),
+        "WF": ("681", 7),
+        "CK": ("682", 5),
+        "NU": ("683", 7),
+        "WS": ("685", 7),
+        "KI": ("686", 7),
+        "NC": ("687", 7),
+        "TV": ("688", 7),
+        "PF": ("689", 7),
+        "TK": ("690", 7),
+        "FM": ("691", 7),
+        "MH": ("692", 7),
+    }

upgini/utils/postal_code_utils.py CHANGED Viewed

@@ -1,4 +1,9 @@
 import pandas as pd
+from pandas.api.types import (
+    is_float_dtype,
+    is_object_dtype,
+    is_string_dtype,
+)
 from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
@@ -9,3 +14,32 @@ class PostalCodeSearchKeyDetector(BaseSearchKeyDetector):
     def _is_search_key_by_values(self, column: pd.Series) -> bool:
         return False
+class PostalCodeSearchKeyConverter:
+    def __init__(self, postal_code_column: str):
+        self.postal_code_column = postal_code_column
+    def convert(self, df: pd.DataFrame) -> pd.DataFrame:
+        if is_string_dtype(df[self.postal_code_column]) or is_object_dtype(df[self.postal_code_column]):
+            try:
+                df[self.postal_code_column] = (
+                    df[self.postal_code_column].astype("string").astype("float64").astype("Int64").astype("string")
+                )
+            except Exception:
+                pass
+        elif is_float_dtype(df[self.postal_code_column]):
+            df[self.postal_code_column] = df[self.postal_code_column].astype("Int64").astype("string")
+        df[self.postal_code_column] = (
+            df[self.postal_code_column]
+            .astype("string")
+            .str.upper()
+            .str.replace(r"[^0-9A-Z]", "", regex=True)  # remove non alphanumeric characters
+            .str.replace(r"^0+\B", "", regex=True)  # remove leading zeros
+        )
+        # if (df[self.postal_code_column] == "").all():
+        #     raise ValidationError(self.bundle.get("invalid_postal_code").format(self.postal_code_column))
+        return df

upgini/utils/sklearn_ext.py CHANGED Viewed

@@ -17,7 +17,7 @@ from sklearn.base import clone, is_classifier
 from sklearn.exceptions import FitFailedWarning, NotFittedError
 from sklearn.metrics import check_scoring
 from sklearn.metrics._scorer import _MultimetricScorer
-from sklearn.model_selection import check_cv
+from sklearn.model_selection import StratifiedKFold, check_cv
 from sklearn.utils.fixes import np_version, parse_version
 from sklearn.utils.validation import indexable
@@ -312,25 +312,34 @@ def cross_validate(
                 ret[key] = train_scores_dict[name]
         return ret
-    except Exception:
+    except ValueError as e:
         # logging.exception("Failed to execute overriden cross_validate. Fallback to original")
-        raise
-        # fit_params["use_best_model"] = False
-        # return original_cross_validate(
-        #     estimator,
-        #     X,
-        #     y,
-        #     groups=groups,
-        #     scoring=scoring,
-        #     cv=cv,
-        #     n_jobs=n_jobs,
-        #     verbose=verbose,
-        #     fit_params=fit_params,
-        #     pre_dispatch=pre_dispatch,
-        #     return_train_score=return_train_score,
-        #     return_estimator=return_estimator,
-        #     error_score=error_score,
-        # )
+        if hasattr(e, "args") and len(e.args) > 0 and "Only one class present in y_true" in e.args[0]:
+            # Try change CV to StratifiedKFold and retry
+            if hasattr(cv, "shuffle"):
+                shuffle = cv.shuffle
+            else:
+                shuffle = False
+            if hasattr(cv, "random_state") and shuffle:
+                random_state = cv.random_state
+            else:
+                random_state = None
+            return cross_validate(
+                estimator,
+                x,
+                y,
+                groups=groups,
+                scoring=scoring,
+                cv=StratifiedKFold(n_splits=cv.get_n_splits(), shuffle=shuffle, random_state=random_state),
+                n_jobs=n_jobs,
+                verbose=verbose,
+                fit_params=fit_params,
+                pre_dispatch=pre_dispatch,
+                return_train_score=return_train_score,
+                return_estimator=return_estimator,
+                error_score=error_score,
+            )
+        raise e
 def _fit_and_score(

upgini 1.1.280a3418.post2__py3-none-any.whl → 1.2.31__py3-none-any.whl

Potentially problematic release.

upgini 1.1.280a3418.post2__py3-none-any.whl → 1.2.31__py3-none-any.whl