PyPI - upgini - Versions diffs - 1.1.280.dev0__py3-none-any.whl → 1.2.31a1__py3-none-any.whl - Mend

upgini 1.1.280.dev0__py3-none-any.whl → 1.2.31a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (43)

upgini/__about__.py +1 -1
upgini/__init__.py +4 -20
upgini/autofe/all_operands.py +39 -9
upgini/autofe/binary.py +148 -45
upgini/autofe/date.py +197 -26
upgini/autofe/feature.py +102 -19
upgini/autofe/groupby.py +22 -22
upgini/autofe/operand.py +9 -6
upgini/autofe/unary.py +83 -41
upgini/autofe/vector.py +8 -8
upgini/data_source/data_source_publisher.py +128 -5
upgini/dataset.py +50 -386
upgini/features_enricher.py +931 -542
upgini/http.py +27 -16
upgini/lazy_import.py +35 -0
upgini/metadata.py +84 -59
upgini/metrics.py +164 -34
upgini/normalizer/normalize_utils.py +197 -0
upgini/resource_bundle/strings.properties +66 -51
upgini/search_task.py +10 -4
upgini/utils/Roboto-Regular.ttf +0 -0
upgini/utils/base_search_key_detector.py +14 -12
upgini/utils/country_utils.py +16 -0
upgini/utils/custom_loss_utils.py +39 -36
upgini/utils/datetime_utils.py +98 -45
upgini/utils/deduplicate_utils.py +135 -112
upgini/utils/display_utils.py +46 -15
upgini/utils/email_utils.py +54 -16
upgini/utils/feature_info.py +172 -0
upgini/utils/features_validator.py +34 -20
upgini/utils/ip_utils.py +100 -1
upgini/utils/phone_utils.py +343 -0
upgini/utils/postal_code_utils.py +34 -0
upgini/utils/sklearn_ext.py +28 -19
upgini/utils/target_utils.py +113 -57
upgini/utils/warning_counter.py +1 -0
upgini/version_validator.py +8 -4
{upgini-1.1.280.dev0.dist-info → upgini-1.2.31a1.dist-info}/METADATA +31 -16
upgini-1.2.31a1.dist-info/RECORD +65 -0
upgini/normalizer/phone_normalizer.py +0 -340
upgini-1.1.280.dev0.dist-info/RECORD +0 -62
{upgini-1.1.280.dev0.dist-info → upgini-1.2.31a1.dist-info}/WHEEL +0 -0
{upgini-1.1.280.dev0.dist-info → upgini-1.2.31a1.dist-info}/licenses/LICENSE +0 -0

upgini/autofe/feature.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import hashlib
 import itertools
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Set, Tuple, Union
 import numpy as np
 import pandas as pd
@@ -16,6 +16,15 @@ class Column:
         self.data = data
         self.calculate_all = calculate_all
+    def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
+        return self.name
+    def set_op_params(self, params: Dict[str, str]) -> "Column":
+        return self
+    def get_op_params(self, **kwargs):
+        return dict()
     def rename_columns(self, mapping: Dict[str, str]) -> "Column":
         self.name = self._unhash(mapping.get(self.name) or self.name)
         return self
@@ -35,9 +44,13 @@ class Column:
     def get_column_nodes(self) -> List["Column"]:
         return [self]
-    def get_columns(self) -> List[str]:
+    def get_columns(self, **kwargs) -> List[str]:
         return [self.name]
+    @property
+    def children(self) -> List[Union["Feature", "Column"]]:
+        return []
     def infer_type(self, data: pd.DataFrame) -> DtypeObj:
         return data[self.name].dtype
@@ -51,6 +64,12 @@ class Column:
     def to_pretty_formula(self) -> str:
         return self.to_formula()
+    def __eq__(self, value: object) -> bool:
+        if not isinstance(value, Column):
+            return False
+        else:
+            return self.name == value.name and self.calculate_all == value.calculate_all
 class Feature:
     def __init__(
@@ -69,19 +88,51 @@ class Feature:
         self.cached_display_name = cached_display_name
         self.alias = alias
-    def set_op_params(self, params: Dict[str, str]) -> "Feature":
+    def set_op_params(self, params: Optional[Dict[str, str]]) -> "Feature":
+        obj_dict = self.op.dict().copy()
+        obj_dict.update(params or {})
+        self.op = self.op.__class__.parse_obj(obj_dict)
         self.op.set_params(params)
+        for child in self.children:
+            child_params = {
+                k[len(child.get_display_name()) + 1 :]: v
+                for k, v in params.items()
+                if k.startswith(child.get_display_name())
+            }
+            if not child_params:
+                child_params = params
+            child.set_op_params(child_params)
         return self
+    def get_op_params(self, **kwargs) -> Dict[str, str]:
+        return {
+            k: str(v)
+            for k, v in dict(
+                (
+                    (f"{child.get_display_name(**kwargs)}_{k}", v)
+                    for child in self.children
+                    for k, v in child.get_op_params(**kwargs).items()
+                ),
+                **(self.op.get_params() or {}),
+            ).items()
+            if v is not None
+        }
     def get_hash(self) -> str:
-        return hashlib.sha256("_".join([self.op.name] + [ch.name for ch in self.children]).encode("utf-8")).hexdigest()[
-            :8
-        ]
+        return hashlib.sha256(
+            "_".join([self.op.name] + [ch.get_display_name() for ch in self.children]).encode("utf-8")
+        ).hexdigest()[:8]
     def set_alias(self, alias: str) -> "Feature":
         self.alias = alias
         return self
+    def get_all_operand_names(self) -> Set[str]:
+        return {self.op.name}.union(
+            {n for f in self.children if isinstance(f, Feature) for n in f.get_all_operand_names()}
+        )
     def rename_columns(self, mapping: Dict[str, str]) -> "Feature":
         for child in self.children:
             child.rename_columns(mapping)
@@ -108,19 +159,24 @@ class Feature:
         for child in self.children:
             child.delete_data()
+    def get_op_display_name(self) -> str:
+        return self.op.alias or self.op.name.lower()
     def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
         if self.cached_display_name is not None and cache:
             return self.cached_display_name
+        should_stack_op = not isinstance(self.children[0], Column) if self.op.is_unary else False
+        prev_name = [self.children[0].get_op_display_name()] if should_stack_op else []
         if self.alias:
             components = ["f_autofe", self.alias]
-        elif shorten and not self.op.is_unary:
-            components = ["f_autofe", self.op.alias or self.op.name.lower()]
+        elif shorten and (not self.op.is_unary or should_stack_op):
+            components = ["f_autofe"] + prev_name + [self.get_op_display_name()]
         else:
-            components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + [
-                "autofe",
-                self.op.alias or self.op.name.lower(),
-            ]
+            components = (
+                ["f_" + "_f_".join(self.get_columns(**kwargs))] + ["autofe"] + prev_name + [self.get_op_display_name()]
+            )
         components.extend([str(self.display_index)] if self.display_index is not None else [])
         display_name = "_".join(components)
@@ -211,12 +267,19 @@ class Feature:
     @staticmethod
     def from_formula(string: str) -> Union[Column, "Feature"]:
-        if string[-1] != ")":
-            return Column(string)
         def is_trivial_char(c: str) -> bool:
             return c not in "()+-*/,"
+        if string[-1] != ")":
+            if all(is_trivial_char(c) for c in string):
+                return Column(string)
+            else:
+                raise ValueError(
+                    f"Unsupported column name: {string}. Column names should not have characters: "
+                    "['(', ')', '+', '-', '*', '/', ',']"
+                )
         def find_prev(string: str) -> int:
             if string[-1] != ")":
                 return max([(0 if is_trivial_char(c) else i + 1) for i, c in enumerate(string)])
@@ -238,8 +301,11 @@ class Feature:
             return Feature(find_op(string[: p2 - 1]), [Feature.from_formula(string[p2:-1])])
         p1 = find_prev(string[: p2 - 1])
         if string[0] == "(":
+            op = find_op(string[p2 - 1])
+            if op is None:
+                raise ValueError(f"Unsupported operand: {string[p2 - 1]}")
             return Feature(
-                find_op(string[p2 - 1]),
+                op,
                 [Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
             )
         else:
@@ -250,6 +316,8 @@ class Feature:
                     [Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
                 )
             else:
+                if string[p1 - 1] == "(":
+                    raise ValueError(f"Unsupported operand: {string[: p1 - 1]}")
                 base_features = [
                     Feature.from_formula(string[p2:-1]),
                     Feature.from_formula(string[p1 : p2 - 1]),
@@ -286,11 +354,26 @@ class FeatureGroup:
         return names
     def calculate(self, data: pd.DataFrame, is_root=False) -> pd.DataFrame:
-        main_column = None if self.main_column_node is None else self.main_column_node.get_columns()[0]
         if isinstance(self.op, PandasOperand):
-            columns = self.get_columns()
-            new_data = self.op.calculate_group(data[columns], main_column=main_column)
-            new_data.rename(columns=dict(zip(columns, self.get_display_names())), inplace=True)
+            main_column = None if self.main_column_node is None else self.main_column_node.get_display_name()
+            lower_order_children = []
+            if self.main_column_node is not None:
+                lower_order_children.append(self.main_column_node)
+            lower_order_children.extend(
+                ch for f in self.children for ch in f.children if ch.get_display_name() != main_column
+            )
+            lower_order_names = [ch.get_display_name() for ch in lower_order_children]
+            child_data = pd.concat(
+                [ch.calculate(data) for ch in lower_order_children],
+                axis=1,
+            )
+            child_data.columns = lower_order_names
+            new_data = self.op.calculate_group(child_data, main_column=main_column)
+            new_data.rename(
+                columns=dict(zip((n for n in lower_order_names if n != main_column), self.get_display_names())),
+                inplace=True,
+            )
         else:
             raise NotImplementedError(f"Unrecognized operator {self.op.name}.")

upgini/autofe/groupby.py CHANGED Viewed

@@ -7,9 +7,9 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
 class GroupByThenAgg(PandasOperand, VectorizableMixin):
     agg: Optional[str]
-    is_vectorizable = True
-    is_grouping = True
-    is_distribution_dependent = True
+    is_vectorizable: bool = True
+    is_grouping: bool = True
+    is_distribution_dependent: bool = True
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         temp = left.groupby(right).agg(self.agg)
@@ -24,17 +24,17 @@ class GroupByThenAgg(PandasOperand, VectorizableMixin):
 class GroupByThenMedian(GroupByThenAgg):
-    name = "GroupByThenMedian"
-    pandas_agg = "median"
-    is_distribution_dependent = True
+    name: str = "GroupByThenMedian"
+    pandas_agg: str = "median"
+    is_distribution_dependent: bool = True
 class GroupByThenRank(PandasOperand, VectorizableMixin):
-    name = "GroupByThenRank"
-    is_vectorizable = True
-    is_grouping = True
-    output_type = "float"
-    is_distribution_dependent = True
+    name: str = "GroupByThenRank"
+    is_vectorizable: bool = True
+    is_grouping: bool = True
+    output_type: Optional[str] = "float"
+    is_distribution_dependent: bool = True
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         temp = pd.DataFrame(left[~right.isna()].groupby(right).rank(ascending=True, pct=True)).reset_index()
@@ -49,12 +49,12 @@ class GroupByThenRank(PandasOperand, VectorizableMixin):
 class GroupByThenNUnique(PandasOperand, VectorizableMixin):
-    name = "GroupByThenNUnique"
-    is_vectorizable = True
-    is_grouping = True
-    output_type = "int"
-    is_distribution_dependent = True
-    input_type = "discrete"
+    name: str = "GroupByThenNUnique"
+    is_vectorizable: bool = True
+    is_grouping: bool = True
+    output_type: Optional[str] = "int"
+    is_distribution_dependent: bool = True
+    input_type: Optional[str] = "discrete"
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         nunique = left.groupby(right).nunique()
@@ -69,11 +69,11 @@ class GroupByThenNUnique(PandasOperand, VectorizableMixin):
 class GroupByThenFreq(PandasOperand):
-    name = "GroupByThenFreq"
-    is_grouping = True
-    output_type = "float"
-    is_distribution_dependent = True
-    input_type = "discrete"
+    name: str = "GroupByThenFreq"
+    is_grouping: bool = True
+    output_type: Optional[str] = "float"
+    is_distribution_dependent: bool = True
+    input_type: Optional[str] = "discrete"
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         def _f(x):

upgini/autofe/operand.py CHANGED Viewed

@@ -8,25 +8,28 @@ from pydantic import BaseModel
 class Operand(BaseModel):
     name: str
-    alias: Optional[str]
+    alias: Optional[str] = None
     is_unary: bool = False
+    is_symmetrical: bool = False
     has_symmetry_importance: bool = False
-    input_type: Optional[str]
-    output_type: Optional[str]
+    input_type: Optional[str] = None
+    output_type: Optional[str] = None
     is_categorical: bool = False
     is_vectorizable: bool = False
     is_grouping: bool = False
     is_binary: bool = False
     is_vector: bool = False
     is_distribution_dependent: bool = False
-    params: Optional[Dict[str, str]]
+    params: Optional[Dict[str, str]] = None
     def set_params(self, params: Dict[str, str]):
         self.params = params
         return self
-    def get_params(self) -> Dict[str, str]:
-        return self.params
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = {"alias": self.alias}
+        res.update(self.params or {})
+        return res
 MAIN_COLUMN = "main_column"

upgini/autofe/unary.py CHANGED Viewed

@@ -1,3 +1,4 @@
+from typing import Dict, Optional
 import numpy as np
 import pandas as pd
@@ -5,24 +6,26 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
 class Abs(PandasOperand, VectorizableMixin):
-    name = "abs"
-    is_unary = True
-    is_vectorizable = True
-    group_index = 0
+    name: str = "abs"
+    is_unary: bool = True
+    is_vectorizable: bool = True
+    group_index: int = 0
     def calculate_unary(self, data: pd.Series) -> pd.Series:
-        return data.abs()
+        return data.astype(np.float64).abs()
+        # return data.abs()
     def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
-        return data.abs()
+        return data.astype(np.float64).abs()
+        # return data.abs()
 class Log(PandasOperand, VectorizableMixin):
-    name = "log"
-    is_unary = True
-    is_vectorizable = True
-    output_type = "float"
-    group_index = 0
+    name: str = "log"
+    is_unary: bool = True
+    is_vectorizable: bool = True
+    output_type: Optional[str] = "float"
+    group_index: int = 0
     def calculate_unary(self, data: pd.Series) -> pd.Series:
         return self._round_value(np.log(np.abs(data.replace(0, np.nan))), 10)
@@ -32,11 +35,11 @@ class Log(PandasOperand, VectorizableMixin):
 class Sqrt(PandasOperand, VectorizableMixin):
-    name = "sqrt"
-    is_unary = True
-    is_vectorizable = True
-    output_type = "float"
-    group_index = 0
+    name: str = "sqrt"
+    is_unary: bool = True
+    is_vectorizable: bool = True
+    output_type: Optional[str] = "float"
+    group_index: int = 0
     def calculate_unary(self, data: pd.Series) -> pd.Series:
         return self._round_value(np.sqrt(np.abs(data)))
@@ -46,10 +49,10 @@ class Sqrt(PandasOperand, VectorizableMixin):
 class Square(PandasOperand, VectorizableMixin):
-    name = "square"
-    is_unary = True
-    is_vectorizable = True
-    group_index = 0
+    name: str = "square"
+    is_unary: bool = True
+    is_vectorizable: bool = True
+    group_index: int = 0
     def calculate_unary(self, data: pd.Series) -> pd.Series:
         return np.square(data)
@@ -59,11 +62,11 @@ class Square(PandasOperand, VectorizableMixin):
 class Sigmoid(PandasOperand, VectorizableMixin):
-    name = "sigmoid"
-    is_unary = True
-    is_vectorizable = True
-    output_type = "float"
-    group_index = 0
+    name: str = "sigmoid"
+    is_unary: bool = True
+    is_vectorizable: bool = True
+    output_type: Optional[str] = "float"
+    group_index: int = 0
     def calculate_unary(self, data: pd.Series) -> pd.Series:
         return self._round_value(1 / (1 + np.exp(-data)))
@@ -73,12 +76,12 @@ class Sigmoid(PandasOperand, VectorizableMixin):
 class Floor(PandasOperand, VectorizableMixin):
-    name = "floor"
-    is_unary = True
-    is_vectorizable = True
-    output_type = "int"
-    input_type = "continuous"
-    group_index = 0
+    name: str = "floor"
+    is_unary: bool = True
+    is_vectorizable: bool = True
+    output_type: Optional[str] = "int"
+    input_type: Optional[str] = "continuous"
+    group_index: int = 0
     def calculate_unary(self, data: pd.Series) -> pd.Series:
         return np.floor(data)
@@ -88,11 +91,11 @@ class Floor(PandasOperand, VectorizableMixin):
 class Residual(PandasOperand, VectorizableMixin):
-    name = "residual"
-    is_unary = True
-    is_vectorizable = True
-    input_type = "continuous"
-    group_index = 0
+    name: str = "residual"
+    is_unary: bool = True
+    is_vectorizable: bool = True
+    input_type: Optional[str] = "continuous"
+    group_index: int = 0
     def calculate_unary(self, data: pd.Series) -> pd.Series:
         return data - np.floor(data)
@@ -102,12 +105,51 @@ class Residual(PandasOperand, VectorizableMixin):
 class Freq(PandasOperand):
-    name = "freq"
-    is_unary = True
-    output_type = "float"
-    is_distribution_dependent = True
-    input_type = "discrete"
+    name: str = "freq"
+    is_unary: bool = True
+    output_type: Optional[str] = "float"
+    is_distribution_dependent: bool = True
+    input_type: Optional[str] = "discrete"
     def calculate_unary(self, data: pd.Series) -> pd.Series:
         value_counts = data.value_counts(normalize=True)
         return self._loc(data, value_counts)
+class Norm(PandasOperand):
+    name: str = "norm"
+    is_unary: bool = True
+    output_type: Optional[str] = "float"
+    norm: Optional[float] = None
+    def calculate_unary(self, data: pd.Series) -> pd.Series:
+        data_dropna = data.dropna()
+        if data_dropna.empty:
+            return data
+        if self.norm is not None:
+            normalized_data = data / self.norm
+        else:
+            self.norm = np.sqrt(np.sum(data * data))
+            normalized_data = data / self.norm
+        return normalized_data
+    def set_params(self, params: Dict[str, str]):
+        super().set_params(params)
+        if params is not None and "norm" in params:
+            self.norm = float(params["norm"])
+        return self
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = super().get_params()
+        if self.norm is not None:
+            res["norm"] = self.norm
+        return res
+class Embeddings(PandasOperand):
+    name: str = "emb"
+    is_unary: bool = True
+    input_type: Optional[str] = "string"
+    output_type: Optional[str] = "vector"

upgini/autofe/vector.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Optional
 import pandas as pd
@@ -6,19 +6,19 @@ from upgini.autofe.operand import PandasOperand, VectorizableMixin
 class Mean(PandasOperand, VectorizableMixin):
-    name = "mean"
-    output_type = "float"
-    is_vector = True
-    group_index = 0
+    name: str = "mean"
+    output_type: Optional[str] = "float"
+    is_vector: bool = True
+    group_index: int = 0
     def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
         return pd.DataFrame(data).T.fillna(0).mean(axis=1)
 class Sum(PandasOperand, VectorizableMixin):
-    name = "sum"
-    is_vector = True
-    group_index = 0
+    name: str = "sum"
+    is_vector: bool = True
+    group_index: int = 0
     def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
         return pd.DataFrame(data).T.fillna(0).sum(axis=1)

upgini 1.1.280.dev0__py3-none-any.whl → 1.2.31a1__py3-none-any.whl

Potentially problematic release.

upgini 1.1.280.dev0__py3-none-any.whl → 1.2.31a1__py3-none-any.whl