PyPI - upgini - Versions diffs - 1.1.312a5__py3-none-any.whl → 1.1.313a3511.dev1__py3-none-any.whl - Mend

upgini 1.1.312a5__py3-none-any.whl → 1.1.313a3511.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (21) hide show

upgini/__about__.py +1 -1
upgini/autofe/all_operands.py +26 -7
upgini/autofe/binary.py +95 -4
upgini/autofe/date.py +26 -6
upgini/autofe/feature.py +25 -11
upgini/autofe/unary.py +7 -0
upgini/dataset.py +386 -33
upgini/features_enricher.py +145 -295
upgini/metadata.py +1 -16
upgini/normalizer/phone_normalizer.py +340 -0
upgini/utils/country_utils.py +0 -16
upgini/utils/datetime_utils.py +16 -38
upgini/utils/email_utils.py +17 -49
upgini/utils/ip_utils.py +1 -100
upgini/utils/phone_utils.py +0 -345
upgini/utils/postal_code_utils.py +0 -34
{upgini-1.1.312a5.dist-info → upgini-1.1.313a3511.dev1.dist-info}/METADATA +3 -1
{upgini-1.1.312a5.dist-info → upgini-1.1.313a3511.dev1.dist-info}/RECORD +20 -20
{upgini-1.1.312a5.dist-info → upgini-1.1.313a3511.dev1.dist-info}/WHEEL +1 -1
upgini/normalizer/normalize_utils.py +0 -203
{upgini-1.1.312a5.dist-info → upgini-1.1.313a3511.dev1.dist-info}/licenses/LICENSE +0 -0

upgini/__about__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "1.1.312a5"
1	+ __version__ = "1.1.313a3511.dev1"

upgini/autofe/all_operands.py CHANGED Viewed

@@ -1,6 +1,20 @@
 from typing import Dict
-from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
+from upgini.autofe.binary import (
+    Add,
+    Combine,
+    CombineThenFreq,
+    Distance,
+    Divide,
+    JaroWinklerSim1,
+    JaroWinklerSim2,
+    LevenshteinSim,
+    Max,
+    Min,
+    Multiply,
+    Sim,
+    Subtract,
+)
 from upgini.autofe.date import (
     DateDiff,
     DateDiffType2,
@@ -9,9 +23,9 @@ from upgini.autofe.date import (
     DatePercentile,
     DatePercentileMethod2,
 )
-from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
+from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
 from upgini.autofe.operand import Operand
-from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
+from upgini.autofe.unary import Abs, Embeddings, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
 from upgini.autofe.vector import Mean, Sum
 ALL_OPERANDS: Dict[str, Operand] = {
@@ -39,10 +53,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
         GroupByThenAgg(name="GroupByThenMedian", agg="median"),
         GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
         GroupByThenRank(),
-        Operand(name="Combine", has_symmetry_importance=True, output_type="object", is_categorical=True),
-        Operand(name="CombineThenFreq", has_symmetry_importance=True, output_type="float"),
-        Operand(name="GroupByThenNUnique", output_type="int", is_vectorizable=True, is_grouping=True),
-        Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
+        Combine(),
+        CombineThenFreq(),
+        GroupByThenNUnique(),
+        GroupByThenFreq(),
         Sim(),
         DateDiff(),
         DateDiffType2(),
@@ -59,6 +73,11 @@ ALL_OPERANDS: Dict[str, Operand] = {
         DatePercentile(),
         DatePercentileMethod2(),
         Norm(),
+        JaroWinklerSim1(),
+        JaroWinklerSim2(),
+        LevenshteinSim(),
+        Distance(),
+        Embeddings(),
     ]
 }

upgini/autofe/binary.py CHANGED Viewed

@@ -1,7 +1,9 @@
+import abc
+from typing import Optional
+import Levenshtein
 import numpy as np
 import pandas as pd
-from numpy import dot
-from numpy.linalg import norm
+from jarowinkler import jarowinkler_similarity
 from upgini.autofe.operand import PandasOperand, VectorizableMixin
@@ -130,7 +132,29 @@ class CombineThenFreq(PandasOperand):
         self._loc(temp, value_counts)
-class Sim(PandasOperand):
+class Distance(PandasOperand):
+    name = "dist"
+    is_binary = True
+    output_type = "float"
+    is_symmetrical = True
+    has_symmetry_importance = True
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        return pd.Series(
+            1 - self.__dot(left, right) / (self.__dot(left, left) * self.__dot(right, right)), index=left.index
+        )
+    # row-wise dot product
+    def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        left = left.apply(lambda x: np.array(x))
+        right = right.apply(lambda x: np.array(x))
+        res = (left.dropna() * right.dropna()).apply(np.sum)
+        res = res.reindex(left.index.union(right.index))
+        return res
+# Left for backward compatibility
+class Sim(Distance):
     name = "sim"
     is_binary = True
     output_type = "float"
@@ -138,4 +162,71 @@ class Sim(PandasOperand):
     has_symmetry_importance = True
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
-        return dot(left, right) / (norm(left) * norm(right))
+        return 1 - super().calculate_binary(left, right)
+class StringSim(PandasOperand, abc.ABC):
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        sims = []
+        for i in left.index:
+            left_i = self._prepare_value(left.get(i))
+            right_i = self._prepare_value(right.get(i))
+            if left_i is not None and right_i is not None:
+                sims.append(self._similarity(left_i, right_i))
+            else:
+                sims.append(None)
+        return pd.Series(sims, index=left.index)
+    @abc.abstractmethod
+    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
+        pass
+    @abc.abstractmethod
+    def _similarity(self, left: str, right: str) -> float:
+        pass
+class JaroWinklerSim1(StringSim):
+    name = "sim_jw1"
+    is_binary = True
+    input_type = "string"
+    output_type = "float"
+    is_symmetrical = True
+    has_symmetry_importance = True
+    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
+        return value
+    def _similarity(self, left: str, right: str) -> float:
+        return jarowinkler_similarity(left, right)
+class JaroWinklerSim2(StringSim):
+    name = "sim_jw2"
+    is_binary = True
+    input_type = "string"
+    output_type = "float"
+    is_symmetrical = True
+    has_symmetry_importance = True
+    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
+        return value[::-1] if value is not None else None
+    def _similarity(self, left: str, right: str) -> float:
+        return jarowinkler_similarity(left, right)
+class LevenshteinSim(StringSim):
+    name = "sim_lv"
+    is_binary = True
+    input_type = "string"
+    output_type = "float"
+    is_symmetrical = True
+    has_symmetry_importance = True
+    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
+        return value
+    def _similarity(self, left: str, right: str) -> float:
+        return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))

upgini/autofe/date.py CHANGED Viewed

@@ -20,7 +20,7 @@ class DateDiffMixin(BaseModel):
         if isinstance(x, pd.DataFrame):
             return x.apply(lambda y: self._convert_to_date(y, unit), axis=1)
-        return pd.to_datetime(x, unit=unit, errors='coerce')
+        return pd.to_datetime(x, unit=unit, errors="coerce")
     def _convert_diff_to_unit(self, diff: Union[pd.Series, TimedeltaArray]) -> Union[pd.Series, TimedeltaArray]:
         if self.diff_unit == "D":
@@ -43,6 +43,8 @@ class DateDiff(PandasOperand, DateDiffMixin):
     is_binary = True
     has_symmetry_importance = True
+    replace_negative: bool = False
     def get_params(self) -> Dict[str, Optional[str]]:
         res = super().get_params()
         res.update(
@@ -50,6 +52,7 @@ class DateDiff(PandasOperand, DateDiffMixin):
                 "diff_unit": self.diff_unit,
                 "left_unit": self.left_unit,
                 "right_unit": self.right_unit,
+                "replace_negative": self.replace_negative,
             }
         )
         return res
@@ -61,7 +64,8 @@ class DateDiff(PandasOperand, DateDiffMixin):
         return self.__replace_negative(diff)
     def __replace_negative(self, x: Union[pd.DataFrame, pd.Series]):
-        x[x < 0] = None
+        if self.replace_negative:
+            x[x < 0] = None
         return x
@@ -96,18 +100,25 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
 _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len, 0)}
+_count_aggregations = ["nunique", "count"]
 class DateListDiff(PandasOperand, DateDiffMixin):
     is_binary = True
     has_symmetry_importance = True
     aggregation: str
+    replace_negative: bool = False
     def get_params(self) -> Dict[str, Optional[str]]:
         res = super().get_params()
         res.update(
             {
                 "aggregation": self.aggregation,
+                "diff_unit": self.diff_unit,
+                "left_unit": self.left_unit,
+                "right_unit": self.right_unit,
+                "replace_negative": self.replace_negative,
             }
         )
         return res
@@ -119,13 +130,19 @@ class DateListDiff(PandasOperand, DateDiffMixin):
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         left = self._convert_to_date(left, self.left_unit)
-        right = right.apply(lambda x: pd.arrays.DatetimeArray(self._convert_to_date(x, self.right_unit)))
+        right_mask = right.apply(lambda x: len(x) > 0)
+        mask = left.notna() & right.notna() & right_mask
+        right_masked = right[mask].apply(lambda x: pd.arrays.DatetimeArray(self._convert_to_date(x, self.right_unit)))
+        res_masked = pd.Series(left[mask] - right_masked.values).apply(lambda x: self._agg(self._diff(x)))
+        res = res_masked.reindex(left.index.union(right.index))
+        if self.aggregation in _count_aggregations:
+            res[~right_mask] = 0.0
-        return pd.Series(left - right.values).apply(lambda x: self._agg(self._diff(x)))
+        return res
     def _diff(self, x: TimedeltaArray):
         x = self._convert_diff_to_unit(x)
-        return x[x > 0]
+        return x[x > 0] if self.replace_negative else x
     def _agg(self, x):
         method = getattr(np, self.aggregation, None)
@@ -157,7 +174,10 @@ class DateListDiffBounded(DateListDiff):
         super().__init__(**data)
     def _agg(self, x):
-        x = x[(x >= (self.lower_bound or -np.inf)) & (x < (self.upper_bound or np.inf))]
+        x = x[
+            (x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
+            & (x < (self.upper_bound if self.upper_bound is not None else np.inf))
+        ]
         return super()._agg(x)

upgini/autofe/feature.py CHANGED Viewed

@@ -138,15 +138,17 @@ class Feature:
         if self.cached_display_name is not None and cache:
             return self.cached_display_name
+        should_stack_op = not isinstance(self.children[0], Column) if self.op.is_unary else False
+        prev_name = [self.children[0].get_op_display_name()] if should_stack_op else []
         if self.alias:
             components = ["f_autofe", self.alias]
-        elif shorten and not self.op.is_unary:
-            components = ["f_autofe", self.get_op_display_name()]
+        elif shorten and (not self.op.is_unary or should_stack_op):
+            components = ["f_autofe"] + prev_name + [self.get_op_display_name()]
         else:
-            components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + [
-                "autofe",
-                self.get_op_display_name(),
-            ]
+            components = (
+                ["f_" + "_f_".join(self.get_columns(**kwargs))] + ["autofe"] + prev_name + [self.get_op_display_name()]
+            )
         components.extend([str(self.display_index)] if self.display_index is not None else [])
         display_name = "_".join(components)
@@ -237,12 +239,19 @@ class Feature:
     @staticmethod
     def from_formula(string: str) -> Union[Column, "Feature"]:
-        if string[-1] != ")":
-            return Column(string)
         def is_trivial_char(c: str) -> bool:
             return c not in "()+-*/,"
+        if string[-1] != ")":
+            if all(is_trivial_char(c) for c in string):
+                return Column(string)
+            else:
+                raise ValueError(
+                    f"Unsupported column name: {string}. Column names should not have characters: "
+                    "['(', ')', '+', '-', '*', '/', ',']"
+                )
         def find_prev(string: str) -> int:
             if string[-1] != ")":
                 return max([(0 if is_trivial_char(c) else i + 1) for i, c in enumerate(string)])
@@ -264,8 +273,11 @@ class Feature:
             return Feature(find_op(string[: p2 - 1]), [Feature.from_formula(string[p2:-1])])
         p1 = find_prev(string[: p2 - 1])
         if string[0] == "(":
+            op = find_op(string[p2 - 1])
+            if op is None:
+                raise ValueError(f"Unsupported operand: {string[p2 - 1]}")
             return Feature(
-                find_op(string[p2 - 1]),
+                op,
                 [Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
             )
         else:
@@ -276,6 +288,8 @@ class Feature:
                     [Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
                 )
             else:
+                if string[p1 - 1] == "(":
+                    raise ValueError(f"Unsupported operand: {string[: p1 - 1]}")
                 base_features = [
                     Feature.from_formula(string[p2:-1]),
                     Feature.from_formula(string[p1 : p2 - 1]),
@@ -321,10 +335,10 @@ class FeatureGroup:
             lower_order_names = [ch.get_display_name() for ch in lower_order_children]
             if any(isinstance(f, Feature) for f in lower_order_children):
                 child_data = pd.concat(
-                    [data[main_column]] + [ch.calculate(data) for ch in lower_order_children],
+                    [data[main_column or []]] + [ch.calculate(data) for ch in lower_order_children],
                     axis=1,
                 )
-                child_data.columns = [main_column] + lower_order_names
+                child_data.columns = ([main_column] if main_column is not None else []) + lower_order_names
             else:
                 child_data = data[columns]

upgini/autofe/unary.py CHANGED Viewed

@@ -125,3 +125,10 @@ class Norm(PandasOperand):
         normalized_data = pd.Series(normalized_data[:, 0], index=data_dropna.index, name=data.name)
         normalized_data = normalized_data.reindex(data.index)
         return normalized_data
+class Embeddings(PandasOperand):
+    name = "emb"
+    is_unary = True
+    input_type = "string"
+    output_type = "vector"

upgini 1.1.312a5__py3-none-any.whl → 1.1.313a3511.dev1__py3-none-any.whl

Potentially problematic release.

upgini 1.1.312a5__py3-none-any.whl → 1.1.313a3511.dev1__py3-none-any.whl