PyPI - upgini - Versions diffs - 1.1.312__py3-none-any.whl → 1.1.312a2__py3-none-any.whl - Mend

upgini 1.1.312__py3-none-any.whl → 1.1.312a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (21) hide show

upgini/__about__.py +1 -1
upgini/autofe/all_operands.py +7 -26
upgini/autofe/binary.py +4 -95
upgini/autofe/date.py +3 -16
upgini/autofe/feature.py +11 -25
upgini/autofe/unary.py +0 -7
upgini/dataset.py +30 -385
upgini/features_enricher.py +276 -120
upgini/metadata.py +16 -1
upgini/normalizer/normalize_utils.py +203 -0
upgini/utils/country_utils.py +16 -0
upgini/utils/datetime_utils.py +34 -15
upgini/utils/email_utils.py +19 -5
upgini/utils/ip_utils.py +100 -1
upgini/utils/phone_utils.py +345 -0
upgini/utils/postal_code_utils.py +34 -0
{upgini-1.1.312.dist-info → upgini-1.1.312a2.dist-info}/METADATA +1 -3
{upgini-1.1.312.dist-info → upgini-1.1.312a2.dist-info}/RECORD +20 -20
{upgini-1.1.312.dist-info → upgini-1.1.312a2.dist-info}/WHEEL +1 -1
upgini/normalizer/phone_normalizer.py +0 -340
{upgini-1.1.312.dist-info → upgini-1.1.312a2.dist-info}/licenses/LICENSE +0 -0

upgini/__about__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "1.1.312"
1	+ __version__ = "1.1.312a2"

upgini/autofe/all_operands.py CHANGED Viewed

@@ -1,20 +1,6 @@
 from typing import Dict
-from upgini.autofe.binary import (
-    Add,
-    Combine,
-    CombineThenFreq,
-    Distance,
-    Divide,
-    JaroWinklerSim1,
-    JaroWinklerSim2,
-    LevenshteinSim,
-    Max,
-    Min,
-    Multiply,
-    Sim,
-    Subtract,
-)
+from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
 from upgini.autofe.date import (
     DateDiff,
     DateDiffType2,
@@ -23,9 +9,9 @@ from upgini.autofe.date import (
     DatePercentile,
     DatePercentileMethod2,
 )
-from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
+from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
 from upgini.autofe.operand import Operand
-from upgini.autofe.unary import Abs, Embeddings, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
+from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
 from upgini.autofe.vector import Mean, Sum
 ALL_OPERANDS: Dict[str, Operand] = {
@@ -53,10 +39,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
         GroupByThenAgg(name="GroupByThenMedian", agg="median"),
         GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
         GroupByThenRank(),
-        Combine(),
-        CombineThenFreq(),
-        GroupByThenNUnique(),
-        GroupByThenFreq(),
+        Operand(name="Combine", has_symmetry_importance=True, output_type="object", is_categorical=True),
+        Operand(name="CombineThenFreq", has_symmetry_importance=True, output_type="float"),
+        Operand(name="GroupByThenNUnique", output_type="int", is_vectorizable=True, is_grouping=True),
+        Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
         Sim(),
         DateDiff(),
         DateDiffType2(),
@@ -73,11 +59,6 @@ ALL_OPERANDS: Dict[str, Operand] = {
         DatePercentile(),
         DatePercentileMethod2(),
         Norm(),
-        JaroWinklerSim1(),
-        JaroWinklerSim2(),
-        LevenshteinSim(),
-        Distance(),
-        Embeddings(),
     ]
 }

upgini/autofe/binary.py CHANGED Viewed

@@ -1,9 +1,7 @@
-import abc
-from typing import Optional
-import Levenshtein
 import numpy as np
 import pandas as pd
-from jarowinkler import jarowinkler_similarity
+from numpy import dot
+from numpy.linalg import norm
 from upgini.autofe.operand import PandasOperand, VectorizableMixin
@@ -132,29 +130,7 @@ class CombineThenFreq(PandasOperand):
         self._loc(temp, value_counts)
-class Distance(PandasOperand):
-    name = "dist"
-    is_binary = True
-    output_type = "float"
-    is_symmetrical = True
-    has_symmetry_importance = True
-    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
-        return pd.Series(
-            1 - self.__dot(left, right) / (self.__dot(left, left) * self.__dot(right, right)), index=left.index
-        )
-    # row-wise dot product
-    def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
-        left = left.apply(lambda x: np.array(x))
-        right = right.apply(lambda x: np.array(x))
-        res = (left.dropna() * right.dropna()).apply(np.sum)
-        res = res.reindex(left.index.union(right.index))
-        return res
-# Left for backward compatibility
-class Sim(Distance):
+class Sim(PandasOperand):
     name = "sim"
     is_binary = True
     output_type = "float"
@@ -162,71 +138,4 @@ class Sim(Distance):
     has_symmetry_importance = True
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
-        return 1 - super().calculate_binary(left, right)
-class StringSim(PandasOperand, abc.ABC):
-    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
-        sims = []
-        for i in left.index:
-            left_i = self._prepare_value(left.get(i))
-            right_i = self._prepare_value(right.get(i))
-            if left_i is not None and right_i is not None:
-                sims.append(self._similarity(left_i, right_i))
-            else:
-                sims.append(None)
-        return pd.Series(sims, index=left.index)
-    @abc.abstractmethod
-    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
-        pass
-    @abc.abstractmethod
-    def _similarity(self, left: str, right: str) -> float:
-        pass
-class JaroWinklerSim1(StringSim):
-    name = "sim_jw1"
-    is_binary = True
-    input_type = "string"
-    output_type = "float"
-    is_symmetrical = True
-    has_symmetry_importance = True
-    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
-        return value
-    def _similarity(self, left: str, right: str) -> float:
-        return jarowinkler_similarity(left, right)
-class JaroWinklerSim2(StringSim):
-    name = "sim_jw2"
-    is_binary = True
-    input_type = "string"
-    output_type = "float"
-    is_symmetrical = True
-    has_symmetry_importance = True
-    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
-        return value[::-1] if value is not None else None
-    def _similarity(self, left: str, right: str) -> float:
-        return jarowinkler_similarity(left, right)
-class LevenshteinSim(StringSim):
-    name = "sim_lv"
-    is_binary = True
-    input_type = "string"
-    output_type = "float"
-    is_symmetrical = True
-    has_symmetry_importance = True
-    def _prepare_value(self, value: Optional[str]) -> Optional[str]:
-        return value
-    def _similarity(self, left: str, right: str) -> float:
-        return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))
+        return dot(left, right) / (norm(left) * norm(right))

upgini/autofe/date.py CHANGED Viewed

@@ -43,8 +43,6 @@ class DateDiff(PandasOperand, DateDiffMixin):
     is_binary = True
     has_symmetry_importance = True
-    replace_negative: bool = False
     def get_params(self) -> Dict[str, Optional[str]]:
         res = super().get_params()
         res.update(
@@ -52,7 +50,6 @@ class DateDiff(PandasOperand, DateDiffMixin):
                 "diff_unit": self.diff_unit,
                 "left_unit": self.left_unit,
                 "right_unit": self.right_unit,
-                "replace_negative": self.replace_negative,
             }
         )
         return res
@@ -64,8 +61,7 @@ class DateDiff(PandasOperand, DateDiffMixin):
         return self.__replace_negative(diff)
     def __replace_negative(self, x: Union[pd.DataFrame, pd.Series]):
-        if self.replace_negative:
-            x[x < 0] = None
+        x[x < 0] = None
         return x
@@ -105,19 +101,13 @@ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len,
 class DateListDiff(PandasOperand, DateDiffMixin):
     is_binary = True
     has_symmetry_importance = True
     aggregation: str
-    replace_negative: bool = False
     def get_params(self) -> Dict[str, Optional[str]]:
         res = super().get_params()
         res.update(
             {
                 "aggregation": self.aggregation,
-                "diff_unit": self.diff_unit,
-                "left_unit": self.left_unit,
-                "right_unit": self.right_unit,
-                "replace_negative": self.replace_negative,
             }
         )
         return res
@@ -135,7 +125,7 @@ class DateListDiff(PandasOperand, DateDiffMixin):
     def _diff(self, x: TimedeltaArray):
         x = self._convert_diff_to_unit(x)
-        return x[x > 0] if self.replace_negative else x
+        return x[x > 0]
     def _agg(self, x):
         method = getattr(np, self.aggregation, None)
@@ -167,10 +157,7 @@ class DateListDiffBounded(DateListDiff):
         super().__init__(**data)
     def _agg(self, x):
-        x = x[
-            (x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
-            & (x < (self.upper_bound if self.upper_bound is not None else np.inf))
-        ]
+        x = x[(x >= (self.lower_bound or -np.inf)) & (x < (self.upper_bound or np.inf))]
         return super()._agg(x)

upgini/autofe/feature.py CHANGED Viewed

@@ -138,17 +138,15 @@ class Feature:
         if self.cached_display_name is not None and cache:
             return self.cached_display_name
-        should_stack_op = not isinstance(self.children[0], Column) if self.op.is_unary else False
-        prev_name = [self.children[0].get_op_display_name()] if should_stack_op else []
         if self.alias:
             components = ["f_autofe", self.alias]
-        elif shorten and (not self.op.is_unary or should_stack_op):
-            components = ["f_autofe"] + prev_name + [self.get_op_display_name()]
+        elif shorten and not self.op.is_unary:
+            components = ["f_autofe", self.get_op_display_name()]
         else:
-            components = (
-                ["f_" + "_f_".join(self.get_columns(**kwargs))] + ["autofe"] + prev_name + [self.get_op_display_name()]
-            )
+            components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + [
+                "autofe",
+                self.get_op_display_name(),
+            ]
         components.extend([str(self.display_index)] if self.display_index is not None else [])
         display_name = "_".join(components)
@@ -239,19 +237,12 @@ class Feature:
     @staticmethod
     def from_formula(string: str) -> Union[Column, "Feature"]:
+        if string[-1] != ")":
+            return Column(string)
         def is_trivial_char(c: str) -> bool:
             return c not in "()+-*/,"
-        if string[-1] != ")":
-            if all(is_trivial_char(c) for c in string):
-                return Column(string)
-            else:
-                raise ValueError(
-                    f"Unsupported column name: {string}. Column names should not have characters: "
-                    "['(', ')', '+', '-', '*', '/', ',']"
-                )
         def find_prev(string: str) -> int:
             if string[-1] != ")":
                 return max([(0 if is_trivial_char(c) else i + 1) for i, c in enumerate(string)])
@@ -273,11 +264,8 @@ class Feature:
             return Feature(find_op(string[: p2 - 1]), [Feature.from_formula(string[p2:-1])])
         p1 = find_prev(string[: p2 - 1])
         if string[0] == "(":
-            op = find_op(string[p2 - 1])
-            if op is None:
-                raise ValueError(f"Unsupported operand: {string[p2 - 1]}")
             return Feature(
-                op,
+                find_op(string[p2 - 1]),
                 [Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
             )
         else:
@@ -288,8 +276,6 @@ class Feature:
                     [Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
                 )
             else:
-                if string[p1 - 1] == "(":
-                    raise ValueError(f"Unsupported operand: {string[: p1 - 1]}")
                 base_features = [
                     Feature.from_formula(string[p2:-1]),
                     Feature.from_formula(string[p1 : p2 - 1]),
@@ -335,10 +321,10 @@ class FeatureGroup:
             lower_order_names = [ch.get_display_name() for ch in lower_order_children]
             if any(isinstance(f, Feature) for f in lower_order_children):
                 child_data = pd.concat(
-                    [data[main_column or []]] + [ch.calculate(data) for ch in lower_order_children],
+                    [data[main_column]] + [ch.calculate(data) for ch in lower_order_children],
                     axis=1,
                 )
-                child_data.columns = ([main_column] if main_column is not None else []) + lower_order_names
+                child_data.columns = [main_column] + lower_order_names
             else:
                 child_data = data[columns]

upgini/autofe/unary.py CHANGED Viewed

@@ -125,10 +125,3 @@ class Norm(PandasOperand):
         normalized_data = pd.Series(normalized_data[:, 0], index=data_dropna.index, name=data.name)
         normalized_data = normalized_data.reindex(data.index)
         return normalized_data
-class Embeddings(PandasOperand):
-    name = "emb"
-    is_unary = True
-    input_type = "string"
-    output_type = "vector"

upgini 1.1.312__py3-none-any.whl → 1.1.312a2__py3-none-any.whl

Potentially problematic release.

upgini 1.1.312__py3-none-any.whl → 1.1.312a2__py3-none-any.whl