PyPI - upgini - Versions diffs - 1.1.229a3__tar.gz → 1.1.230__tar.gz - Mend

upgini 1.1.229a3.tar.gz → 1.1.230.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (79) hide show

{upgini-1.1.229a3/src/upgini.egg-info → upgini-1.1.230}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: upgini
-Version: 1.1.229a3
+Version: 1.1.230
 Summary: Intelligent data search & enrichment for Machine Learning
 Home-page: https://upgini.com/
 Author: Upgini Developers

{upgini-1.1.229a3 → upgini-1.1.230}/setup.py RENAMED Viewed

@@ -40,7 +40,7 @@ def send_log(msg: str):
 here = Path(__file__).parent.resolve()
-version = "1.1.229a3"
+version = "1.1.230"
 try:
     send_log(f"Start setup PyLib version {version}")
     setup(

upgini-1.1.230/src/upgini/autofe/all_operands.py ADDED Viewed

@@ -0,0 +1,43 @@
+from typing import Dict
+from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
+from upgini.autofe.operand import Operand
+from upgini.autofe.unary import Abs, Log, Residual, Sqrt, Square, Sigmoid, Floor, Freq
+from upgini.autofe.binary import Min, Max, Add, Subtract, Multiply, Divide, Sim
+from upgini.autofe.vector import Mean, Sum
+# Registry of one instance of every known operand, keyed by the instance's `name` field.
+# NOTE(review): "Combine", "CombineThenFreq", "GroupByThenNUnique" and "GroupByThenFreq" are registered here as
+# bare `Operand` metadata objects, not as PandasOperand subclasses — presumably intentional; confirm with the owner.
+ALL_OPERANDS: Dict[str, Operand] = {
+    op.name: op
+    for op in [
+        Freq(),
+        Mean(),
+        Sum(),
+        Abs(),
+        Log(),
+        Sqrt(),
+        Square(),
+        Sigmoid(),
+        Floor(),
+        Residual(),
+        Min(),
+        Max(),
+        Add(),
+        Subtract(),
+        Multiply(),
+        Divide(),
+        # The GroupByThen{Min,Max,Mean,Median,Std} family shares one class and differs only in `name` and `agg`.
+        GroupByThenAgg(name="GroupByThenMin", agg="min"),
+        GroupByThenAgg(name="GroupByThenMax", agg="max"),
+        GroupByThenAgg(name="GroupByThenMean", agg="mean"),
+        GroupByThenAgg(name="GroupByThenMedian", agg="median"),
+        GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
+        GroupByThenRank(),
+        Operand(name="Combine", has_symmetry_importance=True, output_type="object", is_categorical=True),
+        Operand(name="CombineThenFreq", has_symmetry_importance=True, output_type="float"),
+        Operand(name="GroupByThenNUnique", output_type="int", is_vectorizable=True, is_grouping=True),
+        Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
+        Sim(),
+    ]
+}
+def find_op(name):
+    """Return the operand registered in ALL_OPERANDS under ``name``, or None when there is no such entry."""
+    operand = ALL_OPERANDS.get(name)
+    return operand

upgini-1.1.230/src/upgini/autofe/binary.py ADDED Viewed

@@ -0,0 +1,133 @@
+from upgini.autofe.operand import PandasOperand, VectorizableMixin
+import numpy as np
+import pandas as pd
+from numpy import dot
+from numpy.linalg import norm
+class Min(PandasOperand):
+    """Binary operand "min": element-wise minimum of its two inputs."""
+    name = "min"
+    is_binary = True
+    has_symmetry_importance = True
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        """Return ``np.minimum(left, right)``."""
+        smaller = np.minimum(left, right)
+        return smaller
+class Max(PandasOperand):
+    """Binary operand "max": element-wise maximum of its two inputs."""
+    name = "max"
+    is_binary = True
+    has_symmetry_importance = True
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        """Return ``np.maximum(left, right)``."""
+        larger = np.maximum(left, right)
+        return larger
+class Add(PandasOperand, VectorizableMixin):
+    """Binary operand "+" (alias "add") with a vectorized group form."""
+    name = "+"
+    alias = "add"
+    is_binary = True
+    has_symmetry_importance = True
+    is_vectorizable = True
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        """Return the element-wise sum ``left + right``."""
+        total = left + right
+        return total
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        """Add the main column to every other column of ``data``, aligned on the row index."""
+        main_name, other_names = self.validate_calculation(data.columns, **kwargs)
+        return data[other_names].add(data[main_name], axis=0)
+class Subtract(PandasOperand, VectorizableMixin):
+    """Binary operand "-" (alias "sub") with a vectorized group form."""
+    name = "-"
+    alias = "sub"
+    is_binary = True
+    has_symmetry_importance = True
+    is_vectorizable = True
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        """Return the element-wise difference ``left - right``."""
+        difference = left - right
+        return difference
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        """Subtract the main column from every other column of ``data``, aligned on the row index."""
+        main_name, other_names = self.validate_calculation(data.columns, **kwargs)
+        return data[other_names].sub(data[main_name], axis=0)
+class Multiply(PandasOperand, VectorizableMixin):
+    """Binary operand "*" (alias "mul") with a vectorized group form."""
+    name = "*"
+    alias = "mul"
+    is_binary = True
+    has_symmetry_importance = True
+    is_vectorizable = True
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        """Return the element-wise product ``left * right``."""
+        product = left * right
+        return product
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        """Multiply every non-main column of ``data`` by the main column, aligned on the row index."""
+        main_name, other_names = self.validate_calculation(data.columns, **kwargs)
+        return data[other_names].mul(data[main_name], axis=0)
+class Divide(PandasOperand, VectorizableMixin):
+    """Binary operand "/" (alias "div"); zero divisors are replaced by NaN before dividing."""
+    name = "/"
+    alias = "div"
+    is_binary = True
+    has_symmetry_importance = True
+    is_vectorizable = True
+    output_type = "float"
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        """Return ``left / right`` element-wise, with zeros in ``right`` turned into NaN first."""
+        divisor = right.replace(0, np.nan)
+        return left / divisor
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        """Divide every non-main column of ``data`` by the main column (zeros replaced by NaN)."""
+        main_name, other_names = self.validate_calculation(data.columns, **kwargs)
+        dividends = data[other_names]
+        divisor = data[main_name].replace(0, np.nan)
+        return dividends.div(divisor, axis=0)
+class Combine(PandasOperand):
+    """Binary operand "Combine": joins the string forms of two inputs with an underscore."""
+    name = "Combine"
+    is_binary = True
+    has_symmetry_importance = True
+    output_type = "object"
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        """Return ``"<left>_<right>"`` per row; rows where either input is missing become NaN."""
+        joined = left.astype(str) + "_" + right.astype(str)
+        # astype(str) turns missing values into text, so blank those rows out explicitly.
+        either_missing = left.isna() | right.isna()
+        joined[either_missing] = np.nan
+        return pd.Series(joined, index=left.index)
+class CombineThenFreq(PandasOperand):
+    """Binary operand "CombineThenFreq": relative frequency of each combined (left, right) value."""
+    name = "CombineThenFreq"
+    is_binary = True
+    has_symmetry_importance = True
+    output_type = "float"
+    is_distribution_dependent = True
+    input_type = "discrete"
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        """Return, per row, the share of rows carrying the same ``"<left>_<right>"`` combination.
+        Rows where either input is missing are excluded from the counts and map to NaN."""
+        temp = left.astype(str) + "_" + right.astype(str)
+        # astype(str) turns missing values into text, so blank those rows out explicitly.
+        temp[left.isna() | right.isna()] = np.nan
+        value_counts = temp.value_counts(normalize=True)
+        # The lookup result was previously computed and discarded, so the method always returned None.
+        return self._loc(temp, value_counts)
+class Sim(PandasOperand):
+    """Binary operand "sim": dot product of the inputs divided by the product of their norms."""
+    name = "sim"
+    is_binary = True
+    output_type = "float"
+    has_symmetry_importance = True
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        """Return ``dot(left, right) / (norm(left) * norm(right))``.
+        NOTE(review): for two numeric Series this is a single number rather than a Series, and a zero norm
+        is not guarded against — confirm the intended input shape with the caller."""
+        numerator = dot(left, right)
+        denominator = norm(left) * norm(right)
+        return numerator / denominator

upgini-1.1.230/src/upgini/autofe/feature.py ADDED Viewed

@@ -0,0 +1,298 @@
+import hashlib
+from typing import Dict
+import numpy as np
+import pandas as pd
+import itertools
+from upgini.autofe.operand import PandasOperand
+from upgini.autofe.all_operands import (
+    find_op,
+)
+class FeatureGroup(object):
+    """A batch of features that share one operator (and, for non-unary operators, one main child) and are
+    evaluated together through a single ``calculate_group`` call."""
+    def __init__(self, op, main_column, children):
+        self.op = op
+        self.main_column_node = main_column
+        self.children = children
+        self.data = None
+    def get_columns(self, **kwargs):
+        """Return the children's input column names in order, skipping names an earlier child already supplied."""
+        ordered = []
+        known = set()
+        for node in self.children:
+            node_columns = node.get_columns(**kwargs)
+            ordered += [name for name in node_columns if name not in known]
+            known |= set(node_columns)
+        return ordered
+    def get_display_names(self, **kwargs):
+        """Return the display name of every child, in child order."""
+        return [node.get_display_name(**kwargs) for node in self.children]
+    def calculate(self, data: pd.DataFrame, is_root=False):
+        """Evaluate the whole group on ``data`` and return a frame whose columns carry the children's display names.
+        Infinite values are replaced by NaN; the result is kept on ``self.data`` only when ``is_root`` is true.
+        Raises NotImplementedError when the operator is not a PandasOperand."""
+        if self.main_column_node is None:
+            main_column = None
+        else:
+            main_column = self.main_column_node.get_columns()[0]
+        if not isinstance(self.op, PandasOperand):
+            raise NotImplementedError(f"Unrecognized operator {self.op.name}.")
+        columns = self.get_columns()
+        new_data = self.op.calculate_group(data[columns], main_column=main_column)
+        renaming = dict(zip(columns, self.get_display_names()))
+        new_data.rename(columns=renaming, inplace=True)
+        new_data.replace([-np.inf, np.inf], np.nan, inplace=True)
+        if is_root:
+            self.data = new_data
+        return new_data
+    @staticmethod
+    def make_groups(candidates):
+        """Bundle consecutive candidates with the same (operator, main child) into FeatureGroup objects.
+        Candidates whose operator is not vectorizable are passed through unchanged. Only adjacent candidates
+        are merged (itertools.groupby), so equal keys that are not neighbours end up in separate groups."""
+        def _group_key(feature):
+            operator = feature.op
+            # Unary and vector operators are keyed by their first child, everything else by the second.
+            position = 0 if operator.is_unary or operator.is_vector else 1
+            return operator, feature.children[position]
+        result = []
+        for (op, main_child), members in itertools.groupby(candidates, _group_key):
+            members = list(members)
+            if not op.is_vectorizable:
+                result.extend(members)
+                continue
+            main_column = None if op.is_unary else main_child
+            result.append(FeatureGroup(op, main_column=main_column, children=members))
+        return result
+    def delete_data(self):
+        """Drop the cached result of this group, of its main column node (if any) and of every child."""
+        self.data = None
+        if self.main_column_node:
+            self.main_column_node.delete_data()
+        for node in self.children:
+            node.delete_data()
+class Feature(object):
+    """Expression-tree node: an operand (``op``) applied to child nodes, each a Feature or a Column."""
+    def __init__(self, op, children, data=None, display_index=None, cached_display_name=None, alias=None):
+        self.op = op
+        self.children = children
+        self.data = data
+        self.display_index = display_index
+        self.cached_display_name = cached_display_name
+        self.alias = alias
+    def set_op_params(self, params: Dict):
+        """Forward ``params`` to the operand's ``set_params`` and return self for chaining."""
+        self.op.set_params(params)
+        return self
+    def get_hash(self):
+        """Return the first 8 hex digits of the SHA-256 of the operand name joined with the child identifiers.
+        Children with a ``name`` attribute (Column) contribute that name; children without one (nested Feature
+        nodes) contribute their uncached display name — the previous ``ch.name`` access raised AttributeError."""
+        parts = [self.op.name]
+        for ch in self.children:
+            ch_name = getattr(ch, "name", None)
+            parts.append(ch.get_display_name(cache=False) if ch_name is None else ch_name)
+        return hashlib.sha256("_".join(parts).encode("utf-8")).hexdigest()[:8]
+    def set_alias(self, alias):
+        """Set the alias used by ``get_display_name`` and return self for chaining."""
+        self.alias = alias
+        return self
+    def rename_columns(self, mapping: Dict):
+        """Apply ``mapping`` to every child and invalidate the cached display name; returns self."""
+        for child in self.children:
+            child.rename_columns(mapping)
+        self.cached_display_name = None
+        return self
+    def get_column_nodes(self):
+        """Return the leaf nodes reported by all children, in child order (duplicates kept)."""
+        res = []
+        for child in self.children:
+            res.extend(child.get_column_nodes())
+        return res
+    def get_columns(self, **kwargs):
+        """Return the children's input column names in order, skipping names an earlier child already supplied."""
+        column_list = []
+        seen = set()
+        for child in self.children:
+            columns = child.get_columns(**kwargs)
+            column_list.extend([f for f in columns if f not in seen])
+            seen.update(columns)
+        return column_list
+    def delete_data(self):
+        """Drop the cached result of this node and of every child."""
+        self.data = None
+        for child in self.children:
+            child.delete_data()
+    def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs):
+        """Build the display name of this feature.
+        With ``cache`` true a previously cached name is returned as is and a freshly built one is stored.
+        An alias yields ``f_autofe_<alias>``; ``shorten`` on a non-unary operand yields ``f_autofe_<op>``;
+        otherwise the name is ``f_<col>_f_<col>..._autofe_<op>``. A display index, when set, is appended."""
+        if self.cached_display_name is not None and cache:
+            return self.cached_display_name
+        op_label = self.op.alias or self.op.name.lower()
+        if self.alias:
+            components = ["f_autofe", self.alias]
+        elif shorten and not self.op.is_unary:
+            components = ["f_autofe", op_label]
+        else:
+            components = ["f_" + "_f_".join(self.get_columns(**kwargs)), "autofe", op_label]
+        if self.display_index is not None:
+            components.append(str(self.display_index))
+        display_name = "_".join(components)
+        if cache:
+            self.cached_display_name = display_name
+        return display_name
+    def set_display_index(self, index):
+        """Set the display index, invalidate the cached display name and return self."""
+        self.display_index = index
+        self.cached_display_name = None
+        return self
+    def infer_type(self, data):
+        """Return the operand's declared output type, or the first child's inferred type when none is declared."""
+        if self.op.output_type:
+            return self.op.output_type
+        # either a symmetrical operator or group by
+        return self.children[0].infer_type(data)
+    def calculate(self, data, is_root=False):
+        """Evaluate this node on ``data`` by evaluating the children and applying the operand.
+        Infinite values are replaced by NaN unless the result dtype is "category" or "object"; the result is
+        kept on ``self.data`` only when ``is_root`` is true.
+        Raises NotImplementedError when the operator is not a PandasOperand."""
+        if not isinstance(self.op, PandasOperand):
+            raise NotImplementedError(f"Unrecognized operator {self.op.name}.")
+        if self.op.is_vector:
+            ds = [child.calculate(data) for child in self.children]
+            new_data = self.op.calculate(data=ds)
+        else:
+            d1 = self.children[0].calculate(data)
+            d2 = None if len(self.children) < 2 else self.children[1].calculate(data)
+            new_data = self.op.calculate(data=d1, left=d1, right=d2)
+        if str(new_data.dtype) not in ("category", "object"):
+            new_data = new_data.replace([-np.inf, np.inf], np.nan)
+        if is_root:
+            self.data = new_data
+        return new_data
+    @staticmethod
+    def check_xor(left, right):
+        """Return True when ``left`` and ``right`` do not reference exactly the same set of column names."""
+        def _get_all_columns(feature):
+            if isinstance(feature, Column):
+                return [feature.name]
+            res = []
+            for child in feature.children:
+                res.extend(_get_all_columns(child))
+            return res
+        # An empty symmetric difference is the same thing as set equality.
+        return set(_get_all_columns(left)) != set(_get_all_columns(right))
+    def to_formula(self, **kwargs):
+        """Render this node as text: ``(left<op>right)`` for + - * /, ``name(arg,arg,...)`` for everything else.
+        A node without children renders as ``name()`` (it previously rendered as the malformed ``name)``)."""
+        if self.op.name in ["+", "-", "*", "/"]:
+            left = self.children[0].to_formula(**kwargs)
+            right = self.children[1].to_formula(**kwargs)
+            return f"({left}{self.op.name}{right})"
+        args = ",".join(child.to_formula(**kwargs) for child in self.children)
+        return f"{self.op.name}({args})"
+    @staticmethod
+    def from_formula(string):
+        """Parse text in the syntax emitted by ``to_formula`` back into a Feature / Column tree.
+        A string that does not end with ")" is taken as a plain column name. Operator names are resolved with
+        ``find_op``. The characters ``( ) + - * / ,`` act as delimiters, so names containing them cannot be parsed.
+        NOTE(review): the index arithmetic below is order-sensitive and is kept exactly as it was."""
+        if string[-1] != ")":
+            return Column(string)
+        def is_trivial_char(c):
+            return not (c in "()+-*/,")
+        def find_prev(string):
+            # Start index of the last argument in `string`.
+            if string[-1] != ")":
+                return max([(0 if is_trivial_char(c) else i + 1) for i, c in enumerate(string)])
+            # The last argument is itself parenthesised: scan backwards to its matching "(".
+            level, pos = 0, -1
+            for i in range(len(string) - 1, -1, -1):
+                if string[i] == ")":
+                    level += 1
+                if string[i] == "(":
+                    level -= 1
+                if level == 0:
+                    pos = i
+                    break
+            # Include the operator name written directly in front of that "(".
+            while (pos > 0) and is_trivial_char(string[pos - 1]):
+                pos -= 1
+            return pos
+        p2 = find_prev(string[:-1])
+        if string[p2 - 1] == "(":
+            # Single argument: name(arg)
+            return Feature(find_op(string[: p2 - 1]), [Feature.from_formula(string[p2:-1])])
+        p1 = find_prev(string[: p2 - 1])
+        if string[0] == "(":
+            # Infix form: (left<op>right), the operator is the single character before the right operand.
+            return Feature(
+                find_op(string[p2 - 1]),
+                [Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
+            )
+        else:
+            op = find_op(string[: p1 - 1])
+            if op is not None:
+                # Two arguments: name(a,b)
+                return Feature(
+                    op,
+                    [Feature.from_formula(string[p1 : p2 - 1]), Feature.from_formula(string[p2:-1])],
+                )
+            else:
+                # More than two arguments: keep walking left until the prefix is a known operator name.
+                base_features = [
+                    Feature.from_formula(string[p2:-1]),
+                    Feature.from_formula(string[p1 : p2 - 1]),
+                ]
+                while op is None:
+                    p2 = p1
+                    p1 = find_prev(string[: p1 - 1])
+                    base_features.append(Feature.from_formula(string[p1 : p2 - 1]))
+                    op = find_op(string[: p1 - 1])
+                # Arguments were collected right to left.
+                base_features.reverse()
+                return Feature(op, base_features)
+class Column(object):
+    """Leaf node of a feature expression tree: a reference to one input column by name."""
+    def __init__(self, name, data=None, calculate_all=False):
+        self.name = name
+        self.data = data
+        self.calculate_all = calculate_all
+    def rename_columns(self, mapping: Dict):
+        """Replace the name by ``mapping[name]`` when that entry is truthy, then strip the ``f_..._<suffix>``
+        wrapping via ``_unhash``; returns self for chaining."""
+        self.name = self._unhash(mapping.get(self.name) or self.name)
+        return self
+    def _unhash(self, feature_name):
+        """Strip the leading ``f_`` and the last underscore-separated component from ``feature_name``."""
+        last_component_idx = feature_name.rfind("_")
+        if not feature_name.startswith("f_"):
+            return feature_name  # etalon feature
+        elif last_component_idx == 1:
+            return feature_name[2:]  # fully hashed name, cannot unhash
+        else:
+            return feature_name[2:last_component_idx]
+    def delete_data(self):
+        """Drop the cached column data."""
+        self.data = None
+    def get_column_nodes(self):
+        """Return this node as a one-element list."""
+        return [self]
+    def get_columns(self, **kwargs):
+        """Return the column name as a one-element list.
+        Keyword arguments are accepted and ignored: Feature.get_columns, Feature.to_formula and
+        Column.to_formula all forward their ``**kwargs`` here, which used to raise TypeError."""
+        return [self.name]
+    def infer_type(self, data):
+        """Return the dtype of this column in ``data``."""
+        return data[self.name].dtype
+    def calculate(self, data):
+        """Look this column up in ``data``, cache it on ``self.data`` and return it."""
+        self.data = data[self.name]
+        return self.data
+    def to_formula(self, **kwargs):
+        """Return the column name as text."""
+        return str(self.get_columns(**kwargs)[0])

upgini-1.1.230/src/upgini/autofe/groupby.py ADDED Viewed

@@ -0,0 +1,82 @@
+from upgini.autofe.operand import PandasOperand, VectorizableMixin
+from typing import Optional
+import pandas as pd
+class GroupByThenAgg(PandasOperand, VectorizableMixin):
+    """Grouping operand: aggregate values per group with the pandas aggregation named by ``agg`` and map
+    each group's result back onto the rows."""
+    agg: Optional[str]
+    is_vectorizable = True
+    is_grouping = True
+    is_distribution_dependent = True
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        """Aggregate ``left`` within the groups defined by ``right`` and look the result up for every row."""
+        per_group = left.groupby(right).agg(self.agg)
+        return self._loc(right, per_group)
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        """Aggregate every non-main column within the groups of the main column.
+        NOTE(review): the right merge does not carry the original row index over — confirm callers realign."""
+        key_name, value_names = self.validate_calculation(data.columns, **kwargs)
+        values = data[value_names]
+        keys = data[key_name]
+        per_group = values.groupby(keys).agg(self.agg)
+        merged = per_group.merge(keys, how="right", on=[key_name])
+        return merged[value_names]
+class GroupByThenMedian(GroupByThenAgg):
+    """GroupByThenAgg preset that aggregates with the median."""
+    name = "GroupByThenMedian"
+    # The inherited calculate_binary / calculate_group read `self.agg`; setting only `pandas_agg` left it unset.
+    agg = "median"
+    # Kept so that code reading the old attribute keeps working.
+    pandas_agg = "median"
+    is_distribution_dependent = True
+class GroupByThenRank(PandasOperand, VectorizableMixin):
+    """Grouping operand: ascending percentile rank of a value within its group."""
+    name = "GroupByThenRank"
+    is_vectorizable = True
+    is_grouping = True
+    output_type = "float"
+    is_distribution_dependent = True
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        """Rank ``left`` (pct=True) inside the groups of ``right``; rows with a missing group are left out of
+        the ranking. NOTE(review): the merge key "index" presumably relies on an unnamed index — confirm."""
+        has_group = ~right.isna()
+        ranks = left[has_group].groupby(right).rank(ascending=True, pct=True)
+        ranked = pd.DataFrame(ranks).reset_index()
+        groups = pd.DataFrame(right).reset_index()
+        merged = ranked.merge(groups, how="right", on=["index"])
+        return merged[left.name]
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        """Rank every non-main column inside the groups of the main column, as in ``calculate_binary``."""
+        key_name, value_names = self.validate_calculation(data.columns, **kwargs)
+        values = data[value_names]
+        keys = data[key_name]
+        ranks = values[~keys.isna()].groupby(keys).rank(ascending=True, pct=True)
+        ranked = ranks[value_names].reset_index()
+        merged = ranked.merge(keys.reset_index(), how="right", on=["index"])
+        return merged[value_names]
+class GroupByThenNUnique(PandasOperand, VectorizableMixin):
+    """Grouping operand: number of distinct values observed within each group."""
+    name = "GroupByThenNUnique"
+    is_vectorizable = True
+    is_grouping = True
+    output_type = "int"
+    is_distribution_dependent = True
+    input_type = "discrete"
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        """Count distinct ``left`` values per ``right`` group and look the count up for every row."""
+        distinct_per_group = left.groupby(right).nunique()
+        return self._loc(right, distinct_per_group)
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        """Count distinct values of every non-main column within the groups of the main column."""
+        key_name, value_names = self.validate_calculation(data.columns, **kwargs)
+        values = data[value_names]
+        keys = data[key_name]
+        distinct_per_group = values.groupby(keys).nunique()
+        merged = distinct_per_group.merge(keys, how="right", on=[key_name])
+        return merged[value_names]
+class GroupByThenFreq(PandasOperand):
+    """Grouping operand: relative frequency of a value among the rows of its own group."""
+    name = "GroupByThenFreq"
+    is_grouping = True
+    output_type = "float"
+    is_distribution_dependent = True
+    input_type = "discrete"
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        """For each ``right`` group, map every ``left`` value to its normalized count inside that group."""
+        def _share_within_group(members):
+            shares = members.value_counts(normalize=True)
+            return self._loc(members, shares)
+        per_row = left.groupby(right).apply(_share_within_group)
+        return pd.Series(per_row, index=right.index)

upgini-1.1.230/src/upgini/autofe/operand.py ADDED Viewed

@@ -0,0 +1,70 @@
+from pydantic import BaseModel
+from typing import Dict, List, Optional, Tuple
+import abc
+import pandas as pd
+import numpy as np
+class Operand(BaseModel):
+    """Pydantic model describing one operand: its name, arity/behaviour flags, type tags and parameters."""
+    # Identifier of the operand.
+    name: str
+    # Optional alternative label for the operand.
+    alias: Optional[str]
+    # Arity / behaviour flags; all default to False.
+    is_unary: bool = False
+    has_symmetry_importance: bool = False
+    # Optional free-form tags for the expected input and produced output.
+    input_type: Optional[str]
+    output_type: Optional[str]
+    is_categorical: bool = False
+    is_vectorizable: bool = False
+    is_grouping: bool = False
+    is_binary: bool = False
+    is_vector: bool = False
+    is_distribution_dependent: bool = False
+    # Optional operand parameters, stored as string key/value pairs.
+    params: Optional[Dict[str, str]]
+    def set_params(self, params: Dict[str, str]):
+        """Store ``params`` on the operand and return self for chaining."""
+        self.params = params
+        return self
+    def get_params(self) -> Optional[Dict[str, str]]:
+        """Return the stored parameters (whatever ``params`` currently holds)."""
+        return self.params
+# Name of the keyword argument that VectorizableMixin.validate_calculation reads the group column from.
+MAIN_COLUMN = "main_column"
+class PandasOperand(Operand, abc.ABC):
+    """Operand that can be evaluated on pandas objects; subclasses override the ``calculate_*`` hooks."""
+    def calculate(self, **kwargs) -> pd.Series:
+        """Dispatch on the operand flags: unary operands get ``data``, binary or grouping operands get
+        ``left`` and ``right``, everything else is treated as a vector operand and gets ``data``."""
+        if self.is_unary:
+            return self.calculate_unary(kwargs["data"])
+        if self.is_binary or self.is_grouping:
+            return self.calculate_binary(kwargs["left"], kwargs["right"])
+        return self.calculate_vector(kwargs["data"])
+    def calculate_unary(self, data: pd.Series) -> pd.Series:
+        """Hook for unary operands; the base implementation returns None."""
+        return None
+    def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        """Hook for binary and grouping operands; the base implementation returns None."""
+        return None
+    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
+        """Hook for vector operands; the base implementation returns None."""
+        return None
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        """Hook for vectorized evaluation; the base implementation always raises RuntimeError."""
+        if self.is_vectorizable:
+            raise RuntimeError(f"Unimplemented calculate_group for operator {self.name}")
+        raise RuntimeError(f"Cannot apply calculate_group: operator {self.name} is not vectorizable")
+    def _loc(self, df_to, df_from):
+        """Look every value of ``df_to`` up in ``df_from`` by label.
+        A NaN -> NaN entry is written into ``df_from`` first (in place) so that missing keys resolve to NaN."""
+        lookup = df_from
+        lookup.loc[np.nan] = np.nan
+        keys = df_to.fillna(np.nan)
+        return keys.apply(lambda key: lookup.loc[key])
+class VectorizableMixin(Operand):
+    """Mixin with the argument check shared by the ``calculate_group`` implementations."""
+    def validate_calculation(self, input_columns: List[str], **kwargs) -> Tuple[str, List[str]]:
+        """Return ``(group_column, value_columns)``: the column named by the ``main_column`` keyword and
+        every other entry of ``input_columns``. Raises ValueError when that keyword is missing or falsy."""
+        group_column = kwargs.get(MAIN_COLUMN)
+        if not group_column:
+            raise ValueError(f"Expected argument {MAIN_COLUMN} for grouping operator {self.name} not found")
+        value_columns = []
+        for candidate in input_columns:
+            if candidate != group_column:
+                value_columns.append(candidate)
+        return group_column, value_columns

upgini-1.1.230/src/upgini/autofe/unary.py ADDED Viewed

@@ -0,0 +1,105 @@
+from upgini.autofe.operand import PandasOperand
+import numpy as np
+import pandas as pd
+class Abs(PandasOperand):
+    """Unary operand "abs": absolute value."""
+    name = "abs"
+    is_unary = True
+    is_vectorizable = True
+    def calculate_unary(self, data: pd.Series) -> pd.Series:
+        """Return the absolute value of every element."""
+        magnitudes = data.abs()
+        return magnitudes
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        """Return the absolute value of every cell; keyword arguments are ignored."""
+        magnitudes = data.abs()
+        return magnitudes
+class Log(PandasOperand):
+    """Unary operand "log": natural logarithm of the absolute value, with zeros treated as missing."""
+    name = "log"
+    is_unary = True
+    is_vectorizable = True
+    output_type = "float"
+    def calculate_unary(self, data: pd.Series) -> pd.Series:
+        """Return ``log(|x|)`` per element; zeros become NaN first."""
+        nonzero = data.replace(0, np.nan)
+        return np.log(np.abs(nonzero))
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        """Return ``log(|x|)`` per cell; zeros become NaN first. Keyword arguments are ignored."""
+        nonzero = data.replace(0, np.nan)
+        return np.log(nonzero.abs())
+class Sqrt(PandasOperand):
+    """Unary operand "sqrt": square root of the absolute value."""
+    name = "sqrt"
+    is_unary = True
+    is_vectorizable = True
+    output_type = "float"
+    def calculate_unary(self, data: pd.Series) -> pd.Series:
+        """Return ``sqrt(|x|)`` per element."""
+        magnitudes = np.abs(data)
+        return np.sqrt(magnitudes)
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        """Return ``sqrt(|x|)`` per cell; keyword arguments are ignored."""
+        magnitudes = data.abs()
+        return np.sqrt(magnitudes)
+class Square(PandasOperand):
+    """Unary operand "square": element-wise square."""
+    name = "square"
+    is_unary = True
+    is_vectorizable = True
+    def calculate_unary(self, data: pd.Series) -> pd.Series:
+        """Return ``np.square`` of every element."""
+        squared = np.square(data)
+        return squared
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        """Return ``np.square`` of every cell; keyword arguments are ignored."""
+        squared = np.square(data)
+        return squared
+class Sigmoid(PandasOperand):
+    """Unary operand "sigmoid": the logistic function ``1 / (1 + exp(-x))``."""
+    name = "sigmoid"
+    is_unary = True
+    is_vectorizable = True
+    output_type = "float"
+    def calculate_unary(self, data: pd.Series) -> pd.Series:
+        """Return ``1 / (1 + exp(-x))`` per element."""
+        denominator = 1 + np.exp(-data)
+        return 1 / denominator
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        """Return ``1 / (1 + exp(-x))`` per cell; keyword arguments are ignored."""
+        denominator = 1 + np.exp(-data)
+        return 1 / denominator
+class Floor(PandasOperand):
+    """Unary operand "floor": round down to the nearest whole number.
+    NOTE(review): ``output_type`` is declared "int" while ``np.floor`` keeps a floating dtype — confirm."""
+    name = "floor"
+    is_unary = True
+    is_vectorizable = True
+    output_type = "int"
+    input_type = "continuous"
+    def calculate_unary(self, data: pd.Series) -> pd.Series:
+        """Return ``np.floor`` of every element."""
+        floored = np.floor(data)
+        return floored
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        """Return ``np.floor`` of every cell; keyword arguments are ignored."""
+        floored = np.floor(data)
+        return floored
+class Residual(PandasOperand):
+    """Unary operand "residual": what is left after subtracting the floor, i.e. ``x - floor(x)``."""
+    name = "residual"
+    is_unary = True
+    is_vectorizable = True
+    input_type = "continuous"
+    def calculate_unary(self, data: pd.Series) -> pd.Series:
+        """Return ``x - floor(x)`` per element."""
+        whole_part = np.floor(data)
+        return data - whole_part
+    def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
+        """Return ``x - floor(x)`` per cell; keyword arguments are ignored."""
+        whole_part = np.floor(data)
+        return data - whole_part
+class Freq(PandasOperand):
+    """Unary operand "freq": relative frequency of each value within the column."""
+    name = "freq"
+    is_unary = True
+    output_type = "float"
+    is_distribution_dependent = True
+    input_type = "discrete"
+    def calculate_unary(self, data: pd.Series) -> pd.Series:
+        """Map every element to the normalized count of its value in ``data``."""
+        shares = data.value_counts(normalize=True)
+        return self._loc(data, shares)

upgini-1.1.230/src/upgini/autofe/vector.py ADDED Viewed

@@ -0,0 +1,20 @@
+from typing import List
+import pandas as pd
+from upgini.autofe.operand import PandasOperand
+class Mean(PandasOperand):
+    """Vector operand "mean": row-wise mean over several series, with missing values counted as 0."""
+    name = "mean"
+    output_type = "float"
+    is_vector = True
+    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
+        """Stack the series as columns, fill missing cells with 0 and return the mean of each row."""
+        table = pd.DataFrame(data).T
+        filled = table.fillna(0)
+        return filled.mean(axis=1)
+class Sum(PandasOperand):
+    """Vector operand "sum": row-wise sum over several series, with missing values counted as 0."""
+    name = "sum"
+    is_vector = True
+    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
+        """Stack the series as columns, fill missing cells with 0 and return the sum of each row."""
+        table = pd.DataFrame(data).T
+        filled = table.fillna(0)
+        return filled.sum(axis=1)

{upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/features_enricher.py RENAMED Viewed

@@ -904,9 +904,8 @@ class FeaturesEnricher(TransformerMixin):
                     model_task_type = self.model_task_type or define_task(y_sorted, self.logger, silent=True)
                     _cv = cv or self.cv
-                    self.logger.info(f"CV: {_cv}, groups: {groups}")
                     if groups is None and _cv == CVType.group_k_fold:
-                        self.logger.info(f"Replacing group_k_fold with k_fold as no groups were found")
+                        self.logger.info("Replacing group_k_fold with k_fold as no groups were found")
                         _cv = CVType.k_fold
                     if not isinstance(_cv, BaseCrossValidator):
                         date_column = self._get_date_column(search_keys)

{upgini-1.1.229a3 → upgini-1.1.230}/src/upgini/metadata.py RENAMED Viewed

@@ -68,7 +68,7 @@ class SearchKey(Enum):
     @staticmethod
     def personal_keys() -> List["SearchKey"]:
         return [SearchKey.EMAIL, SearchKey.HEM, SearchKey.IP, SearchKey.PHONE]
     @staticmethod
     def from_meaning_type(meaning_type: FileColumnMeaningType) -> "SearchKey":
         if meaning_type == FileColumnMeaningType.EMAIL:

upgini-1.1.230/src/upgini/sampler/__init__.py ADDED Viewed

File without changes

{upgini-1.1.229a3 → upgini-1.1.230/src/upgini.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: upgini
-Version: 1.1.229a3
+Version: 1.1.230
 Summary: Intelligent data search & enrichment for Machine Learning
 Home-page: https://upgini.com/
 Author: Upgini Developers

{upgini-1.1.229a3 → upgini-1.1.230}/src/upgini.egg-info/SOURCES.txt RENAMED Viewed

@@ -20,6 +20,14 @@ src/upgini.egg-info/requires.txt
 src/upgini.egg-info/top_level.txt
 src/upgini/ads_management/__init__.py
 src/upgini/ads_management/ads_manager.py
+src/upgini/autofe/__init__.py
+src/upgini/autofe/all_operands.py
+src/upgini/autofe/binary.py
+src/upgini/autofe/feature.py
+src/upgini/autofe/groupby.py
+src/upgini/autofe/operand.py
+src/upgini/autofe/unary.py
+src/upgini/autofe/vector.py
 src/upgini/data_source/__init__.py
 src/upgini/data_source/data_source_publisher.py
 src/upgini/mdc/__init__.py