PyPI - upgini - Versions diffs - 1.2.63__py3-none-any.whl → 1.2.65a3818.dev5__py3-none-any.whl - Mend

upgini 1.2.63__py3-none-any.whl → 1.2.65a3818.dev5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (23)

upgini/__about__.py +1 -1
upgini/autofe/all_operands.py +2 -2
upgini/autofe/binary.py +11 -11
upgini/autofe/date.py +6 -6
upgini/autofe/feature.py +6 -6
upgini/autofe/groupby.py +6 -6
upgini/autofe/{operand.py → operator.py} +13 -11
upgini/autofe/timeseries/__init__.py +23 -0
upgini/autofe/timeseries/base.py +105 -0
upgini/autofe/timeseries/cross.py +130 -0
upgini/autofe/timeseries/delta.py +119 -0
upgini/autofe/timeseries/lag.py +68 -0
upgini/autofe/timeseries/roll.py +92 -0
upgini/autofe/timeseries/trend.py +61 -0
upgini/autofe/timeseries/volatility.py +259 -0
upgini/autofe/unary.py +11 -11
upgini/autofe/vector.py +4 -200
upgini/features_enricher.py +2 -2
upgini/utils/sort.py +4 -2
{upgini-1.2.63.dist-info → upgini-1.2.65a3818.dev5.dist-info}/METADATA +1 -1
{upgini-1.2.63.dist-info → upgini-1.2.65a3818.dev5.dist-info}/RECORD +23 -15
{upgini-1.2.63.dist-info → upgini-1.2.65a3818.dev5.dist-info}/WHEEL +1 -1
{upgini-1.2.63.dist-info → upgini-1.2.65a3818.dev5.dist-info}/licenses/LICENSE +0 -0

upgini/__about__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "1.2.63"
1	+ __version__ = "1.2.65a3818.dev5"

upgini/autofe/all_operands.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from upgini.autofe.operand import OperandRegistry
+from upgini.autofe.operator import OperatorRegistry
 from upgini.autofe.unary import *  # noqa
 from upgini.autofe.binary import *  # noqa
 from upgini.autofe.groupby import *  # noqa
@@ -7,4 +7,4 @@ from upgini.autofe.vector import *  # noqa
 def find_op(name):
-    return OperandRegistry.get_operand(name)
+    return OperatorRegistry.get_operator(name)

upgini/autofe/binary.py CHANGED Viewed

@@ -5,10 +5,10 @@ import numpy as np
 import pandas as pd
 from jarowinkler import jarowinkler_similarity
-from upgini.autofe.operand import PandasOperand, VectorizableMixin
+from upgini.autofe.operator import PandasOperator, VectorizableMixin
-class Min(PandasOperand):
+class Min(PandasOperator):
     name: str = "min"
     is_binary: bool = True
     is_symmetrical: bool = True
@@ -18,7 +18,7 @@ class Min(PandasOperand):
         return np.minimum(left, right)
-class Max(PandasOperand):
+class Max(PandasOperator):
     name: str = "max"
     is_binary: bool = True
     is_symmetrical: bool = True
@@ -28,7 +28,7 @@ class Max(PandasOperand):
         return np.maximum(left, right)
-class Add(PandasOperand, VectorizableMixin):
+class Add(PandasOperator, VectorizableMixin):
     name: str = "+"
     alias: str = "add"
     is_binary: bool = True
@@ -47,7 +47,7 @@ class Add(PandasOperand, VectorizableMixin):
         return d1.add(d2, axis=0)
-class Subtract(PandasOperand, VectorizableMixin):
+class Subtract(PandasOperator, VectorizableMixin):
     name: str = "-"
     alias: str = "sub"
     is_binary: bool = True
@@ -66,7 +66,7 @@ class Subtract(PandasOperand, VectorizableMixin):
         return d1.sub(d2, axis=0)
-class Multiply(PandasOperand, VectorizableMixin):
+class Multiply(PandasOperator, VectorizableMixin):
     name: str = "*"
     alias: str = "mul"
     is_binary: bool = True
@@ -85,7 +85,7 @@ class Multiply(PandasOperand, VectorizableMixin):
         return d1.mul(d2, axis=0)
-class Divide(PandasOperand, VectorizableMixin):
+class Divide(PandasOperator, VectorizableMixin):
     name: str = "/"
     alias: str = "div"
     is_binary: bool = True
@@ -104,7 +104,7 @@ class Divide(PandasOperand, VectorizableMixin):
         return d1.div(d2.replace(0, np.nan), axis=0)
-class Combine(PandasOperand):
+class Combine(PandasOperator):
     name: str = "Combine"
     is_binary: bool = True
     has_symmetry_importance: bool = True
@@ -116,7 +116,7 @@ class Combine(PandasOperand):
         return pd.Series(temp, index=left.index)
-class CombineThenFreq(PandasOperand):
+class CombineThenFreq(PandasOperator):
     name: str = "CombineThenFreq"
     is_binary: bool = True
     is_symmetrical: bool = True
@@ -132,7 +132,7 @@ class CombineThenFreq(PandasOperand):
         self._loc(temp, value_counts)
-class Distance(PandasOperand):
+class Distance(PandasOperator):
     name: str = "dist"
     is_binary: bool = True
     output_type: Optional[str] = "float"
@@ -170,7 +170,7 @@ class Sim(Distance):
         return 1 - super().calculate_binary(left, right)
-class StringSim(PandasOperand, abc.ABC):
+class StringSim(PandasOperator, abc.ABC):
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         sims = []
         for i in left.index:

upgini/autofe/date.py CHANGED Viewed

@@ -7,7 +7,7 @@ import pandas as pd
 from pandas.core.arrays.timedeltas import TimedeltaArray
 from pydantic import BaseModel, __version__ as pydantic_version
-from upgini.autofe.operand import PandasOperand, ParametrizedOperand
+from upgini.autofe.operator import PandasOperator, ParametrizedOperator
 def get_pydantic_version():
@@ -43,7 +43,7 @@ class DateDiffMixin(BaseModel):
             raise Exception(f"Unsupported difference unit: {self.diff_unit}")
-class DateDiff(PandasOperand, DateDiffMixin):
+class DateDiff(PandasOperator, DateDiffMixin):
     name: str = "date_diff"
     alias: Optional[str] = "date_diff_type1"
     is_binary: bool = True
@@ -78,7 +78,7 @@ class DateDiff(PandasOperand, DateDiffMixin):
         return x
-class DateDiffType2(PandasOperand, DateDiffMixin):
+class DateDiffType2(PandasOperator, DateDiffMixin):
     name: str = "date_diff_type2"
     is_binary: bool = True
     has_symmetry_importance: bool = True
@@ -112,7 +112,7 @@ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len,
 _count_aggregations = ["nunique", "count"]
-class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
+class DateListDiff(PandasOperator, DateDiffMixin, ParametrizedOperator):
     is_binary: bool = True
     has_symmetry_importance: bool = True
@@ -183,7 +183,7 @@ class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
         return method(x) if len(x) > 0 else default
-class DateListDiffBounded(DateListDiff, ParametrizedOperand):
+class DateListDiffBounded(DateListDiff, ParametrizedOperator):
     lower_bound: Optional[int] = None
     upper_bound: Optional[int] = None
@@ -217,7 +217,7 @@ class DateListDiffBounded(DateListDiff, ParametrizedOperand):
         return super()._agg(x)
-class DatePercentileBase(PandasOperand, abc.ABC):
+class DatePercentileBase(PandasOperator, abc.ABC):
     is_binary: bool = True
     output_type: Optional[str] = "float"

upgini/autofe/feature.py CHANGED Viewed

@@ -7,7 +7,7 @@ import pandas as pd
 from pandas._typing import DtypeObj
 from upgini.autofe.all_operands import find_op
-from upgini.autofe.operand import Operand, PandasOperand
+from upgini.autofe.operator import Operator, PandasOperator
 class Column:
@@ -65,7 +65,7 @@ class Column:
 class Feature:
     def __init__(
         self,
-        op: Operand,
+        op: Operator,
         children: List[Union[Column, "Feature"]],
         data: Optional[pd.DataFrame] = None,
         display_index: Optional[str] = None,
@@ -188,7 +188,7 @@ class Feature:
             return self.children[0].infer_type(data)
     def calculate(self, data: pd.DataFrame, is_root=False) -> Union[pd.Series, pd.DataFrame]:
-        if isinstance(self.op, PandasOperand):
+        if isinstance(self.op, PandasOperator):
             if self.op.is_vector:
                 ds = [child.calculate(data) for child in self.children]
                 new_data = self.op.calculate(data=ds)
@@ -324,7 +324,7 @@ class Feature:
 class FeatureGroup:
     def __init__(
-        self, op: Operand, main_column: Optional[Union[Column, Feature]], children: List[Union[Column, Feature]]
+        self, op: Operator, main_column: Optional[Union[Column, Feature]], children: List[Union[Column, Feature]]
     ):
         self.op = op
         self.main_column_node = main_column
@@ -345,7 +345,7 @@ class FeatureGroup:
         return names
     def calculate(self, data: pd.DataFrame, is_root=False) -> pd.DataFrame:
-        if isinstance(self.op, PandasOperand):
+        if isinstance(self.op, PandasOperator):
             main_column = None if self.main_column_node is None else self.main_column_node.get_display_name()
             lower_order_children = []
             if self.main_column_node is not None:
@@ -378,7 +378,7 @@ class FeatureGroup:
     def make_groups(candidates: List[Feature]) -> List[Union[Feature, "FeatureGroup"]]:
         grouped_features = []
-        def groupby_func(f: Feature) -> Tuple[Operand, Union[Column, Feature]]:
+        def groupby_func(f: Feature) -> Tuple[Operator, Union[Column, Feature]]:
             return (f.op, f.children[0 if not f.op.is_vectorizable else f.op.group_index])
         for op_child, features in itertools.groupby(candidates, groupby_func):

upgini/autofe/groupby.py CHANGED Viewed

@@ -2,13 +2,13 @@ from typing import Optional
 import pandas as pd
-from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
+from upgini.autofe.operator import PandasOperator, ParametrizedOperator, VectorizableMixin
 class GroupByThenAgg(
-    PandasOperand,
+    PandasOperator,
     VectorizableMixin,
-    ParametrizedOperand,
+    ParametrizedOperator,
 ):
     agg: Optional[str]
     is_vectorizable: bool = True
@@ -39,7 +39,7 @@ class GroupByThenAgg(
         return temp.merge(d2, how="right", on=[group_column])[value_columns]
-class GroupByThenRank(PandasOperand, VectorizableMixin):
+class GroupByThenRank(PandasOperator, VectorizableMixin):
     name: str = "GroupByThenRank"
     is_vectorizable: bool = True
     is_grouping: bool = True
@@ -58,7 +58,7 @@ class GroupByThenRank(PandasOperand, VectorizableMixin):
         return temp.merge(d2.reset_index(), how="right", on=["index"])[value_columns]
-class GroupByThenNUnique(PandasOperand, VectorizableMixin):
+class GroupByThenNUnique(PandasOperator, VectorizableMixin):
     name: str = "GroupByThenNUnique"
     is_vectorizable: bool = True
     is_grouping: bool = True
@@ -78,7 +78,7 @@ class GroupByThenNUnique(PandasOperand, VectorizableMixin):
         return nunique.merge(d2, how="right", on=[group_column])[value_columns]
-class GroupByThenFreq(PandasOperand):
+class GroupByThenFreq(PandasOperator):
     name: str = "GroupByThenFreq"
     is_grouping: bool = True
     output_type: Optional[str] = "float"

upgini/autofe/{operand.py → operator.py} RENAMED Viewed

@@ -6,7 +6,7 @@ import pandas as pd
 from pydantic import BaseModel
-class OperandRegistry(type(BaseModel)):
+class OperatorRegistry(type(BaseModel)):
     _registry = {}
     _parametrized_registry = []
@@ -20,23 +20,25 @@ class OperandRegistry(type(BaseModel)):
             base_names.update(b.__name__ for b in base.__bases__)
             base_classes.extend(base.__bases__)
-        if "Operand" in base_names:
+        if "Operator" in base_names:
             # Track parametrized operands separately
-            if "ParametrizedOperand" in base_names:
+            if "ParametrizedOperator" in base_names:
                 cls._parametrized_registry.append(new_class)
             else:
                 try:
                     instance = new_class()
                     cls._registry[instance.name] = new_class
+                    if instance.alias:
+                        cls._registry[instance.alias] = new_class
                 except Exception:
                     pass
         return new_class
     @classmethod
-    def get_operand(cls, name: str) -> Optional["Operand"]:
+    def get_operator(cls, name: str) -> Optional["Operator"]:
         # First try to resolve as a parametrized operand formula
-        for operand_cls in cls._parametrized_registry:
-            resolved = operand_cls.from_formula(name)
+        for operator_cls in cls._parametrized_registry:
+            resolved = operator_cls.from_formula(name)
             if resolved is not None:
                 return resolved
         # Fall back to direct registry lookup
@@ -46,7 +48,7 @@ class OperandRegistry(type(BaseModel)):
         return None
-class Operand(BaseModel, metaclass=OperandRegistry):
+class Operator(BaseModel, metaclass=OperatorRegistry):
     name: Optional[str] = None
     alias: Optional[str] = None
     is_unary: bool = False
@@ -75,7 +77,7 @@ class Operand(BaseModel, metaclass=OperandRegistry):
         return self.name
-class ParametrizedOperand(Operand, abc.ABC):
+class ParametrizedOperator(Operator, abc.ABC):
     @abc.abstractmethod
     def to_formula(self) -> str:
@@ -83,14 +85,14 @@ class ParametrizedOperand(Operand, abc.ABC):
     @classmethod
     @abc.abstractmethod
-    def from_formula(cls, formula: str) -> Optional["Operand"]:
+    def from_formula(cls, formula: str) -> Optional["Operator"]:
         pass
 MAIN_COLUMN = "main_column"
-class PandasOperand(Operand, abc.ABC):
+class PandasOperator(Operator, abc.ABC):
     def calculate(self, **kwargs) -> pd.Series:
         if self.is_unary:
             return self.calculate_unary(kwargs["data"])
@@ -131,7 +133,7 @@ class PandasOperand(Operand, abc.ABC):
             return value
-class VectorizableMixin(Operand):
+class VectorizableMixin(Operator):
     group_index: int = 1
     def validate_calculation(self, input_columns: List[str], **kwargs) -> Tuple[str, List[str]]:

upgini/autofe/timeseries/__init__.py ADDED Viewed

@@ -0,0 +1,23 @@
+"""Time series feature engineering operators."""
+from upgini.autofe.timeseries.base import TimeSeriesBase
+from upgini.autofe.timeseries.roll import Roll
+from upgini.autofe.timeseries.lag import Lag
+from upgini.autofe.timeseries.delta import Delta, Delta2
+from upgini.autofe.timeseries.trend import TrendCoefficient
+from upgini.autofe.timeseries.volatility import EWMAVolatility, RollingVolatility, RollingVolatility2, VolatilityRatio
+from upgini.autofe.timeseries.cross import CrossSeriesInteraction
+__all__ = [
+    "TimeSeriesBase",
+    "Roll",
+    "Lag",
+    "Delta",
+    "Delta2",
+    "TrendCoefficient",
+    "EWMAVolatility",
+    "RollingVolatility",
+    "RollingVolatility2",
+    "VolatilityRatio",
+    "CrossSeriesInteraction",
+]

upgini/autofe/timeseries/base.py ADDED Viewed

@@ -0,0 +1,105 @@
+import abc
+from typing import Dict, List, Optional
+import pandas as pd
+from upgini.autofe.operator import PandasOperator
+# Used in derived classes
+try:
+    from pydantic import field_validator as validator  # V2
+except ImportError:
+    from pydantic import validator  # V1
+class TimeSeriesBase(PandasOperator, abc.ABC):
+    is_vector: bool = True
+    date_unit: Optional[str] = None
+    offset_size: int = 0
+    offset_unit: str = "D"
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = super().get_params()
+        res.update(
+            {
+                "date_unit": self.date_unit,
+                "offset_size": self.offset_size,
+                "offset_unit": self.offset_unit,
+            }
+        )
+        return res
+    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
+        # assuming first is date, last is value, rest is group columns
+        date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
+        ts = pd.concat([date] + data[1:], axis=1)
+        ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
+        ts.set_index(date.name, inplace=True)
+        ts = ts[ts.index.notna()].sort_index()
+        ts = (
+            ts.groupby([c.name for c in data[1:-1]], group_keys=True)
+            .apply(self._shift)[data[-1].name]
+            .to_frame()
+            .reset_index()
+            .set_index(date.name)
+            .groupby([c.name for c in data[1:-1]], group_keys=True)
+            if len(data) > 2
+            else self._shift(ts)
+        )
+        ts = self._aggregate(ts)
+        ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
+        ts.index = date.index
+        return ts.iloc[:, -1]
+    def _shift(self, ts: pd.DataFrame) -> pd.DataFrame:
+        if self.offset_size > 0:
+            return ts.iloc[:, :-1].merge(
+                ts.iloc[:, -1].shift(freq=f"{self.offset_size}{self.offset_unit}"),
+                left_index=True,
+                right_index=True,
+            )
+        return ts
+    @abc.abstractmethod
+    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
+        pass
+    def _add_offset_to_formula(self, base_formula: str) -> str:
+        if self.offset_size > 0:
+            return f"{base_formula}_offset_{self.offset_size}{self.offset_unit}"
+        return base_formula
+    @classmethod
+    def _parse_offset_from_formula(cls, formula: str, base_regex: str) -> tuple[Optional[dict], Optional[str]]:
+        """
+        Parse the offset component from a formula.
+        Args:
+            formula: The formula to parse
+            base_regex: The regex pattern for the base formula (without offset)
+        Returns:
+            A tuple with:
+            - Dictionary with offset parameters if found, None otherwise
+            - Remaining part of the formula after removing offset component (for further parsing)
+        """
+        import re
+        offset_regex = f"{base_regex}_offset_(\\d+)([a-zA-Z])"
+        match = re.match(offset_regex, formula)
+        if match:
+            # Get groups from the offset part
+            offset_size = int(match.group(match.lastindex - 1))
+            offset_unit = match.group(match.lastindex)
+            # Return the parameters and the base formula for further parsing if needed
+            # Extract the base formula by using the match object
+            base_formula = formula[: match.start(match.lastindex - 1) - len("_offset_")]
+            return {"offset_size": offset_size, "offset_unit": offset_unit}, base_formula
+        # Check if it matches the base regex (no offset)
+        if re.match(f"^{base_regex}$", formula) or re.match(f"^{base_regex}_", formula):
+            return None, formula
+        return None, None

upgini/autofe/timeseries/cross.py ADDED Viewed

@@ -0,0 +1,130 @@
+from typing import Dict, List, Optional
+import numpy as np
+import pandas as pd
+try:
+    from pydantic import field_validator as validator  # V2
+except ImportError:
+    from pydantic import validator  # V1
+from upgini.autofe.all_operands import find_op
+from upgini.autofe.operator import PandasOperator, ParametrizedOperator
+from upgini.autofe.timeseries.base import TimeSeriesBase
+class CrossSeriesInteraction(TimeSeriesBase, ParametrizedOperator):
+    base_name: str = "cross"
+    interaction_op: PandasOperator
+    descriptor_indices: List[int] = []
+    left_descriptor: List[str] = []
+    right_descriptor: List[str] = []
+    @validator("descriptor_indices")
+    @classmethod
+    def validate_descriptor_indices(cls, v):
+        if not v:
+            raise ValueError("descriptor_indices cannot be empty for CrossSeriesInteraction")
+        return v
+    def __init__(self, **data):
+        super().__init__(**data)
+        indices = self.descriptor_indices
+        left = self.left_descriptor
+        right = self.right_descriptor
+        if len(left) != len(indices):
+            raise ValueError(
+                f"left_descriptor length ({len(left)}) " f"must match descriptor_indices length ({len(indices)})"
+            )
+        if len(right) != len(indices):
+            raise ValueError(
+                f"right_descriptor length ({len(right)}) " f"must match descriptor_indices length ({len(indices)})"
+            )
+    def to_formula(self) -> str:
+        base_formula = f"{self.base_name}_{self._get_interaction_op_name()}"
+        return self._add_offset_to_formula(base_formula)
+    @classmethod
+    def from_formula(cls, formula: str) -> Optional["CrossSeriesInteraction"]:
+        base_regex = r"cross_(.+)"
+        offset_params, remaining_formula = cls._parse_offset_from_formula(formula, base_regex)
+        if remaining_formula is None:
+            return None
+        import re
+        match = re.match(f"^{base_regex}$", remaining_formula)
+        if not match:
+            return None
+        # Extract the operator formula
+        op_formula = match.group(1)
+        op = find_op(op_formula)
+        if op is None or not op.is_binary:
+            return None
+        # Include default values to pass validation
+        params = {
+            "interaction_op": op,
+            "descriptor_indices": [0],  # Default index
+            "left_descriptor": ["default"],  # Default left descriptor
+            "right_descriptor": ["default"],  # Default right descriptor
+        }
+        if offset_params:
+            params.update(offset_params)
+        return cls(**params)
+    def get_params(self) -> Dict[str, str | None]:
+        res = super().get_params()
+        res.update(
+            {
+                "interaction_op": self._get_interaction_op_name(),
+                "descriptor_indices": self.descriptor_indices,
+                "left_descriptor": self.left_descriptor,
+                "right_descriptor": self.right_descriptor,
+            }
+        )
+        return res
+    def _get_interaction_op_name(self) -> str:
+        return self.interaction_op.alias or self.interaction_op.to_formula()
+    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
+        left_mask = self._get_mask(data, self.left_descriptor)
+        left = self._extract_series(data, left_mask)
+        right_mask = self._get_mask(data, self.right_descriptor)
+        right = self._extract_series(data, right_mask)
+        interaction: pd.Series = self.interaction_op.calculate_binary(left, right)
+        interaction = interaction.reindex(self._get_index(data))
+        res = pd.Series(np.nan, index=data[-1].index, name=data[-1].name)
+        res.loc[left_mask] = interaction[left_mask].values
+        res.loc[right_mask] = interaction[right_mask].values
+        return res
+    def _get_mask(self, data: List[pd.Series], descriptor: List[str]) -> pd.Series:
+        mask = np.logical_and.reduce([data[i] == v for i, v in zip(self.descriptor_indices, descriptor)])
+        return mask
+    def _extract_series(self, data: List[pd.Series], mask: pd.Series) -> pd.Series:
+        masked_data = [d[mask] for d in data]
+        shifted = super().calculate_vector(masked_data)
+        shifted.index = self._get_index(masked_data)
+        return shifted
+    def _get_index(self, data: List[pd.Series]) -> pd.Series:
+        index = [d for i, d in enumerate(data[:-1]) if i not in self.descriptor_indices]
+        return index if len(index) > 1 else index[0]
+    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
+        return ts.apply(lambda x: x).iloc[:, [-1]]

upgini 1.2.63__py3-none-any.whl → 1.2.65a3818.dev5__py3-none-any.whl

Potentially problematic release.

upgini 1.2.63__py3-none-any.whl → 1.2.65a3818.dev5__py3-none-any.whl