PyPI - upgini - Versions diffs - 1.2.62__py3-none-any.whl → 1.2.62a3818.dev2__py3-none-any.whl - Mend

upgini 1.2.62__py3-none-any.whl → 1.2.62a3818.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

upgini/__about__.py +1 -1
upgini/autofe/all_operands.py +2 -2
upgini/autofe/binary.py +11 -11
upgini/autofe/date.py +6 -6
upgini/autofe/feature.py +6 -6
upgini/autofe/groupby.py +6 -6
upgini/autofe/{operand.py → operator.py} +7 -7
upgini/autofe/timeseries.py +200 -0
upgini/autofe/unary.py +11 -11
upgini/autofe/vector.py +4 -200
upgini/features_enricher.py +1 -1
{upgini-1.2.62.dist-info → upgini-1.2.62a3818.dev2.dist-info}/METADATA +1 -1
{upgini-1.2.62.dist-info → upgini-1.2.62a3818.dev2.dist-info}/RECORD +15 -14
{upgini-1.2.62.dist-info → upgini-1.2.62a3818.dev2.dist-info}/WHEEL +0 -0
{upgini-1.2.62.dist-info → upgini-1.2.62a3818.dev2.dist-info}/licenses/LICENSE +0 -0

upgini/__about__.py CHANGED Viewed

@@ -1 +1 @@
-__version__ = "1.2.62"
+__version__ = "1.2.62a3818.dev2"

upgini/autofe/all_operands.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from upgini.autofe.operand import OperandRegistry
+from upgini.autofe.operator import OperatorRegistry
 from upgini.autofe.unary import *  # noqa
 from upgini.autofe.binary import *  # noqa
 from upgini.autofe.groupby import *  # noqa
@@ -7,4 +7,4 @@ from upgini.autofe.vector import *  # noqa
 def find_op(name):
-    return OperandRegistry.get_operand(name)
+    return OperatorRegistry.get_operand(name)

upgini/autofe/binary.py CHANGED Viewed

@@ -5,10 +5,10 @@ import numpy as np
 import pandas as pd
 from jarowinkler import jarowinkler_similarity
-from upgini.autofe.operand import PandasOperand, VectorizableMixin
+from upgini.autofe.operator import PandasOperator, VectorizableMixin
-class Min(PandasOperand):
+class Min(PandasOperator):
     name: str = "min"
     is_binary: bool = True
     is_symmetrical: bool = True
@@ -18,7 +18,7 @@ class Min(PandasOperand):
         return np.minimum(left, right)
-class Max(PandasOperand):
+class Max(PandasOperator):
     name: str = "max"
     is_binary: bool = True
     is_symmetrical: bool = True
@@ -28,7 +28,7 @@ class Max(PandasOperand):
         return np.maximum(left, right)
-class Add(PandasOperand, VectorizableMixin):
+class Add(PandasOperator, VectorizableMixin):
     name: str = "+"
     alias: str = "add"
     is_binary: bool = True
@@ -47,7 +47,7 @@ class Add(PandasOperand, VectorizableMixin):
         return d1.add(d2, axis=0)
-class Subtract(PandasOperand, VectorizableMixin):
+class Subtract(PandasOperator, VectorizableMixin):
     name: str = "-"
     alias: str = "sub"
     is_binary: bool = True
@@ -66,7 +66,7 @@ class Subtract(PandasOperand, VectorizableMixin):
         return d1.sub(d2, axis=0)
-class Multiply(PandasOperand, VectorizableMixin):
+class Multiply(PandasOperator, VectorizableMixin):
     name: str = "*"
     alias: str = "mul"
     is_binary: bool = True
@@ -85,7 +85,7 @@ class Multiply(PandasOperand, VectorizableMixin):
         return d1.mul(d2, axis=0)
-class Divide(PandasOperand, VectorizableMixin):
+class Divide(PandasOperator, VectorizableMixin):
     name: str = "/"
     alias: str = "div"
     is_binary: bool = True
@@ -104,7 +104,7 @@ class Divide(PandasOperand, VectorizableMixin):
         return d1.div(d2.replace(0, np.nan), axis=0)
-class Combine(PandasOperand):
+class Combine(PandasOperator):
     name: str = "Combine"
     is_binary: bool = True
     has_symmetry_importance: bool = True
@@ -116,7 +116,7 @@ class Combine(PandasOperand):
         return pd.Series(temp, index=left.index)
-class CombineThenFreq(PandasOperand):
+class CombineThenFreq(PandasOperator):
     name: str = "CombineThenFreq"
     is_binary: bool = True
     is_symmetrical: bool = True
@@ -132,7 +132,7 @@ class CombineThenFreq(PandasOperand):
         self._loc(temp, value_counts)
-class Distance(PandasOperand):
+class Distance(PandasOperator):
     name: str = "dist"
     is_binary: bool = True
     output_type: Optional[str] = "float"
@@ -170,7 +170,7 @@ class Sim(Distance):
         return 1 - super().calculate_binary(left, right)
-class StringSim(PandasOperand, abc.ABC):
+class StringSim(PandasOperator, abc.ABC):
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         sims = []
         for i in left.index:

upgini/autofe/date.py CHANGED Viewed

@@ -7,7 +7,7 @@ import pandas as pd
 from pandas.core.arrays.timedeltas import TimedeltaArray
 from pydantic import BaseModel, __version__ as pydantic_version
-from upgini.autofe.operand import PandasOperand, ParametrizedOperand
+from upgini.autofe.operator import PandasOperator, ParametrizedOperator
 def get_pydantic_version():
@@ -43,7 +43,7 @@ class DateDiffMixin(BaseModel):
             raise Exception(f"Unsupported difference unit: {self.diff_unit}")
-class DateDiff(PandasOperand, DateDiffMixin):
+class DateDiff(PandasOperator, DateDiffMixin):
     name: str = "date_diff"
     alias: Optional[str] = "date_diff_type1"
     is_binary: bool = True
@@ -78,7 +78,7 @@ class DateDiff(PandasOperand, DateDiffMixin):
         return x
-class DateDiffType2(PandasOperand, DateDiffMixin):
+class DateDiffType2(PandasOperator, DateDiffMixin):
     name: str = "date_diff_type2"
     is_binary: bool = True
     has_symmetry_importance: bool = True
@@ -112,7 +112,7 @@ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len,
 _count_aggregations = ["nunique", "count"]
-class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
+class DateListDiff(PandasOperator, DateDiffMixin, ParametrizedOperator):
     is_binary: bool = True
     has_symmetry_importance: bool = True
@@ -183,7 +183,7 @@ class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
         return method(x) if len(x) > 0 else default
-class DateListDiffBounded(DateListDiff, ParametrizedOperand):
+class DateListDiffBounded(DateListDiff, ParametrizedOperator):
     lower_bound: Optional[int] = None
     upper_bound: Optional[int] = None
@@ -217,7 +217,7 @@ class DateListDiffBounded(DateListDiff, ParametrizedOperand):
         return super()._agg(x)
-class DatePercentileBase(PandasOperand, abc.ABC):
+class DatePercentileBase(PandasOperator, abc.ABC):
     is_binary: bool = True
     output_type: Optional[str] = "float"

upgini/autofe/feature.py CHANGED Viewed

@@ -7,7 +7,7 @@ import pandas as pd
 from pandas._typing import DtypeObj
 from upgini.autofe.all_operands import find_op
-from upgini.autofe.operand import Operand, PandasOperand
+from upgini.autofe.operator import Operator, PandasOperator
 class Column:
@@ -65,7 +65,7 @@ class Column:
 class Feature:
     def __init__(
         self,
-        op: Operand,
+        op: Operator,
         children: List[Union[Column, "Feature"]],
         data: Optional[pd.DataFrame] = None,
         display_index: Optional[str] = None,
@@ -188,7 +188,7 @@ class Feature:
             return self.children[0].infer_type(data)
     def calculate(self, data: pd.DataFrame, is_root=False) -> Union[pd.Series, pd.DataFrame]:
-        if isinstance(self.op, PandasOperand):
+        if isinstance(self.op, PandasOperator):
             if self.op.is_vector:
                 ds = [child.calculate(data) for child in self.children]
                 new_data = self.op.calculate(data=ds)
@@ -324,7 +324,7 @@ class Feature:
 class FeatureGroup:
     def __init__(
-        self, op: Operand, main_column: Optional[Union[Column, Feature]], children: List[Union[Column, Feature]]
+        self, op: Operator, main_column: Optional[Union[Column, Feature]], children: List[Union[Column, Feature]]
     ):
         self.op = op
         self.main_column_node = main_column
@@ -345,7 +345,7 @@ class FeatureGroup:
         return names
     def calculate(self, data: pd.DataFrame, is_root=False) -> pd.DataFrame:
-        if isinstance(self.op, PandasOperand):
+        if isinstance(self.op, PandasOperator):
             main_column = None if self.main_column_node is None else self.main_column_node.get_display_name()
             lower_order_children = []
             if self.main_column_node is not None:
@@ -378,7 +378,7 @@ class FeatureGroup:
     def make_groups(candidates: List[Feature]) -> List[Union[Feature, "FeatureGroup"]]:
         grouped_features = []
-        def groupby_func(f: Feature) -> Tuple[Operand, Union[Column, Feature]]:
+        def groupby_func(f: Feature) -> Tuple[Operator, Union[Column, Feature]]:
             return (f.op, f.children[0 if not f.op.is_vectorizable else f.op.group_index])
         for op_child, features in itertools.groupby(candidates, groupby_func):

upgini/autofe/groupby.py CHANGED Viewed

@@ -2,13 +2,13 @@ from typing import Optional
 import pandas as pd
-from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
+from upgini.autofe.operator import PandasOperator, ParametrizedOperator, VectorizableMixin
 class GroupByThenAgg(
-    PandasOperand,
+    PandasOperator,
     VectorizableMixin,
-    ParametrizedOperand,
+    ParametrizedOperator,
 ):
     agg: Optional[str]
     is_vectorizable: bool = True
@@ -39,7 +39,7 @@ class GroupByThenAgg(
         return temp.merge(d2, how="right", on=[group_column])[value_columns]
-class GroupByThenRank(PandasOperand, VectorizableMixin):
+class GroupByThenRank(PandasOperator, VectorizableMixin):
     name: str = "GroupByThenRank"
     is_vectorizable: bool = True
     is_grouping: bool = True
@@ -58,7 +58,7 @@ class GroupByThenRank(PandasOperand, VectorizableMixin):
         return temp.merge(d2.reset_index(), how="right", on=["index"])[value_columns]
-class GroupByThenNUnique(PandasOperand, VectorizableMixin):
+class GroupByThenNUnique(PandasOperator, VectorizableMixin):
     name: str = "GroupByThenNUnique"
     is_vectorizable: bool = True
     is_grouping: bool = True
@@ -78,7 +78,7 @@ class GroupByThenNUnique(PandasOperand, VectorizableMixin):
         return nunique.merge(d2, how="right", on=[group_column])[value_columns]
-class GroupByThenFreq(PandasOperand):
+class GroupByThenFreq(PandasOperator):
     name: str = "GroupByThenFreq"
     is_grouping: bool = True
     output_type: Optional[str] = "float"

upgini/autofe/{operand.py → operator.py} RENAMED Viewed

@@ -6,7 +6,7 @@ import pandas as pd
 from pydantic import BaseModel
-class OperandRegistry(type(BaseModel)):
+class OperatorRegistry(type(BaseModel)):
     _registry = {}
     _parametrized_registry = []
@@ -33,7 +33,7 @@ class OperandRegistry(type(BaseModel)):
         return new_class
     @classmethod
-    def get_operand(cls, name: str) -> Optional["Operand"]:
+    def get_operand(cls, name: str) -> Optional["Operator"]:
         # First try to resolve as a parametrized operand formula
         for operand_cls in cls._parametrized_registry:
             resolved = operand_cls.from_formula(name)
@@ -46,7 +46,7 @@ class OperandRegistry(type(BaseModel)):
         return None
-class Operand(BaseModel, metaclass=OperandRegistry):
+class Operator(BaseModel, metaclass=OperatorRegistry):
     name: Optional[str] = None
     alias: Optional[str] = None
     is_unary: bool = False
@@ -75,7 +75,7 @@ class Operand(BaseModel, metaclass=OperandRegistry):
         return self.name
-class ParametrizedOperand(Operand, abc.ABC):
+class ParametrizedOperator(Operator, abc.ABC):
     @abc.abstractmethod
     def to_formula(self) -> str:
@@ -83,14 +83,14 @@ class ParametrizedOperand(Operand, abc.ABC):
     @classmethod
     @abc.abstractmethod
-    def from_formula(cls, formula: str) -> Optional["Operand"]:
+    def from_formula(cls, formula: str) -> Optional["Operator"]:
         pass
 MAIN_COLUMN = "main_column"
-class PandasOperand(Operand, abc.ABC):
+class PandasOperator(Operator, abc.ABC):
     def calculate(self, **kwargs) -> pd.Series:
         if self.is_unary:
             return self.calculate_unary(kwargs["data"])
@@ -131,7 +131,7 @@ class PandasOperand(Operand, abc.ABC):
             return value
-class VectorizableMixin(Operand):
+class VectorizableMixin(Operator):
     group_index: int = 1
     def validate_calculation(self, input_columns: List[str], **kwargs) -> Tuple[str, List[str]]:

upgini/autofe/timeseries.py ADDED Viewed

@@ -0,0 +1,200 @@
+import abc
+from typing import Dict, List, Optional
+import pandas as pd
+from upgini.autofe.operator import PandasOperator, ParametrizedOperator
+try:
+    from pydantic import field_validator as validator  # V2
+except ImportError:
+    from pydantic import validator  # V1
+class TimeSeriesBase(PandasOperator, abc.ABC):
+    is_vector: bool = True
+    date_unit: Optional[str] = None
+    offset_size: int = 0
+    offset_unit: str = "D"
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = super().get_params()
+        res.update(
+            {
+                "date_unit": self.date_unit,
+                "offset_size": self.offset_size,
+                "offset_unit": self.offset_unit,
+            }
+        )
+        return res
+    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
+        # assuming first is date, last is value, rest is group columns
+        date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
+        ts = pd.concat([date] + data[1:], axis=1)
+        ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
+        ts.set_index(date.name, inplace=True)
+        ts = ts[ts.index.notna()].sort_index()
+        ts = (
+            ts.groupby([c.name for c in data[1:-1]], group_keys=True)
+            .apply(self._shift)[data[-1].name]
+            .to_frame()
+            .reset_index()
+            .set_index(date.name)
+            .groupby([c.name for c in data[1:-1]])
+            if len(data) > 2
+            else self._shift(ts)
+        )
+        ts = self._aggregate(ts)
+        ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
+        ts.index = date.index
+        return ts.iloc[:, -1]
+    def _shift(self, ts: pd.DataFrame) -> pd.DataFrame:
+        if self.offset_size > 0:
+            return ts.iloc[:, :-1].merge(
+                ts.iloc[:, -1].shift(freq=f"{self.offset_size}{self.offset_unit}"),
+                left_index=True,
+                right_index=True,
+            )
+        return ts
+    @abc.abstractmethod
+    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
+        pass
+_roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean(), "last": lambda x: x[-1]}
+class Roll(TimeSeriesBase, ParametrizedOperator):
+    aggregation: str
+    window_size: int = 1
+    window_unit: str = "D"
+    @validator("window_unit")
+    @classmethod
+    def validate_window_unit(cls, v: str) -> str:
+        try:
+            pd.tseries.frequencies.to_offset(v)
+            return v
+        except ValueError:
+            raise ValueError(
+                f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
+            )
+    def to_formula(self) -> str:
+        roll_component = f"roll_{self.window_size}{self.window_unit}"
+        if self.offset_size > 0:
+            roll_component += f"_offset_{self.offset_size}{self.offset_unit}"
+        return f"{roll_component}_{self.aggregation}"
+    @classmethod
+    def from_formula(cls, formula: str) -> Optional["Roll"]:
+        import re
+        # Try matching pattern with offset first
+        pattern_with_offset = r"^roll_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])_(\w+)$"
+        match_with_offset = re.match(pattern_with_offset, formula)
+        if match_with_offset:
+            window_size = int(match_with_offset.group(1))
+            window_unit = match_with_offset.group(2)
+            offset_size = int(match_with_offset.group(3))
+            offset_unit = match_with_offset.group(4)
+            aggregation = match_with_offset.group(5)
+            return cls(
+                window_size=window_size,
+                window_unit=window_unit,
+                offset_size=offset_size,
+                offset_unit=offset_unit,
+                aggregation=aggregation,
+            )
+        # If no offset pattern found, try basic pattern
+        pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
+        match = re.match(pattern, formula)
+        if not match:
+            return None
+        window_size = int(match.group(1))
+        window_unit = match.group(2)
+        aggregation = match.group(3)
+        return cls(window_size=window_size, window_unit=window_unit, aggregation=aggregation)
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = super().get_params()
+        res.update(
+            {
+                "window_size": self.window_size,
+                "window_unit": self.window_unit,
+                "aggregation": self.aggregation,
+            }
+        )
+        return res
+    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
+        return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=1).agg(
+            _roll_aggregations.get(self.aggregation, self.aggregation)
+        )
+class Lag(TimeSeriesBase, ParametrizedOperator):
+    lag_size: int
+    lag_unit: str = "D"
+    def to_formula(self) -> str:
+        lag_component = f"lag_{self.lag_size}{self.lag_unit}"
+        if self.offset_size > 0:
+            lag_component += f"_offset_{self.offset_size}{self.offset_unit}"
+        return lag_component
+    @classmethod
+    def from_formula(cls, formula: str) -> Optional["Lag"]:
+        import re
+        # Try matching pattern with offset first
+        pattern_with_offset = r"^lag_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])$"
+        match_with_offset = re.match(pattern_with_offset, formula)
+        if match_with_offset:
+            lag_size = int(match_with_offset.group(1))
+            lag_unit = match_with_offset.group(2)
+            offset_size = int(match_with_offset.group(3))
+            offset_unit = match_with_offset.group(4)
+            return cls(
+                lag_size=lag_size,
+                lag_unit=lag_unit,
+                offset_size=offset_size,
+                offset_unit=offset_unit,
+            )
+        # If no offset pattern found, try basic pattern
+        pattern = r"^lag_(\d+)([a-zA-Z])$"
+        match = re.match(pattern, formula)
+        if not match:
+            return None
+        lag_size = int(match.group(1))
+        lag_unit = match.group(2)
+        return cls(lag_size=lag_size, lag_unit=lag_unit)
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = super().get_params()
+        res.update(
+            {
+                "lag_size": self.lag_size,
+                "lag_unit": self.lag_unit,
+            }
+        )
+        return res
+    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
+        lag_window = self.lag_size + 1
+        return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=lag_window).agg(lambda x: x[0])

upgini/autofe/unary.py CHANGED Viewed

@@ -2,10 +2,10 @@ from typing import Dict, Optional
 import numpy as np
 import pandas as pd
-from upgini.autofe.operand import PandasOperand, VectorizableMixin
+from upgini.autofe.operator import PandasOperator, VectorizableMixin
-class Abs(PandasOperand, VectorizableMixin):
+class Abs(PandasOperator, VectorizableMixin):
     name: str = "abs"
     is_unary: bool = True
     is_vectorizable: bool = True
@@ -20,7 +20,7 @@ class Abs(PandasOperand, VectorizableMixin):
         # return data.abs()
-class Log(PandasOperand, VectorizableMixin):
+class Log(PandasOperator, VectorizableMixin):
     name: str = "log"
     is_unary: bool = True
     is_vectorizable: bool = True
@@ -34,7 +34,7 @@ class Log(PandasOperand, VectorizableMixin):
         return self._round_value(np.log(data.replace(0, np.nan).abs()), 10)
-class Sqrt(PandasOperand, VectorizableMixin):
+class Sqrt(PandasOperator, VectorizableMixin):
     name: str = "sqrt"
     is_unary: bool = True
     is_vectorizable: bool = True
@@ -48,7 +48,7 @@ class Sqrt(PandasOperand, VectorizableMixin):
         return self._round_value(np.sqrt(data.abs()))
-class Square(PandasOperand, VectorizableMixin):
+class Square(PandasOperator, VectorizableMixin):
     name: str = "square"
     is_unary: bool = True
     is_vectorizable: bool = True
@@ -61,7 +61,7 @@ class Square(PandasOperand, VectorizableMixin):
         return np.square(data)
-class Sigmoid(PandasOperand, VectorizableMixin):
+class Sigmoid(PandasOperator, VectorizableMixin):
     name: str = "sigmoid"
     is_unary: bool = True
     is_vectorizable: bool = True
@@ -75,7 +75,7 @@ class Sigmoid(PandasOperand, VectorizableMixin):
         return self._round_value(1 / (1 + np.exp(-data)))
-class Floor(PandasOperand, VectorizableMixin):
+class Floor(PandasOperator, VectorizableMixin):
     name: str = "floor"
     is_unary: bool = True
     is_vectorizable: bool = True
@@ -90,7 +90,7 @@ class Floor(PandasOperand, VectorizableMixin):
         return np.floor(data)
-class Residual(PandasOperand, VectorizableMixin):
+class Residual(PandasOperator, VectorizableMixin):
     name: str = "residual"
     is_unary: bool = True
     is_vectorizable: bool = True
@@ -104,7 +104,7 @@ class Residual(PandasOperand, VectorizableMixin):
         return data - np.floor(data)
-class Freq(PandasOperand):
+class Freq(PandasOperator):
     name: str = "freq"
     is_unary: bool = True
     output_type: Optional[str] = "float"
@@ -116,7 +116,7 @@ class Freq(PandasOperand):
         return self._loc(data, value_counts)
-class Norm(PandasOperand):
+class Norm(PandasOperator):
     name: str = "norm"
     is_unary: bool = True
     output_type: Optional[str] = "float"
@@ -148,7 +148,7 @@ class Norm(PandasOperand):
         return res
-class Embeddings(PandasOperand):
+class Embeddings(PandasOperator):
     name: str = "emb"
     is_unary: bool = True
     input_type: Optional[str] = "string"

upgini/autofe/vector.py CHANGED Viewed

@@ -1,17 +1,11 @@
-import abc
-from typing import Dict, List, Optional
+from typing import List, Optional
 import pandas as pd
-try:
-    from pydantic import field_validator as validator  # V2
-except ImportError:
-    from pydantic import validator  # V1
+from upgini.autofe.operator import PandasOperator, VectorizableMixin
-from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
-class Mean(PandasOperand, VectorizableMixin):
+class Mean(PandasOperator, VectorizableMixin):
     name: str = "mean"
     output_type: Optional[str] = "float"
     is_vector: bool = True
@@ -21,200 +15,10 @@ class Mean(PandasOperand, VectorizableMixin):
         return pd.DataFrame(data).T.fillna(0).mean(axis=1)
-class Sum(PandasOperand, VectorizableMixin):
+class Sum(PandasOperator, VectorizableMixin):
     name: str = "sum"
     is_vector: bool = True
     group_index: int = 0
     def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
         return pd.DataFrame(data).T.fillna(0).sum(axis=1)
-class TimeSeriesBase(PandasOperand, abc.ABC):
-    is_vector: bool = True
-    date_unit: Optional[str] = None
-    offset_size: int = 0
-    offset_unit: str = "D"
-    def get_params(self) -> Dict[str, Optional[str]]:
-        res = super().get_params()
-        res.update(
-            {
-                "date_unit": self.date_unit,
-                "offset_size": self.offset_size,
-                "offset_unit": self.offset_unit,
-            }
-        )
-        return res
-    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
-        # assuming first is date, last is value, rest is group columns
-        date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
-        ts = pd.concat([date] + data[1:], axis=1)
-        ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
-        ts.set_index(date.name, inplace=True)
-        ts = ts[ts.index.notna()].sort_index()
-        ts = (
-            ts.groupby([c.name for c in data[1:-1]])
-            .apply(self._shift)[data[-1].name]
-            .to_frame()
-            .reset_index()
-            .set_index(date.name)
-            .groupby([c.name for c in data[1:-1]])
-            if len(data) > 2
-            else self._shift(ts)
-        )
-        ts = self._aggregate(ts)
-        ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
-        ts.index = date.index
-        return ts.iloc[:, -1]
-    def _shift(self, ts: pd.DataFrame) -> pd.DataFrame:
-        if self.offset_size > 0:
-            return ts.iloc[:, :-1].merge(
-                ts.iloc[:, -1].shift(freq=f"{self.offset_size}{self.offset_unit}"),
-                left_index=True,
-                right_index=True,
-            )
-        return ts
-    @abc.abstractmethod
-    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
-        pass
-_roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean()}
-class Roll(TimeSeriesBase, ParametrizedOperand):
-    aggregation: str
-    window_size: int = 1
-    window_unit: str = "D"
-    @validator("window_unit")
-    @classmethod
-    def validate_window_unit(cls, v: str) -> str:
-        try:
-            pd.tseries.frequencies.to_offset(v)
-            return v
-        except ValueError:
-            raise ValueError(
-                f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
-            )
-    def to_formula(self) -> str:
-        roll_component = f"roll_{self.window_size}{self.window_unit}"
-        if self.offset_size > 0:
-            roll_component += f"_offset_{self.offset_size}{self.offset_unit}"
-        return f"{roll_component}_{self.aggregation}"
-    @classmethod
-    def from_formula(cls, formula: str) -> Optional["Roll"]:
-        import re
-        # Try matching pattern with offset first
-        pattern_with_offset = r"^roll_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])_(\w+)$"
-        match_with_offset = re.match(pattern_with_offset, formula)
-        if match_with_offset:
-            window_size = int(match_with_offset.group(1))
-            window_unit = match_with_offset.group(2)
-            offset_size = int(match_with_offset.group(3))
-            offset_unit = match_with_offset.group(4)
-            aggregation = match_with_offset.group(5)
-            return cls(
-                window_size=window_size,
-                window_unit=window_unit,
-                offset_size=offset_size,
-                offset_unit=offset_unit,
-                aggregation=aggregation,
-            )
-        # If no offset pattern found, try basic pattern
-        pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
-        match = re.match(pattern, formula)
-        if not match:
-            return None
-        window_size = int(match.group(1))
-        window_unit = match.group(2)
-        aggregation = match.group(3)
-        return cls(window_size=window_size, window_unit=window_unit, aggregation=aggregation)
-    def get_params(self) -> Dict[str, Optional[str]]:
-        res = super().get_params()
-        res.update(
-            {
-                "window_size": self.window_size,
-                "window_unit": self.window_unit,
-                "aggregation": self.aggregation,
-            }
-        )
-        return res
-    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
-        return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=1).agg(
-            _roll_aggregations.get(self.aggregation, self.aggregation)
-        )
-class Lag(TimeSeriesBase, ParametrizedOperand):
-    lag_size: int
-    lag_unit: str = "D"
-    def to_formula(self) -> str:
-        lag_component = f"lag_{self.lag_size}{self.lag_unit}"
-        if self.offset_size > 0:
-            lag_component += f"_offset_{self.offset_size}{self.offset_unit}"
-        return lag_component
-    @classmethod
-    def from_formula(cls, formula: str) -> Optional["Lag"]:
-        import re
-        # Try matching pattern with offset first
-        pattern_with_offset = r"^lag_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])$"
-        match_with_offset = re.match(pattern_with_offset, formula)
-        if match_with_offset:
-            lag_size = int(match_with_offset.group(1))
-            lag_unit = match_with_offset.group(2)
-            offset_size = int(match_with_offset.group(3))
-            offset_unit = match_with_offset.group(4)
-            return cls(
-                lag_size=lag_size,
-                lag_unit=lag_unit,
-                offset_size=offset_size,
-                offset_unit=offset_unit,
-            )
-        # If no offset pattern found, try basic pattern
-        pattern = r"^lag_(\d+)([a-zA-Z])$"
-        match = re.match(pattern, formula)
-        if not match:
-            return None
-        lag_size = int(match.group(1))
-        lag_unit = match.group(2)
-        return cls(lag_size=lag_size, lag_unit=lag_unit)
-    def get_params(self) -> Dict[str, Optional[str]]:
-        res = super().get_params()
-        res.update(
-            {
-                "lag_size": self.lag_size,
-                "lag_unit": self.lag_unit,
-            }
-        )
-        return res
-    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
-        lag_window = self.lag_size + 1
-        return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=lag_window).agg(lambda x: x[0])

upgini/features_enricher.py CHANGED Viewed

@@ -31,7 +31,7 @@ from sklearn.exceptions import NotFittedError
 from sklearn.model_selection import BaseCrossValidator
 from upgini.autofe.feature import Feature
-from upgini.autofe.vector import TimeSeriesBase
+from upgini.autofe.timeseries import TimeSeriesBase
 from upgini.data_source.data_source_publisher import CommercialSchema
 from upgini.dataset import Dataset
 from upgini.errors import HttpError, ValidationError

{upgini-1.2.62.dist-info → upgini-1.2.62a3818.dev2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.62
+Version: 1.2.62a3818.dev2
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/

{upgini-1.2.62.dist-info → upgini-1.2.62a3818.dev2.dist-info}/RECORD RENAMED Viewed

@@ -1,9 +1,9 @@
-upgini/__about__.py,sha256=X-PIyJPyy-W4DbKWDuHTMhmvRT8La2rsZ63Zaf_MERI,23
+upgini/__about__.py,sha256=OLozvzWRYF8QVe08Gh2xAIzV-SPbWN9X8WcPvXKgTuU,33
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=OGjpeFHbj3lWiZTOHTpWEoMMDmFY1FlNC44FKktoZvU,34956
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=2AMEXtoMrEFw3f0b0CsvkFyS1a7L4aqI2GO_fCsgWac,205336
+upgini/features_enricher.py,sha256=cB2I5rNpbztjkYEEW5aJuKj2fCMnfxp40X4Eo63oyuQ,205340
 upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
 upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
 upgini/metadata.py,sha256=Jh6YTaS00m_nbaOY_owvlSyn9zgkErkqu8iTr9ZjKI8,12279
@@ -14,14 +14,15 @@ upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1
 upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
 upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
 upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-upgini/autofe/all_operands.py,sha256=v0_NozalvvzeojSAA0d7UJ5INS654ZVaLn4S8djK6Ac,329
-upgini/autofe/binary.py,sha256=zMhtHVuGUAFLUqem-XiXqJj-GRXxS88tdz8tFuDfSNM,7659
-upgini/autofe/date.py,sha256=pqwwk4_35RYXDT2fSJ9dlxGBm-R0jWBeiSb-79hZjkI,10721
-upgini/autofe/feature.py,sha256=zvRdlxCkaOsX0XiragNvh0tAPyOWut0MQTq5JGU5HtY,14749
-upgini/autofe/groupby.py,sha256=G48_sQZw016eGx3cOy8YQrEIOp95puWqYUpFWd-gdeM,3595
-upgini/autofe/operand.py,sha256=8Ttrfxv_H91dMbS7J55zxluzAJHfGXU_Y2xCh4OHwb8,4774
-upgini/autofe/unary.py,sha256=T3E7F3dA_7o_rkdCFq7JV6nHLzcoHLHQTcxO7y5Opa4,4646
-upgini/autofe/vector.py,sha256=udkg4pP7IIeLjt0Cg6rzEKUmGaubOnqsEz3bz9R6E44,7110
+upgini/autofe/all_operands.py,sha256=VIT5jCq5U-qypdNz1MIQ_hlIAs0ujJgRfKRUkU24nFs,332
+upgini/autofe/binary.py,sha256=MnQuFiERpocjCPQUjOljlsq5FE-04GPfwtNjzvfNMyU,7671
+upgini/autofe/date.py,sha256=I07psJerrxOcHao91PdSCk9X6KWu61IBVyFRLjGNgK8,10730
+upgini/autofe/feature.py,sha256=Xto7FHH1JG-5QvkfTPNWKtV9GAzPviTNPKFZOUN7RQA,14757
+upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
+upgini/autofe/operator.py,sha256=RSJWoKB2pIZ5xToVuk_T0ec7QRx-duxYEEGJ5oealaM,4784
+upgini/autofe/timeseries.py,sha256=-BnDp0z_Hv6Vol1Vov6QC_82U8XPV3pfIPFspK2aTCE,6598
+upgini/autofe/unary.py,sha256=yVgPvtfnPSOhrii0YgezddmgWPwyOBCR0JutaIkdTTc,4658
+upgini/autofe/vector.py,sha256=l0KdKg-txlZxDSE4hPPfCtfGQofYbl7oaABPr830sPI,667
 upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/data_source/data_source_publisher.py,sha256=4S9qwlAklD8vg9tUU_c1pHE2_glUHAh15-wr5hMwKFw,22879
 upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
@@ -62,7 +63,7 @@ upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.62.dist-info/METADATA,sha256=l1TBHJEV26NNT_Er41bbO3ph5UZ-QkzYTpf_JU1Y7ak,49084
-upgini-1.2.62.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
-upgini-1.2.62.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.62.dist-info/RECORD,,
+upgini-1.2.62a3818.dev2.dist-info/METADATA,sha256=VEJPjgu8A5gOrr4WPbk6DYHt8BNxoqUq9rsl967GQMU,49094
+upgini-1.2.62a3818.dev2.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
+upgini-1.2.62a3818.dev2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.62a3818.dev2.dist-info/RECORD,,

{upgini-1.2.62.dist-info → upgini-1.2.62a3818.dev2.dist-info}/WHEEL RENAMED Viewed

File without changes

{upgini-1.2.62.dist-info → upgini-1.2.62a3818.dev2.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

upgini 1.2.62__py3-none-any.whl → 1.2.62a3818.dev2__py3-none-any.whl

upgini 1.2.62__py3-none-any.whl → 1.2.62a3818.dev2__py3-none-any.whl