PyPI - upgini - Versions diffs - 1.2.19a1__tar.gz → 1.2.20a3657.dev1__tar.gz - Mend

upgini 1.2.19a1__tar.gz → 1.2.20a3657.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (67) hide show

{upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.19a1
+Version: 1.2.20a3657.dev1
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/

upgini-1.2.20a3657.dev1/src/upgini/__about__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "1.2.20a3657.dev1"

{upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/__init__.py RENAMED Viewed

@@ -2,6 +2,7 @@ import os
 from upgini.features_enricher import FeaturesEnricher  # noqa: F401
 from upgini.metadata import SearchKey, CVType, RuntimeParameters, ModelTaskType  # noqa: F401
 # from .lazy_import import LazyImport
 os.environ["SETUPTOOLS_USE_DISTUTILS"] = "stdlib"

upgini-1.2.20a3657.dev1/src/upgini/autofe/all_operands.py ADDED Viewed

@@ -0,0 +1,5 @@
+from upgini.autofe.operand import OperandRegistry
+def find_op(name):
+    return OperandRegistry.get_operand(name)

{upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/autofe/date.py RENAMED Viewed

@@ -7,11 +7,11 @@ import pandas as pd
 from pandas.core.arrays.timedeltas import TimedeltaArray
 from pydantic import BaseModel, __version__ as pydantic_version
-from upgini.autofe.operand import PandasOperand
+from upgini.autofe.operand import PandasOperand, ParametrizedOperand
 def get_pydantic_version():
-    major_version = int(pydantic_version.split('.')[0])
+    major_version = int(pydantic_version.split(".")[0])
     return major_version
@@ -109,7 +109,7 @@ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len,
 _count_aggregations = ["nunique", "count"]
-class DateListDiff(PandasOperand, DateDiffMixin):
+class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
     is_binary: bool = True
     has_symmetry_importance: bool = True
@@ -134,6 +134,15 @@ class DateListDiff(PandasOperand, DateDiffMixin):
             data["name"] = f"date_diff_{data.get('aggregation')}"
         super().__init__(**data)
+    @classmethod
+    def from_formula(cls, formula: str) -> Optional["DateListDiff"]:
+        if not formula.startswith("date_diff_"):
+            return None
+        aggregation = formula.replace("date_diff_", "")
+        if "_" in aggregation:
+            return None
+        return cls(aggregation=aggregation)
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         left = self._convert_to_date(left, self.left_unit)
         right_mask = right.apply(lambda x: len(x) > 0)
@@ -170,7 +179,7 @@ class DateListDiff(PandasOperand, DateDiffMixin):
         return method(x) if len(x) > 0 else default
-class DateListDiffBounded(DateListDiff):
+class DateListDiffBounded(DateListDiff, ParametrizedOperand):
     lower_bound: Optional[int] = None
     upper_bound: Optional[int] = None
@@ -188,6 +197,23 @@ class DateListDiffBounded(DateListDiff):
             data["name"] = "_".join(components)
         super().__init__(**data)
+    @classmethod
+    def from_formula(cls, formula: str) -> Optional["DateListDiffBounded"]:
+        import re
+        pattern = r"^date_diff_([^_]+)_((minusinf|\d+))_((plusinf|\d+))_(\w+)$"
+        match = re.match(pattern, formula)
+        if not match:
+            return None
+        diff_unit = match.group(1)
+        lower_bound = None if match.group(2) == "minusinf" else int(match.group(2))
+        upper_bound = None if match.group(4) == "plusinf" else int(match.group(4))
+        aggregation = match.group(6)
+        return cls(diff_unit=diff_unit, lower_bound=lower_bound, upper_bound=upper_bound, aggregation=aggregation)
     def _agg(self, x):
         x = x[
             (x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
@@ -257,16 +283,17 @@ class DatePercentile(DatePercentileBase):
         # Use @field_validator for Pydantic 2.x
         from pydantic import field_validator
-        @field_validator('zero_bounds', mode='before')
+        @field_validator("zero_bounds", mode="before")
         def parse_zero_bounds(cls, value):
             if isinstance(value, str):
                 return json.loads(value)
             return value
     else:
         # Use @validator for Pydantic 1.x
         from pydantic import validator
-        @validator('zero_bounds', pre=True)
+        @validator("zero_bounds", pre=True)
         def parse_zero_bounds(cls, value):
             if isinstance(value, str):
                 return json.loads(value)

{upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/autofe/operand.py RENAMED Viewed

@@ -6,7 +6,47 @@ import pandas as pd
 from pydantic import BaseModel
-class Operand(BaseModel):
+class OperandRegistry(type(BaseModel)):
+    _registry = {}
+    _parametrized_registry = []
+    def __new__(cls, name, bases, attrs):
+        new_class = super().__new__(cls, name, bases, attrs)
+        # Only register if it's a concrete class that inherits from Operand
+        base_classes = [b for b in bases]
+        base_names = {b.__name__ for b in bases}
+        while base_classes:
+            base = base_classes.pop()
+            base_names.update(b.__name__ for b in base.__bases__)
+            base_classes.extend(base.__bases__)
+        if "Operand" in base_names:
+            # Track parametrized operands separately
+            if "ParametrizedOperand" in base_names:
+                cls._parametrized_registry.append(new_class)
+            else:
+                try:
+                    instance = new_class()
+                    cls._registry[instance.name] = new_class
+                except Exception:
+                    pass
+        return new_class
+    @classmethod
+    def get_operand(cls, name: str) -> Optional["Operand"]:
+        # First try to resolve as a parametrized operand formula
+        for operand_cls in cls._parametrized_registry:
+            resolved = operand_cls.from_formula(name)
+            if resolved is not None:
+                return resolved
+        # Fall back to direct registry lookup
+        non_parametrized = cls._registry.get(name)
+        if non_parametrized is not None:
+            return non_parametrized()
+        return None
+class Operand(BaseModel, metaclass=OperandRegistry):
     name: str
     alias: Optional[str] = None
     is_unary: bool = False
@@ -32,6 +72,12 @@ class Operand(BaseModel):
         return res
+class ParametrizedOperand(Operand):
+    @classmethod
+    def from_formula(cls, formula: str) -> Optional["Operand"]:
+        pass
 MAIN_COLUMN = "main_column"

upgini-1.2.20a3657.dev1/src/upgini/autofe/vector.py ADDED Viewed

@@ -0,0 +1,155 @@
+import abc
+from typing import Any, Dict, List, Optional
+import pandas as pd
+from pydantic import validator
+from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
+class Mean(PandasOperand, VectorizableMixin):
+    name: str = "mean"
+    output_type: Optional[str] = "float"
+    is_vector: bool = True
+    group_index: int = 0
+    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
+        return pd.DataFrame(data).T.fillna(0).mean(axis=1)
+class Sum(PandasOperand, VectorizableMixin):
+    name: str = "sum"
+    is_vector: bool = True
+    group_index: int = 0
+    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
+        return pd.DataFrame(data).T.fillna(0).sum(axis=1)
+class TimeSeriesBase(PandasOperand, abc.ABC):
+    is_vector: bool = True
+    date_unit: Optional[str] = None
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = super().get_params()
+        res.update(
+            {
+                "date_unit": self.date_unit,
+            }
+        )
+        return res
+    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
+        # assuming first is date, last is value, rest is group columns
+        date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
+        ts = pd.concat([date] + data[1:], axis=1)
+        ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
+        ts.set_index(date.name, inplace=True)
+        ts = ts[ts.index.notna()].sort_index()
+        ts = ts.groupby([c.name for c in data[1:-1]]) if len(data) > 2 else ts
+        ts = self._aggregate(ts)
+        ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
+        return ts.iloc[:, -1]
+    @abc.abstractmethod
+    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
+        pass
+_roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean()}
+class Roll(TimeSeriesBase, ParametrizedOperand):
+    aggregation: str
+    window_size: int = 1
+    window_unit: str = "D"
+    @validator("window_unit")
+    def validate_window_unit(cls, v: str) -> str:
+        try:
+            pd.tseries.frequencies.to_offset(v)
+            return v
+        except ValueError:
+            raise ValueError(
+                f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
+            )
+    def __init__(self, **data: Any) -> None:
+        if "name" not in data:
+            components = [
+                "roll",
+                str(data.get("window_size") or 1) + str(data.get("window_unit") or "D"),
+                data.get("aggregation"),
+            ]
+            data["name"] = "_".join(components).lower()
+        super().__init__(**data)
+    @classmethod
+    def from_formula(cls, formula: str) -> Optional["Roll"]:
+        import re
+        pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
+        match = re.match(pattern, formula)
+        if not match:
+            return None
+        window_size = int(match.group(1))
+        window_unit = match.group(2)
+        aggregation = match.group(3)
+        return cls(window_size=window_size, window_unit=window_unit, aggregation=aggregation)
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = super().get_params()
+        res.update(
+            {
+                "window_size": self.window_size,
+                "window_unit": self.window_unit,
+                "aggregation": self.aggregation,
+            }
+        )
+        return res
+    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
+        return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=self.window_size).agg(
+            _roll_aggregations.get(self.aggregation, self.aggregation)
+        )
+class Lag(TimeSeriesBase, ParametrizedOperand):
+    lag_size: int
+    lag_unit: str = "D"
+    def __init__(self, **data: Any) -> None:
+        if "name" not in data:
+            components = [
+                "lag",
+                str(data.get("lag_size") or 1) + str(data.get("lag_unit") or "D"),
+            ]
+            data["name"] = "_".join(components).lower()
+        super().__init__(**data)
+    @classmethod
+    def from_formula(cls, formula: str) -> Optional["Lag"]:
+        import re
+        pattern = r"^lag_(\d+)([a-zA-Z])$"
+        match = re.match(pattern, formula)
+        if not match:
+            return None
+        lag_size = int(match.group(1))
+        lag_unit = match.group(2)
+        return cls(lag_size=lag_size, lag_unit=lag_unit)
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = super().get_params()
+        return res
+    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
+        lag_window = self.lag_size + 1
+        return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=lag_window).agg(lambda x: x[0])

{upgini-1.2.19a1 → upgini-1.2.20a3657.dev1}/src/upgini/features_enricher.py RENAMED Viewed

@@ -2474,8 +2474,16 @@ class FeaturesEnricher(TransformerMixin):
         df = pd.concat([validated_X, validated_y], axis=1)
+        if validated_eval_set is not None and len(validated_eval_set) > 0:
+            df[EVAL_SET_INDEX] = 0
+            for idx, (eval_X, eval_y) in enumerate(validated_eval_set):
+                eval_df = pd.concat([eval_X, eval_y], axis=1)
+                eval_df[EVAL_SET_INDEX] = idx + 1
+                df = pd.concat([df, eval_df])
         self.fit_search_keys = self.search_keys.copy()
-        self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
+        df = self.__handle_index_search_keys(df, self.fit_search_keys)
+        self.fit_search_keys = self.__prepare_search_keys(df, self.fit_search_keys, is_demo_dataset)
         maybe_date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
         has_date = maybe_date_column is not None
@@ -2487,17 +2495,8 @@ class FeaturesEnricher(TransformerMixin):
             self.loss, self.model_task_type, self.runtime_parameters, self.logger
         )
-        if validated_eval_set is not None and len(validated_eval_set) > 0:
-            df[EVAL_SET_INDEX] = 0
-            for idx, (eval_X, eval_y) in enumerate(validated_eval_set):
-                eval_df = pd.concat([eval_X, eval_y], axis=1)
-                eval_df[EVAL_SET_INDEX] = idx + 1
-                df = pd.concat([df, eval_df])
         df = self.__correct_target(df)
-        df = self.__handle_index_search_keys(df, self.fit_search_keys)
         if DEFAULT_INDEX in df.columns:
             msg = self.bundle.get("unsupported_index_column")
             self.logger.info(msg)

upgini-1.2.19a1/src/upgini/__about__.py DELETED Viewed

	@@ -1 +0,0 @@
1	- __version__ = "1.2.19a1"

upgini-1.2.19a1/src/upgini/autofe/all_operands.py DELETED Viewed

@@ -1,87 +0,0 @@
-from copy import deepcopy
-from typing import Dict
-from upgini.autofe.binary import (
-    Add,
-    Combine,
-    CombineThenFreq,
-    Distance,
-    Divide,
-    JaroWinklerSim1,
-    JaroWinklerSim2,
-    LevenshteinSim,
-    Max,
-    Min,
-    Multiply,
-    Sim,
-    Subtract,
-)
-from upgini.autofe.date import (
-    DateDiff,
-    DateDiffType2,
-    DateListDiff,
-    DateListDiffBounded,
-    DatePercentile,
-    DatePercentileMethod2,
-)
-from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
-from upgini.autofe.operand import Operand
-from upgini.autofe.unary import Abs, Embeddings, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
-from upgini.autofe.vector import Mean, Sum
-ALL_OPERANDS: Dict[str, Operand] = {
-    op.name: op
-    for op in [
-        Freq(),
-        Mean(),
-        Sum(),
-        Abs(),
-        Log(),
-        Sqrt(),
-        Square(),
-        Sigmoid(),
-        Floor(),
-        Residual(),
-        Min(),
-        Max(),
-        Add(),
-        Subtract(),
-        Multiply(),
-        Divide(),
-        GroupByThenAgg(name="GroupByThenMin", agg="min"),
-        GroupByThenAgg(name="GroupByThenMax", agg="max"),
-        GroupByThenAgg(name="GroupByThenMean", agg="mean"),
-        GroupByThenAgg(name="GroupByThenMedian", agg="median"),
-        GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
-        GroupByThenRank(),
-        Combine(),
-        CombineThenFreq(),
-        GroupByThenNUnique(),
-        GroupByThenFreq(),
-        Sim(),
-        DateDiff(),
-        DateDiffType2(),
-        DateListDiff(aggregation="min"),
-        DateListDiff(aggregation="max"),
-        DateListDiff(aggregation="mean"),
-        DateListDiff(aggregation="nunique"),
-        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=0, upper_bound=18),
-        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=18, upper_bound=23),
-        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=23, upper_bound=30),
-        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=30, upper_bound=45),
-        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
-        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
-        DatePercentile(),
-        DatePercentileMethod2(),
-        Norm(),
-        JaroWinklerSim1(),
-        JaroWinklerSim2(),
-        LevenshteinSim(),
-        Distance(),
-        Embeddings(),
-    ]
-}
-def find_op(name):
-    return deepcopy(ALL_OPERANDS.get(name))

upgini-1.2.19a1/src/upgini/autofe/vector.py DELETED Viewed

@@ -1,24 +0,0 @@
-from typing import List, Optional
-import pandas as pd
-from upgini.autofe.operand import PandasOperand, VectorizableMixin
-class Mean(PandasOperand, VectorizableMixin):
-    name: str = "mean"
-    output_type: Optional[str] = "float"
-    is_vector: bool = True
-    group_index: int = 0
-    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
-        return pd.DataFrame(data).T.fillna(0).mean(axis=1)
-class Sum(PandasOperand, VectorizableMixin):
-    name: str = "sum"
-    is_vector: bool = True
-    group_index: int = 0
-    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
-        return pd.DataFrame(data).T.fillna(0).sum(axis=1)