PyPI - upgini - Versions diffs - 1.2.20__py3-none-any.whl → 1.2.20a3657.dev1__py3-none-any.whl - Mend

upgini 1.2.20__py3-none-any.whl → 1.2.20a3657.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (10) hide show

upgini/__about__.py +1 -1
upgini/__init__.py +1 -0
upgini/autofe/all_operands.py +2 -84
upgini/autofe/date.py +33 -6
upgini/autofe/operand.py +47 -1
upgini/autofe/vector.py +133 -2
{upgini-1.2.20.dist-info → upgini-1.2.20a3657.dev1.dist-info}/METADATA +1 -2
{upgini-1.2.20.dist-info → upgini-1.2.20a3657.dev1.dist-info}/RECORD +10 -10
{upgini-1.2.20.dist-info → upgini-1.2.20a3657.dev1.dist-info}/WHEEL +1 -1
{upgini-1.2.20.dist-info → upgini-1.2.20a3657.dev1.dist-info}/licenses/LICENSE +0 -0

upgini/__about__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "1.2.20"
1	+ __version__ = "1.2.20a3657.dev1"

upgini/__init__.py CHANGED Viewed

@@ -2,6 +2,7 @@ import os
 from upgini.features_enricher import FeaturesEnricher  # noqa: F401
 from upgini.metadata import SearchKey, CVType, RuntimeParameters, ModelTaskType  # noqa: F401
 # from .lazy_import import LazyImport
 os.environ["SETUPTOOLS_USE_DISTUTILS"] = "stdlib"

upgini/autofe/all_operands.py CHANGED Viewed

@@ -1,87 +1,5 @@
-from copy import deepcopy
-from typing import Dict
-from upgini.autofe.binary import (
-    Add,
-    Combine,
-    CombineThenFreq,
-    Distance,
-    Divide,
-    JaroWinklerSim1,
-    JaroWinklerSim2,
-    LevenshteinSim,
-    Max,
-    Min,
-    Multiply,
-    Sim,
-    Subtract,
-)
-from upgini.autofe.date import (
-    DateDiff,
-    DateDiffType2,
-    DateListDiff,
-    DateListDiffBounded,
-    DatePercentile,
-    DatePercentileMethod2,
-)
-from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
-from upgini.autofe.operand import Operand
-from upgini.autofe.unary import Abs, Embeddings, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
-from upgini.autofe.vector import Mean, Sum
-ALL_OPERANDS: Dict[str, Operand] = {
-    op.name: op
-    for op in [
-        Freq(),
-        Mean(),
-        Sum(),
-        Abs(),
-        Log(),
-        Sqrt(),
-        Square(),
-        Sigmoid(),
-        Floor(),
-        Residual(),
-        Min(),
-        Max(),
-        Add(),
-        Subtract(),
-        Multiply(),
-        Divide(),
-        GroupByThenAgg(name="GroupByThenMin", agg="min"),
-        GroupByThenAgg(name="GroupByThenMax", agg="max"),
-        GroupByThenAgg(name="GroupByThenMean", agg="mean"),
-        GroupByThenAgg(name="GroupByThenMedian", agg="median"),
-        GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
-        GroupByThenRank(),
-        Combine(),
-        CombineThenFreq(),
-        GroupByThenNUnique(),
-        GroupByThenFreq(),
-        Sim(),
-        DateDiff(),
-        DateDiffType2(),
-        DateListDiff(aggregation="min"),
-        DateListDiff(aggregation="max"),
-        DateListDiff(aggregation="mean"),
-        DateListDiff(aggregation="nunique"),
-        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=0, upper_bound=18),
-        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=18, upper_bound=23),
-        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=23, upper_bound=30),
-        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=30, upper_bound=45),
-        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
-        DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
-        DatePercentile(),
-        DatePercentileMethod2(),
-        Norm(),
-        JaroWinklerSim1(),
-        JaroWinklerSim2(),
-        LevenshteinSim(),
-        Distance(),
-        Embeddings(),
-    ]
-}
+from upgini.autofe.operand import OperandRegistry
 def find_op(name):
-    return deepcopy(ALL_OPERANDS.get(name))
+    return OperandRegistry.get_operand(name)

upgini/autofe/date.py CHANGED Viewed

@@ -7,11 +7,11 @@ import pandas as pd
 from pandas.core.arrays.timedeltas import TimedeltaArray
 from pydantic import BaseModel, __version__ as pydantic_version
-from upgini.autofe.operand import PandasOperand
+from upgini.autofe.operand import PandasOperand, ParametrizedOperand
 def get_pydantic_version():
-    major_version = int(pydantic_version.split('.')[0])
+    major_version = int(pydantic_version.split(".")[0])
     return major_version
@@ -109,7 +109,7 @@ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len,
 _count_aggregations = ["nunique", "count"]
-class DateListDiff(PandasOperand, DateDiffMixin):
+class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
     is_binary: bool = True
     has_symmetry_importance: bool = True
@@ -134,6 +134,15 @@ class DateListDiff(PandasOperand, DateDiffMixin):
             data["name"] = f"date_diff_{data.get('aggregation')}"
         super().__init__(**data)
+    @classmethod
+    def from_formula(cls, formula: str) -> Optional["DateListDiff"]:
+        if not formula.startswith("date_diff_"):
+            return None
+        aggregation = formula.replace("date_diff_", "")
+        if "_" in aggregation:
+            return None
+        return cls(aggregation=aggregation)
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
         left = self._convert_to_date(left, self.left_unit)
         right_mask = right.apply(lambda x: len(x) > 0)
@@ -170,7 +179,7 @@ class DateListDiff(PandasOperand, DateDiffMixin):
         return method(x) if len(x) > 0 else default
-class DateListDiffBounded(DateListDiff):
+class DateListDiffBounded(DateListDiff, ParametrizedOperand):
     lower_bound: Optional[int] = None
     upper_bound: Optional[int] = None
@@ -188,6 +197,23 @@ class DateListDiffBounded(DateListDiff):
             data["name"] = "_".join(components)
         super().__init__(**data)
+    @classmethod
+    def from_formula(cls, formula: str) -> Optional["DateListDiffBounded"]:
+        import re
+        pattern = r"^date_diff_([^_]+)_((minusinf|\d+))_((plusinf|\d+))_(\w+)$"
+        match = re.match(pattern, formula)
+        if not match:
+            return None
+        diff_unit = match.group(1)
+        lower_bound = None if match.group(2) == "minusinf" else int(match.group(2))
+        upper_bound = None if match.group(4) == "plusinf" else int(match.group(4))
+        aggregation = match.group(6)
+        return cls(diff_unit=diff_unit, lower_bound=lower_bound, upper_bound=upper_bound, aggregation=aggregation)
     def _agg(self, x):
         x = x[
             (x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
@@ -257,16 +283,17 @@ class DatePercentile(DatePercentileBase):
         # Use @field_validator for Pydantic 2.x
         from pydantic import field_validator
-        @field_validator('zero_bounds', mode='before')
+        @field_validator("zero_bounds", mode="before")
         def parse_zero_bounds(cls, value):
             if isinstance(value, str):
                 return json.loads(value)
             return value
     else:
         # Use @validator for Pydantic 1.x
         from pydantic import validator
-        @validator('zero_bounds', pre=True)
+        @validator("zero_bounds", pre=True)
         def parse_zero_bounds(cls, value):
             if isinstance(value, str):
                 return json.loads(value)

upgini/autofe/operand.py CHANGED Viewed

@@ -6,7 +6,47 @@ import pandas as pd
 from pydantic import BaseModel
-class Operand(BaseModel):
+class OperandRegistry(type(BaseModel)):
+    _registry = {}
+    _parametrized_registry = []
+    def __new__(cls, name, bases, attrs):
+        new_class = super().__new__(cls, name, bases, attrs)
+        # Only register if it's a concrete class that inherits from Operand
+        base_classes = [b for b in bases]
+        base_names = {b.__name__ for b in bases}
+        while base_classes:
+            base = base_classes.pop()
+            base_names.update(b.__name__ for b in base.__bases__)
+            base_classes.extend(base.__bases__)
+        if "Operand" in base_names:
+            # Track parametrized operands separately
+            if "ParametrizedOperand" in base_names:
+                cls._parametrized_registry.append(new_class)
+            else:
+                try:
+                    instance = new_class()
+                    cls._registry[instance.name] = new_class
+                except Exception:
+                    pass
+        return new_class
+    @classmethod
+    def get_operand(cls, name: str) -> Optional["Operand"]:
+        # First try to resolve as a parametrized operand formula
+        for operand_cls in cls._parametrized_registry:
+            resolved = operand_cls.from_formula(name)
+            if resolved is not None:
+                return resolved
+        # Fall back to direct registry lookup
+        non_parametrized = cls._registry.get(name)
+        if non_parametrized is not None:
+            return non_parametrized()
+        return None
+class Operand(BaseModel, metaclass=OperandRegistry):
     name: str
     alias: Optional[str] = None
     is_unary: bool = False
@@ -32,6 +72,12 @@ class Operand(BaseModel):
         return res
+class ParametrizedOperand(Operand):
+    @classmethod
+    def from_formula(cls, formula: str) -> Optional["Operand"]:
+        pass
 MAIN_COLUMN = "main_column"

upgini/autofe/vector.py CHANGED Viewed

@@ -1,8 +1,10 @@
-from typing import List, Optional
+import abc
+from typing import Any, Dict, List, Optional
 import pandas as pd
+from pydantic import validator
-from upgini.autofe.operand import PandasOperand, VectorizableMixin
+from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
 class Mean(PandasOperand, VectorizableMixin):
@@ -22,3 +24,132 @@ class Sum(PandasOperand, VectorizableMixin):
     def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
         return pd.DataFrame(data).T.fillna(0).sum(axis=1)
+class TimeSeriesBase(PandasOperand, abc.ABC):
+    is_vector: bool = True
+    date_unit: Optional[str] = None
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = super().get_params()
+        res.update(
+            {
+                "date_unit": self.date_unit,
+            }
+        )
+        return res
+    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
+        # assuming first is date, last is value, rest is group columns
+        date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
+        ts = pd.concat([date] + data[1:], axis=1)
+        ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
+        ts.set_index(date.name, inplace=True)
+        ts = ts[ts.index.notna()].sort_index()
+        ts = ts.groupby([c.name for c in data[1:-1]]) if len(data) > 2 else ts
+        ts = self._aggregate(ts)
+        ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
+        return ts.iloc[:, -1]
+    @abc.abstractmethod
+    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
+        pass
+_roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean()}
+class Roll(TimeSeriesBase, ParametrizedOperand):
+    aggregation: str
+    window_size: int = 1
+    window_unit: str = "D"
+    @validator("window_unit")
+    def validate_window_unit(cls, v: str) -> str:
+        try:
+            pd.tseries.frequencies.to_offset(v)
+            return v
+        except ValueError:
+            raise ValueError(
+                f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
+            )
+    def __init__(self, **data: Any) -> None:
+        if "name" not in data:
+            components = [
+                "roll",
+                str(data.get("window_size") or 1) + str(data.get("window_unit") or "D"),
+                data.get("aggregation"),
+            ]
+            data["name"] = "_".join(components).lower()
+        super().__init__(**data)
+    @classmethod
+    def from_formula(cls, formula: str) -> Optional["Roll"]:
+        import re
+        pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
+        match = re.match(pattern, formula)
+        if not match:
+            return None
+        window_size = int(match.group(1))
+        window_unit = match.group(2)
+        aggregation = match.group(3)
+        return cls(window_size=window_size, window_unit=window_unit, aggregation=aggregation)
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = super().get_params()
+        res.update(
+            {
+                "window_size": self.window_size,
+                "window_unit": self.window_unit,
+                "aggregation": self.aggregation,
+            }
+        )
+        return res
+    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
+        return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=self.window_size).agg(
+            _roll_aggregations.get(self.aggregation, self.aggregation)
+        )
+class Lag(TimeSeriesBase, ParametrizedOperand):
+    lag_size: int
+    lag_unit: str = "D"
+    def __init__(self, **data: Any) -> None:
+        if "name" not in data:
+            components = [
+                "lag",
+                str(data.get("lag_size") or 1) + str(data.get("lag_unit") or "D"),
+            ]
+            data["name"] = "_".join(components).lower()
+        super().__init__(**data)
+    @classmethod
+    def from_formula(cls, formula: str) -> Optional["Lag"]:
+        import re
+        pattern = r"^lag_(\d+)([a-zA-Z])$"
+        match = re.match(pattern, formula)
+        if not match:
+            return None
+        lag_size = int(match.group(1))
+        lag_unit = match.group(2)
+        return cls(lag_size=lag_size, lag_unit=lag_unit)
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = super().get_params()
+        return res
+    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
+        lag_window = self.lag_size + 1
+        return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=lag_window).agg(lambda x: x[0])

{upgini-1.2.20.dist-info → upgini-1.2.20a3657.dev1.dist-info}/METADATA RENAMED Viewed

@@ -1,12 +1,11 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.20
+Version: 1.2.20a3657.dev1
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
 Project-URL: Source, https://github.com/upgini/upgini
 Author-email: Upgini Developers <madewithlove@upgini.com>
-License-Expression: BSD-3-Clause
 License-File: LICENSE
 Keywords: automl,data mining,data science,data search,machine learning
 Classifier: Development Status :: 5 - Production/Stable

{upgini-1.2.20.dist-info → upgini-1.2.20a3657.dev1.dist-info}/RECORD RENAMED Viewed

@@ -1,5 +1,5 @@
-upgini/__about__.py,sha256=nQtXpLTEUbMtAPecTV_hZAJZb9EhWc8glRv6hgKyvG4,23
-upgini/__init__.py,sha256=M64LwQTBa-5Jz24Zm2h8rWwlKQQ1J8nP7gGgIciS0WU,589
+upgini/__about__.py,sha256=YkZ_uLYHtqgChcjML_VbuHRPzZ0weOtfhilztAaEx10,33
+upgini/__init__.py,sha256=Mb_sTh-IiGiyQLExOF226RsqnpVH8u1ozaCSW3Scdx4,590
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
@@ -14,14 +14,14 @@ upgini/version_validator.py,sha256=ddSKUK_-eGJB3NgrqOMoWJU-OxQ253WsNLp8aqJkaIM,1
 upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
 upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
 upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-upgini/autofe/all_operands.py,sha256=cCCB44qvkmuWyiRM5Xykx8tkHPIjQthrWyj67STWN80,2578
+upgini/autofe/all_operands.py,sha256=z3RSj98mkIXOkkmXHVCV7ese6V6rgD4uXyHge65HMVA,116
 upgini/autofe/binary.py,sha256=zMhtHVuGUAFLUqem-XiXqJj-GRXxS88tdz8tFuDfSNM,7659
-upgini/autofe/date.py,sha256=OpFc3Al0xO3qlESn2Uokfxw51ArVqmh3xngWwdrsaqE,9762
+upgini/autofe/date.py,sha256=kC1oQ_LKaqq-JTiqzIbUti-JB3bWizaB5nvXQ_BoD6Y,10780
 upgini/autofe/feature.py,sha256=eL7wABUhDKZzv3E-RPJNcyGwSfB0UptcfU2RbvsOks4,15082
 upgini/autofe/groupby.py,sha256=r-xl_keZZgm_tpiEoDhjYSkT6NHv7a4cRQR4wJ4uCp8,3263
-upgini/autofe/operand.py,sha256=uk883RaNqgXqtkaRqA1re1d9OFnnpv0JVvelYx09Yw0,2943
+upgini/autofe/operand.py,sha256=sEyFD_SdQ5tqJ5yGUZlXSqUnQb6WxOqZ0bMS6oKDjdU,4593
 upgini/autofe/unary.py,sha256=T3E7F3dA_7o_rkdCFq7JV6nHLzcoHLHQTcxO7y5Opa4,4646
-upgini/autofe/vector.py,sha256=ehcZUDqV71TfbU8EmKfdYp603gS2dJY_-fpr10ho5sI,663
+upgini/autofe/vector.py,sha256=KBoEcRywc1xdgYLCPlkUnKi5w0wCF0j3IYQP5eSmmgY,4807
 upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/data_source/data_source_publisher.py,sha256=X-8aGtVgzGmxyXkMVBoBLIGDMb4lYQaGZbxDnOd4A3Q,22516
 upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
 upgini/utils/target_utils.py,sha256=qHzZRmICFbLNCrmVqGkaBcjm91L2ERRZMppci36acV4,10085
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
-upgini-1.2.20.dist-info/METADATA,sha256=NVxQ5AA2uDaCtzEDlqWqpG6uEOi2xufY3pqvO9XtdgY,48611
-upgini-1.2.20.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-upgini-1.2.20.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.20.dist-info/RECORD,,
+upgini-1.2.20a3657.dev1.dist-info/METADATA,sha256=bgrk-SB81K0mrOkFRfrSl04-TuA2wxZWIbYdQOJePKA,48588
+upgini-1.2.20a3657.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
+upgini-1.2.20a3657.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.20a3657.dev1.dist-info/RECORD,,

{upgini-1.2.20.dist-info → upgini-1.2.20a3657.dev1.dist-info}/WHEEL RENAMED Viewed

@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: hatchling 1.25.0
+Generator: hatchling 1.24.2
 Root-Is-Purelib: true
 Tag: py3-none-any

{upgini-1.2.20.dist-info → upgini-1.2.20a3657.dev1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

upgini 1.2.20__py3-none-any.whl → 1.2.20a3657.dev1__py3-none-any.whl

Potentially problematic release.

upgini 1.2.20__py3-none-any.whl → 1.2.20a3657.dev1__py3-none-any.whl