PyPI - upgini - Versions diffs - 1.2.60a3792.dev2__py3-none-any.whl → 1.2.62a3818.dev1__py3-none-any.whl - Mend

upgini 1.2.60a3792.dev2__py3-none-any.whl → 1.2.62a3818.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (23) hide show

upgini/__about__.py +1 -1
upgini/autofe/all_operands.py +2 -2
upgini/autofe/binary.py +1 -1
upgini/autofe/date.py +2 -2
upgini/autofe/feature.py +1 -1
upgini/autofe/groupby.py +1 -1
upgini/autofe/{operand.py → operator.py} +2 -2
upgini/autofe/timeseries.py +200 -0
upgini/autofe/unary.py +1 -1
upgini/autofe/vector.py +2 -198
upgini/data_source/data_source_publisher.py +9 -4
upgini/features_enricher.py +108 -46
upgini/metrics.py +4 -7
upgini/resource_bundle/strings.properties +1 -0
upgini/utils/datetime_utils.py +2 -0
upgini/utils/mstats.py +177 -0
upgini/utils/sort.py +172 -0
upgini/utils/target_utils.py +3 -3
upgini/utils/ts_utils.py +0 -6
{upgini-1.2.60a3792.dev2.dist-info → upgini-1.2.62a3818.dev1.dist-info}/METADATA +2 -1
{upgini-1.2.60a3792.dev2.dist-info → upgini-1.2.62a3818.dev1.dist-info}/RECORD +23 -20
{upgini-1.2.60a3792.dev2.dist-info → upgini-1.2.62a3818.dev1.dist-info}/WHEEL +0 -0
{upgini-1.2.60a3792.dev2.dist-info → upgini-1.2.62a3818.dev1.dist-info}/licenses/LICENSE +0 -0

upgini/__about__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "1.2.60a3792.dev2"
1	+ __version__ = "1.2.62a3818.dev1"

upgini/autofe/all_operands.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from upgini.autofe.operand import OperandRegistry
+from upgini.autofe.operator import OperatorRegistry
 from upgini.autofe.unary import *  # noqa
 from upgini.autofe.binary import *  # noqa
 from upgini.autofe.groupby import *  # noqa
@@ -7,4 +7,4 @@ from upgini.autofe.vector import *  # noqa
 def find_op(name):
-    return OperandRegistry.get_operand(name)
+    return OperatorRegistry.get_operand(name)

upgini/autofe/binary.py CHANGED Viewed

@@ -5,7 +5,7 @@ import numpy as np
 import pandas as pd
 from jarowinkler import jarowinkler_similarity
-from upgini.autofe.operand import PandasOperand, VectorizableMixin
+from upgini.autofe.operator import PandasOperand, VectorizableMixin
 class Min(PandasOperand):

upgini/autofe/date.py CHANGED Viewed

@@ -1,13 +1,13 @@
 import abc
 import json
-from typing import Any, Dict, List, Optional, Union
+from typing import Dict, List, Optional, Union
 import numpy as np
 import pandas as pd
 from pandas.core.arrays.timedeltas import TimedeltaArray
 from pydantic import BaseModel, __version__ as pydantic_version
-from upgini.autofe.operand import PandasOperand, ParametrizedOperand
+from upgini.autofe.operator import PandasOperand, ParametrizedOperand
 def get_pydantic_version():

upgini/autofe/feature.py CHANGED Viewed

@@ -7,7 +7,7 @@ import pandas as pd
 from pandas._typing import DtypeObj
 from upgini.autofe.all_operands import find_op
-from upgini.autofe.operand import Operand, PandasOperand
+from upgini.autofe.operator import Operand, PandasOperand
 class Column:

upgini/autofe/groupby.py CHANGED Viewed

@@ -2,7 +2,7 @@ from typing import Optional
 import pandas as pd
-from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
+from upgini.autofe.operator import PandasOperand, ParametrizedOperand, VectorizableMixin
 class GroupByThenAgg(

upgini/autofe/{operand.py → operator.py} RENAMED Viewed

@@ -6,7 +6,7 @@ import pandas as pd
 from pydantic import BaseModel
-class OperandRegistry(type(BaseModel)):
+class OperatorRegistry(type(BaseModel)):
     _registry = {}
     _parametrized_registry = []
@@ -46,7 +46,7 @@ class OperandRegistry(type(BaseModel)):
         return None
-class Operand(BaseModel, metaclass=OperandRegistry):
+class Operand(BaseModel, metaclass=OperatorRegistry):
     name: Optional[str] = None
     alias: Optional[str] = None
     is_unary: bool = False

upgini/autofe/timeseries.py ADDED Viewed

@@ -0,0 +1,200 @@
+import abc
+from typing import Dict, List, Optional
+import pandas as pd
+from upgini.autofe.operator import PandasOperand, ParametrizedOperand
+try:
+    from pydantic import field_validator as validator  # V2
+except ImportError:
+    from pydantic import validator  # V1
+class TimeSeriesBase(PandasOperand, abc.ABC):
+    is_vector: bool = True
+    date_unit: Optional[str] = None
+    offset_size: int = 0
+    offset_unit: str = "D"
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = super().get_params()
+        res.update(
+            {
+                "date_unit": self.date_unit,
+                "offset_size": self.offset_size,
+                "offset_unit": self.offset_unit,
+            }
+        )
+        return res
+    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
+        # assuming first is date, last is value, rest is group columns
+        date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
+        ts = pd.concat([date] + data[1:], axis=1)
+        ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
+        ts.set_index(date.name, inplace=True)
+        ts = ts[ts.index.notna()].sort_index()
+        ts = (
+            ts.groupby([c.name for c in data[1:-1]], group_keys=True)
+            .apply(self._shift)[data[-1].name]
+            .to_frame()
+            .reset_index()
+            .set_index(date.name)
+            .groupby([c.name for c in data[1:-1]])
+            if len(data) > 2
+            else self._shift(ts)
+        )
+        ts = self._aggregate(ts)
+        ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
+        ts.index = date.index
+        return ts.iloc[:, -1]
+    def _shift(self, ts: pd.DataFrame) -> pd.DataFrame:
+        if self.offset_size > 0:
+            return ts.iloc[:, :-1].merge(
+                ts.iloc[:, -1].shift(freq=f"{self.offset_size}{self.offset_unit}"),
+                left_index=True,
+                right_index=True,
+            )
+        return ts
+    @abc.abstractmethod
+    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
+        pass
+_roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean(), "last": lambda x: x[-1]}
+class Roll(TimeSeriesBase, ParametrizedOperand):
+    aggregation: str
+    window_size: int = 1
+    window_unit: str = "D"
+    @validator("window_unit")
+    @classmethod
+    def validate_window_unit(cls, v: str) -> str:
+        try:
+            pd.tseries.frequencies.to_offset(v)
+            return v
+        except ValueError:
+            raise ValueError(
+                f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
+            )
+    def to_formula(self) -> str:
+        roll_component = f"roll_{self.window_size}{self.window_unit}"
+        if self.offset_size > 0:
+            roll_component += f"_offset_{self.offset_size}{self.offset_unit}"
+        return f"{roll_component}_{self.aggregation}"
+    @classmethod
+    def from_formula(cls, formula: str) -> Optional["Roll"]:
+        import re
+        # Try matching pattern with offset first
+        pattern_with_offset = r"^roll_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])_(\w+)$"
+        match_with_offset = re.match(pattern_with_offset, formula)
+        if match_with_offset:
+            window_size = int(match_with_offset.group(1))
+            window_unit = match_with_offset.group(2)
+            offset_size = int(match_with_offset.group(3))
+            offset_unit = match_with_offset.group(4)
+            aggregation = match_with_offset.group(5)
+            return cls(
+                window_size=window_size,
+                window_unit=window_unit,
+                offset_size=offset_size,
+                offset_unit=offset_unit,
+                aggregation=aggregation,
+            )
+        # If no offset pattern found, try basic pattern
+        pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
+        match = re.match(pattern, formula)
+        if not match:
+            return None
+        window_size = int(match.group(1))
+        window_unit = match.group(2)
+        aggregation = match.group(3)
+        return cls(window_size=window_size, window_unit=window_unit, aggregation=aggregation)
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = super().get_params()
+        res.update(
+            {
+                "window_size": self.window_size,
+                "window_unit": self.window_unit,
+                "aggregation": self.aggregation,
+            }
+        )
+        return res
+    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
+        return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=1).agg(
+            _roll_aggregations.get(self.aggregation, self.aggregation)
+        )
+class Lag(TimeSeriesBase, ParametrizedOperand):
+    lag_size: int
+    lag_unit: str = "D"
+    def to_formula(self) -> str:
+        lag_component = f"lag_{self.lag_size}{self.lag_unit}"
+        if self.offset_size > 0:
+            lag_component += f"_offset_{self.offset_size}{self.offset_unit}"
+        return lag_component
+    @classmethod
+    def from_formula(cls, formula: str) -> Optional["Lag"]:
+        import re
+        # Try matching pattern with offset first
+        pattern_with_offset = r"^lag_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])$"
+        match_with_offset = re.match(pattern_with_offset, formula)
+        if match_with_offset:
+            lag_size = int(match_with_offset.group(1))
+            lag_unit = match_with_offset.group(2)
+            offset_size = int(match_with_offset.group(3))
+            offset_unit = match_with_offset.group(4)
+            return cls(
+                lag_size=lag_size,
+                lag_unit=lag_unit,
+                offset_size=offset_size,
+                offset_unit=offset_unit,
+            )
+        # If no offset pattern found, try basic pattern
+        pattern = r"^lag_(\d+)([a-zA-Z])$"
+        match = re.match(pattern, formula)
+        if not match:
+            return None
+        lag_size = int(match.group(1))
+        lag_unit = match.group(2)
+        return cls(lag_size=lag_size, lag_unit=lag_unit)
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = super().get_params()
+        res.update(
+            {
+                "lag_size": self.lag_size,
+                "lag_unit": self.lag_unit,
+            }
+        )
+        return res
+    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
+        lag_window = self.lag_size + 1
+        return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=lag_window).agg(lambda x: x[0])

upgini/autofe/unary.py CHANGED Viewed

@@ -2,7 +2,7 @@ from typing import Dict, Optional
 import numpy as np
 import pandas as pd
-from upgini.autofe.operand import PandasOperand, VectorizableMixin
+from upgini.autofe.operator import PandasOperand, VectorizableMixin
 class Abs(PandasOperand, VectorizableMixin):

upgini/autofe/vector.py CHANGED Viewed

@@ -1,14 +1,8 @@
-import abc
-from typing import Dict, List, Optional
+from typing import List, Optional
 import pandas as pd
-try:
-    from pydantic import field_validator as validator  # V2
-except ImportError:
-    from pydantic import validator  # V1
-from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
+from upgini.autofe.operator import PandasOperand, VectorizableMixin
 class Mean(PandasOperand, VectorizableMixin):
@@ -28,193 +22,3 @@ class Sum(PandasOperand, VectorizableMixin):
     def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
         return pd.DataFrame(data).T.fillna(0).sum(axis=1)
-class TimeSeriesBase(PandasOperand, abc.ABC):
-    is_vector: bool = True
-    date_unit: Optional[str] = None
-    offset_size: int = 0
-    offset_unit: str = "D"
-    def get_params(self) -> Dict[str, Optional[str]]:
-        res = super().get_params()
-        res.update(
-            {
-                "date_unit": self.date_unit,
-                "offset_size": self.offset_size,
-                "offset_unit": self.offset_unit,
-            }
-        )
-        return res
-    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
-        # assuming first is date, last is value, rest is group columns
-        date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
-        ts = pd.concat([date] + data[1:], axis=1)
-        ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
-        ts.set_index(date.name, inplace=True)
-        ts = ts[ts.index.notna()].sort_index()
-        ts = (
-            ts.groupby([c.name for c in data[1:-1]])
-            .apply(self._shift)[data[-1].name]
-            .to_frame()
-            .reset_index()
-            .set_index(date.name)
-            .groupby([c.name for c in data[1:-1]])
-            if len(data) > 2
-            else self._shift(ts)
-        )
-        ts = self._aggregate(ts)
-        ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
-        ts.index = date.index
-        return ts.iloc[:, -1]
-    def _shift(self, ts: pd.DataFrame) -> pd.DataFrame:
-        if self.offset_size > 0:
-            return ts.iloc[:, :-1].merge(
-                ts.iloc[:, -1].shift(freq=f"{self.offset_size}{self.offset_unit}"),
-                left_index=True,
-                right_index=True,
-            )
-        return ts
-    @abc.abstractmethod
-    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
-        pass
-_roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean()}
-class Roll(TimeSeriesBase, ParametrizedOperand):
-    aggregation: str
-    window_size: int = 1
-    window_unit: str = "D"
-    @validator("window_unit")
-    @classmethod
-    def validate_window_unit(cls, v: str) -> str:
-        try:
-            pd.tseries.frequencies.to_offset(v)
-            return v
-        except ValueError:
-            raise ValueError(
-                f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
-            )
-    def to_formula(self) -> str:
-        roll_component = f"roll_{self.window_size}{self.window_unit}"
-        if self.offset_size > 0:
-            roll_component += f"_offset_{self.offset_size}{self.offset_unit}"
-        return f"{roll_component}_{self.aggregation}"
-    @classmethod
-    def from_formula(cls, formula: str) -> Optional["Roll"]:
-        import re
-        # Try matching pattern with offset first
-        pattern_with_offset = r"^roll_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])_(\w+)$"
-        match_with_offset = re.match(pattern_with_offset, formula)
-        if match_with_offset:
-            window_size = int(match_with_offset.group(1))
-            window_unit = match_with_offset.group(2)
-            offset_size = int(match_with_offset.group(3))
-            offset_unit = match_with_offset.group(4)
-            aggregation = match_with_offset.group(5)
-            return cls(
-                window_size=window_size,
-                window_unit=window_unit,
-                offset_size=offset_size,
-                offset_unit=offset_unit,
-                aggregation=aggregation,
-            )
-        # If no offset pattern found, try basic pattern
-        pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
-        match = re.match(pattern, formula)
-        if not match:
-            return None
-        window_size = int(match.group(1))
-        window_unit = match.group(2)
-        aggregation = match.group(3)
-        return cls(window_size=window_size, window_unit=window_unit, aggregation=aggregation)
-    def get_params(self) -> Dict[str, Optional[str]]:
-        res = super().get_params()
-        res.update(
-            {
-                "window_size": self.window_size,
-                "window_unit": self.window_unit,
-                "aggregation": self.aggregation,
-            }
-        )
-        return res
-    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
-        return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=1).agg(
-            _roll_aggregations.get(self.aggregation, self.aggregation)
-        )
-class Lag(TimeSeriesBase, ParametrizedOperand):
-    lag_size: int
-    lag_unit: str = "D"
-    def to_formula(self) -> str:
-        lag_component = f"lag_{self.lag_size}{self.lag_unit}"
-        if self.offset_size > 0:
-            lag_component += f"_offset_{self.offset_size}{self.offset_unit}"
-        return lag_component
-    @classmethod
-    def from_formula(cls, formula: str) -> Optional["Lag"]:
-        import re
-        # Try matching pattern with offset first
-        pattern_with_offset = r"^lag_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])$"
-        match_with_offset = re.match(pattern_with_offset, formula)
-        if match_with_offset:
-            lag_size = int(match_with_offset.group(1))
-            lag_unit = match_with_offset.group(2)
-            offset_size = int(match_with_offset.group(3))
-            offset_unit = match_with_offset.group(4)
-            return cls(
-                lag_size=lag_size,
-                lag_unit=lag_unit,
-                offset_size=offset_size,
-                offset_unit=offset_unit,
-            )
-        # If no offset pattern found, try basic pattern
-        pattern = r"^lag_(\d+)([a-zA-Z])$"
-        match = re.match(pattern, formula)
-        if not match:
-            return None
-        lag_size = int(match.group(1))
-        lag_unit = match.group(2)
-        return cls(lag_size=lag_size, lag_unit=lag_unit)
-    def get_params(self) -> Dict[str, Optional[str]]:
-        res = super().get_params()
-        res.update(
-            {
-                "lag_size": self.lag_size,
-                "lag_unit": self.lag_unit,
-            }
-        )
-        return res
-    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
-        lag_window = self.lag_size + 1
-        return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=lag_window).agg(lambda x: x[0])

upgini/data_source/data_source_publisher.py CHANGED Viewed

@@ -63,6 +63,7 @@ class DataSourcePublisher:
         keep_features: Optional[List[str]] = None,
         date_features: Optional[List[str]] = None,
         date_vector_features: Optional[List[str]] = None,
+        date_features_format: Optional[str] = None,
         generate_runtime_embeddings: Optional[List[str]] = None,
         exclude_raw: Optional[List[str]] = None,
         _force_generation=False,
@@ -160,13 +161,17 @@ class DataSourcePublisher:
                 if keep_features is not None:
                     request["keepFeatures"] = keep_features
                 if date_features is not None:
-                    if date_format is None:
-                        raise ValidationError("date_format should be presented if you use date features")
+                    if date_features_format is None:
+                        raise ValidationError("date_features_format should be presented if you use date features")
                     request["dateFeatures"] = date_features
+                    request["dateFeaturesFormat"] = date_features_format
                 if date_vector_features is not None:
-                    if date_format is None:
-                        raise ValidationError("date_format should be presented if you use date vector features")
+                    if date_features_format is None:
+                        raise ValidationError(
+                            "date_features_format should be presented if you use date vector features"
+                        )
                     request["dateVectorFeatures"] = date_vector_features
+                    request["dateFeaturesFormat"] = date_features_format
                 if generate_runtime_embeddings is not None:
                     request["generateRuntimeEmbeddingsFeatures"] = generate_runtime_embeddings
                 if exclude_raw is not None:

upgini 1.2.60a3792.dev2__py3-none-any.whl → 1.2.62a3818.dev1__py3-none-any.whl

Potentially problematic release.

upgini 1.2.60a3792.dev2__py3-none-any.whl → 1.2.62a3818.dev1__py3-none-any.whl