PyPI - upgini - Versions diffs - 1.2.63__py3-none-any.whl → 1.2.65a3818.dev6__py3-none-any.whl - Mend

upgini 1.2.63__py3-none-any.whl → 1.2.65a3818.dev6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (23) hide show

upgini/__about__.py +1 -1
upgini/autofe/{all_operands.py → all_operators.py} +2 -2
upgini/autofe/binary.py +11 -11
upgini/autofe/date.py +6 -6
upgini/autofe/feature.py +8 -8
upgini/autofe/groupby.py +6 -6
upgini/autofe/{operand.py → operator.py} +16 -11
upgini/autofe/timeseries/__init__.py +23 -0
upgini/autofe/timeseries/base.py +105 -0
upgini/autofe/timeseries/cross.py +139 -0
upgini/autofe/timeseries/delta.py +119 -0
upgini/autofe/timeseries/lag.py +68 -0
upgini/autofe/timeseries/roll.py +92 -0
upgini/autofe/timeseries/trend.py +61 -0
upgini/autofe/timeseries/volatility.py +259 -0
upgini/autofe/unary.py +11 -11
upgini/autofe/vector.py +4 -200
upgini/features_enricher.py +2 -2
upgini/utils/sort.py +4 -2
{upgini-1.2.63.dist-info → upgini-1.2.65a3818.dev6.dist-info}/METADATA +1 -1
{upgini-1.2.63.dist-info → upgini-1.2.65a3818.dev6.dist-info}/RECORD +23 -15
{upgini-1.2.63.dist-info → upgini-1.2.65a3818.dev6.dist-info}/WHEEL +1 -1
{upgini-1.2.63.dist-info → upgini-1.2.65a3818.dev6.dist-info}/licenses/LICENSE +0 -0

upgini/autofe/timeseries/delta.py ADDED Viewed

@@ -0,0 +1,119 @@
+import pandas as pd
+from typing import Dict, Optional, Union
+from upgini.autofe.operator import ParametrizedOperator
+from upgini.autofe.timeseries.base import TimeSeriesBase
+from upgini.autofe.timeseries.lag import Lag
+class DeltaBase(TimeSeriesBase):
+    """Shared fields and difference helper for the delta time-series operators.
+
+    A delta is the last column of the input minus the lagged version of that
+    column, where the lag is produced by ``Lag`` built from ``delta_size`` and
+    ``delta_unit``.
+    """
+    # Size of the lag used for the difference; forwarded to ``Lag`` as ``lag_size``.
+    delta_size: int
+    # Unit of the lag used for the difference; forwarded to ``Lag`` as ``lag_unit``.
+    delta_unit: str = "D"
+    def get_params(self) -> Dict[str, Optional[str]]:
+        """Return the parent's parameters extended with ``delta_size`` and ``delta_unit``."""
+        res = super().get_params()
+        res.update(
+            {
+                "delta_size": self.delta_size,
+                "delta_unit": self.delta_unit,
+            }
+        )
+        return res
+    def _calculate_delta(self, x: Union[pd.DataFrame, pd.Series]) -> Union[pd.DataFrame, pd.Series]:
+        """Replace the last column of ``x`` with its difference from the lagged column.
+
+        Returns a Series when ``x`` is a Series, otherwise the whole frame with
+        its last column overwritten. The input object itself is left untouched.
+        """
+        return_series = isinstance(x, pd.Series)
+        # Copy explicitly: ``pd.DataFrame(x)`` may share memory with ``x``, and the
+        # column assignment below could then overwrite the caller's data in place.
+        x = pd.DataFrame(x, copy=True)
+        lag = Lag(lag_size=self.delta_size, lag_unit=self.delta_unit)
+        x.iloc[:, -1] = x.iloc[:, -1] - lag._aggregate(x.iloc[:, -1])
+        return x.iloc[:, -1] if return_series else x
+class Delta(DeltaBase, ParametrizedOperator):
+    """First-order delta operator, written as ``delta_<size><unit>`` in formulas."""
+    def to_formula(self) -> str:
+        """Render the operator as a formula string, passing it through ``_add_offset_to_formula``."""
+        return self._add_offset_to_formula(f"delta_{self.delta_size}{self.delta_unit}")
+    @classmethod
+    def from_formula(cls, formula: str) -> Optional["Delta"]:
+        """Build an instance from ``formula``; return ``None`` when it does not match."""
+        import re
+        pattern = r"delta_(\d+)([a-zA-Z])"
+        # The shared helper returns the offset parameters and the rest of the formula.
+        offset_kwargs, core = cls._parse_offset_from_formula(formula, pattern)
+        if core is None:
+            return None
+        parsed = re.match(f"^{pattern}$", core)
+        if not parsed:
+            return None
+        kwargs = {
+            "delta_size": int(parsed.group(1)),
+            "delta_unit": parsed.group(2),
+        }
+        if offset_kwargs:
+            kwargs.update(offset_kwargs)
+        return cls(**kwargs)
+    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
+        """Apply ``_calculate_delta`` to every column and keep only the last one."""
+        deltas = ts.apply(self._calculate_delta)
+        return deltas.iloc[:, [-1]]
+class Delta2(DeltaBase, ParametrizedOperator):
+    """Second-order delta operator (delta of delta), written as ``delta2_<size><unit>``."""
+    def to_formula(self) -> str:
+        """Render the operator as a formula string, passing it through ``_add_offset_to_formula``."""
+        return self._add_offset_to_formula(f"delta2_{self.delta_size}{self.delta_unit}")
+    @classmethod
+    def from_formula(cls, formula: str) -> Optional["Delta2"]:
+        """Build an instance from ``formula``; return ``None`` when it does not match."""
+        import re
+        pattern = r"delta2_(\d+)([a-zA-Z])"
+        # The shared helper returns the offset parameters and the rest of the formula.
+        offset_kwargs, core = cls._parse_offset_from_formula(formula, pattern)
+        if core is None:
+            return None
+        parsed = re.match(f"^{pattern}$", core)
+        if not parsed:
+            return None
+        kwargs = {
+            "delta_size": int(parsed.group(1)),
+            "delta_unit": parsed.group(2),
+        }
+        if offset_kwargs:
+            kwargs.update(offset_kwargs)
+        return cls(**kwargs)
+    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
+        """Apply ``_calculate_delta2`` to every column and keep only the last one."""
+        second_deltas = ts.apply(self._calculate_delta2)
+        return second_deltas.iloc[:, [-1]]
+    def _calculate_delta2(self, x):
+        """Run ``_calculate_delta`` twice: once on ``x``, then on that result."""
+        return self._calculate_delta(self._calculate_delta(x))

upgini/autofe/timeseries/lag.py ADDED Viewed

@@ -0,0 +1,68 @@
+import numpy as np
+import pandas as pd
+from typing import Dict, Optional
+from upgini.autofe.operator import ParametrizedOperator
+from upgini.autofe.timeseries.base import TimeSeriesBase
+class Lag(TimeSeriesBase, ParametrizedOperator):
+    """Lag operator, written as ``lag_<size><unit>`` in formulas.
+
+    For each row it looks at a trailing window of ``lag_size + 1`` units and
+    returns the oldest value in that window, provided the window actually
+    reaches back at least ``lag_size`` units; otherwise the result is NaN.
+    """
+    # Number of ``lag_unit`` periods to look back.
+    lag_size: int
+    # Unit combined with ``lag_size`` for the rolling window and for ``pd.Timedelta``.
+    lag_unit: str = "D"
+    def to_formula(self) -> str:
+        """Render the operator as a formula string, passing it through ``_add_offset_to_formula``."""
+        base_formula = f"lag_{self.lag_size}{self.lag_unit}"
+        return self._add_offset_to_formula(base_formula)
+    @classmethod
+    def from_formula(cls, formula: str) -> Optional["Lag"]:
+        """Build an instance from ``formula``; return ``None`` when it does not match."""
+        # Base regex for Lag class
+        base_regex = r"lag_(\d+)([a-zA-Z])"
+        # Parse offset first
+        offset_params, remaining_formula = cls._parse_offset_from_formula(formula, base_regex)
+        if remaining_formula is None:
+            return None
+        # Now parse the lag part
+        import re
+        match = re.match(f"^{base_regex}$", remaining_formula)
+        if not match:
+            return None
+        lag_size = int(match.group(1))
+        lag_unit = match.group(2)
+        # Create instance with appropriate parameters
+        params = {
+            "lag_size": lag_size,
+            "lag_unit": lag_unit,
+        }
+        if offset_params:
+            params.update(offset_params)
+        return cls(**params)
+    def get_params(self) -> Dict[str, Optional[str]]:
+        """Return the parent's parameters extended with ``lag_size`` and ``lag_unit``."""
+        res = super().get_params()
+        res.update(
+            {
+                "lag_size": self.lag_size,
+                "lag_unit": self.lag_unit,
+            }
+        )
+        return res
+    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
+        """Compute the lagged values over a time-based rolling window."""
+        # One extra unit so the window can contain an observation that is a full
+        # ``lag_size`` units older than the current row.
+        lag_window = self.lag_size + 1
+        return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=1).agg(self._lag)
+    def _lag(self, x):
+        """Return the oldest value of window ``x``, or NaN if the window is too short."""
+        if x.index.min() > (x.index.max() - pd.Timedelta(self.lag_size, self.lag_unit)):
+            return np.nan
+        # Explicit positional access: plain ``x[0]`` on a non-integer index relies on
+        # a positional fallback that pandas has deprecated.
+        return x.iloc[0]

upgini/autofe/timeseries/roll.py ADDED Viewed

@@ -0,0 +1,92 @@
+import pandas as pd
+from typing import Dict, Optional
+from upgini.autofe.operator import ParametrizedOperator
+from upgini.autofe.timeseries.base import TimeSeriesBase
+# Roll aggregation functions
+# Custom rolling aggregations, looked up by name; each callable receives the
+# values of one window as a pandas Series.
+roll_aggregations = {
+    # Last value of the window divided by the window mean. ``iloc`` makes the
+    # positional access explicit; plain ``x[-1]`` on a non-integer index relies on
+    # a positional fallback that pandas has deprecated.
+    "norm_mean": lambda x: x.iloc[-1] / x.mean(),
+    "q25": lambda x: x.quantile(0.25),
+    "q75": lambda x: x.quantile(0.75),
+    # Interquartile range of the window.
+    "iqr": lambda x: x.quantile(0.75) - x.quantile(0.25),
+}
+try:
+    from pydantic import field_validator as validator  # V2
+except ImportError:
+    from pydantic import validator  # V1
+class Roll(TimeSeriesBase, ParametrizedOperator):
+    """Rolling-window aggregation operator, written as ``roll_<size><unit>_<aggregation>``."""
+    # Name of the aggregation: a key of ``roll_aggregations`` or a name passed to pandas as is.
+    aggregation: str
+    # Window length is ``window_size`` followed by ``window_unit``.
+    window_size: int = 1
+    window_unit: str = "D"
+    @validator("window_unit")
+    @classmethod
+    def validate_window_unit(cls, v: str) -> str:
+        """Accept ``v`` only if pandas can parse it as a frequency string."""
+        try:
+            pd.tseries.frequencies.to_offset(v)
+        except ValueError:
+            raise ValueError(
+                f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
+            )
+        return v
+    def to_formula(self) -> str:
+        """Render the operator as window, then aggregation, then ``_add_offset_to_formula``."""
+        window_part = f"roll_{self.window_size}{self.window_unit}"
+        return self._add_offset_to_formula(f"{window_part}_{self.aggregation}")
+    @classmethod
+    def from_formula(cls, formula: str) -> Optional["Roll"]:
+        """Build an instance from ``formula``; return ``None`` when it does not match."""
+        import re
+        pattern = r"roll_(\d+)([a-zA-Z])_(\w+)"
+        # The shared helper returns the offset parameters and the rest of the formula.
+        offset_kwargs, core = cls._parse_offset_from_formula(formula, pattern)
+        if core is None:
+            return None
+        parsed = re.match(f"^{pattern}$", core)
+        if not parsed:
+            return None
+        kwargs = {
+            "window_size": int(parsed.group(1)),
+            "window_unit": parsed.group(2),
+            "aggregation": parsed.group(3),
+        }
+        if offset_kwargs:
+            kwargs.update(offset_kwargs)
+        return cls(**kwargs)
+    def get_params(self) -> Dict[str, Optional[str]]:
+        """Return the parent's parameters extended with the window and aggregation settings."""
+        params = super().get_params()
+        params.update(
+            {
+                "window_size": self.window_size,
+                "window_unit": self.window_unit,
+                "aggregation": self.aggregation,
+            }
+        )
+        return params
+    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
+        """Aggregate ``ts`` over a time-based rolling window."""
+        # Fall back to the raw name so pandas can resolve its own aggregations.
+        agg_func = roll_aggregations.get(self.aggregation, self.aggregation)
+        window = f"{self.window_size}{self.window_unit}"
+        return ts.rolling(window, min_periods=1).agg(agg_func)

upgini/autofe/timeseries/trend.py ADDED Viewed

@@ -0,0 +1,61 @@
+from typing import Dict, Optional, Union
+import numpy as np
+import pandas as pd
+from upgini.autofe.timeseries.base import TimeSeriesBase
+class TrendCoefficient(TimeSeriesBase):
+    """Trend operator: slope of a degree-1 least-squares fit over the resampled series.
+
+    Written as ``trend_coef`` in formulas.
+    """
+    name: str = "trend_coef"
+    # Resampling step is ``step_size`` followed by ``step_unit``.
+    step_size: int = 1
+    step_unit: str = "D"
+    def to_formula(self) -> str:
+        """Render the operator as a formula string, passing it through ``_add_offset_to_formula``."""
+        base_formula = "trend_coef"
+        return self._add_offset_to_formula(base_formula)
+    @classmethod
+    def from_formula(cls, formula: str) -> Optional["TrendCoefficient"]:
+        """Build an instance from ``formula``; return ``None`` when it does not match."""
+        # Base regex for TrendCoefficient class
+        base_regex = r"trend_coef"
+        # Parse offset first
+        offset_params, remaining_formula = cls._parse_offset_from_formula(formula, base_regex)
+        if remaining_formula is None:
+            return None
+        # Basic pattern (no offset)
+        if remaining_formula == "trend_coef":
+            params = {}
+            if offset_params:
+                params.update(offset_params)
+            return cls(**params)
+        return None
+    def get_params(self) -> Dict[str, Optional[str]]:
+        """Return the parent's parameters extended with the step and offset settings."""
+        res = super().get_params()
+        res.update(
+            {
+                "step_size": self.step_size,
+                "step_unit": self.step_unit,
+                "offset_size": self.offset_size,
+                "offset_unit": self.offset_unit,
+            }
+        )
+        return res
+    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
+        """Apply ``_trend_coef`` to every column, keep the last one, and replace NaN with 0."""
+        return ts.apply(self._trend_coef).iloc[:, [-1]].fillna(0)
+    def _trend_coef(self, x: Union[pd.DataFrame, pd.Series]) -> Union[pd.DataFrame, pd.Series]:
+        """Overwrite the last column of ``x`` with the fitted linear slope.
+
+        Returns a Series when ``x`` is a Series, otherwise the whole frame. The
+        input object itself is left untouched.
+        """
+        return_series = isinstance(x, pd.Series)
+        # Copy explicitly: ``pd.DataFrame(x)`` may share memory with ``x``, and the
+        # column assignment below could then overwrite the caller's data in place.
+        x = pd.DataFrame(x, copy=True)
+        # ``ffill().bfill()`` keeps the original fill order (forward, then backward)
+        # without the deprecated ``fillna(method=...)`` calls.
+        resampled = x.iloc[:, -1].resample(f"{self.step_size}{self.step_unit}").ffill().bfill()
+        if len(resampled) < 2:
+            # A slope is undefined for fewer than two points; ``_aggregate`` turns NaN into 0.
+            slope = np.nan
+        else:
+            idx = np.arange(len(resampled))
+            slope = np.polyfit(idx, resampled, 1)[0]
+        x.iloc[:, -1] = slope
+        return x.iloc[:, -1] if return_series else x

upgini/autofe/timeseries/volatility.py ADDED Viewed

@@ -0,0 +1,259 @@
+from typing import Dict, Optional, Union
+import numpy as np
+import pandas as pd
+from upgini.autofe.operator import ParametrizedOperator
+from upgini.autofe.timeseries.base import TimeSeriesBase
+class VolatilityBase(TimeSeriesBase):
+    """Base class for the volatility operators; provides the shared returns computation."""
+    @staticmethod
+    def _get_returns(ts: pd.Series, freq: str) -> pd.Series:
+        """Percentage change of ``ts`` over ``freq``, with missing results replaced by 0."""
+        pct = ts.pct_change(freq=freq)
+        return pct.fillna(0)
+class EWMAVolatility(VolatilityBase, ParametrizedOperator):
+    """Exponentially weighted standard deviation of returns, written as ``ewma_vol_<window_size>``."""
+    # Returns are computed over ``step_size`` followed by ``step_unit``.
+    step_size: int = 1
+    step_unit: str = "D"
+    # Passed to ``ewm`` as ``span``.
+    window_size: int
+    def to_formula(self) -> str:
+        """Render the operator as a formula string, passing it through ``_add_offset_to_formula``."""
+        return self._add_offset_to_formula(f"ewma_vol_{self.window_size}")
+    @classmethod
+    def from_formula(cls, formula: str) -> Optional["EWMAVolatility"]:
+        """Build an instance from ``formula``; return ``None`` when it does not match."""
+        import re
+        pattern = r"ewma_vol_(\d+)"
+        offset_kwargs, core = cls._parse_offset_from_formula(formula, pattern)
+        if core is None:
+            return None
+        parsed = re.match(f"^{pattern}$", core)
+        if not parsed:
+            return None
+        kwargs = {
+            "window_size": int(parsed.group(1)),
+        }
+        if offset_kwargs:
+            kwargs.update(offset_kwargs)
+        return cls(**kwargs)
+    def get_params(self) -> Dict[str, Optional[str]]:
+        """Return the parent's parameters extended with the step and window settings."""
+        params = super().get_params()
+        params.update(
+            {
+                "step_size": self.step_size,
+                "step_unit": self.step_unit,
+                "window_size": self.window_size,
+            }
+        )
+        return params
+    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
+        """Apply ``_ewma_vol`` to every column of ``ts``."""
+        # NOTE(review): unlike the sibling operators, no ``.iloc[:, [-1]]`` is applied
+        # here - confirm that returning every column is intended.
+        return ts.apply(self._ewma_vol)
+    def _ewma_vol(self, x):
+        """Exponentially weighted std of the returns of the last column of ``x``."""
+        last_column = pd.DataFrame(x).iloc[:, -1]
+        step = f"{self.step_size}{self.step_unit}"
+        returns = self._get_returns(last_column, step)
+        return returns.ewm(span=self.window_size).std()
+class RollingVolBase(VolatilityBase):
+    """Shared fields and rolling standard-deviation helper for the rolling volatility operators."""
+    # Returns are computed over ``step_size`` followed by ``step_unit``.
+    step_size: int = 1
+    step_unit: str = "D"
+    # Rolling window is ``window_size`` followed by ``window_unit``.
+    window_size: int
+    window_unit: str = "D"
+    def get_params(self) -> Dict[str, Optional[str]]:
+        """Return the parent's parameters extended with the step and window settings."""
+        res = super().get_params()
+        res.update(
+            {
+                "step_size": self.step_size,
+                "step_unit": self.step_unit,
+                "window_size": self.window_size,
+                "window_unit": self.window_unit,
+            }
+        )
+        return res
+    def _rolling_vol(
+        self, x: Union[pd.DataFrame, pd.Series], window_size: int, window_unit: str, abs_returns: bool = False
+    ) -> Union[pd.DataFrame, pd.Series]:
+        """Replace the last column of ``x`` with the rolling std of its returns.
+
+        ``abs_returns`` switches to absolute returns before the rolling std.
+        Returns a Series when ``x`` is a Series, otherwise the whole frame. The
+        input object itself is left untouched, so callers may invoke this more
+        than once on the same ``x``.
+        """
+        return_series = isinstance(x, pd.Series)
+        # Copy explicitly: ``pd.DataFrame(x)`` may share memory with ``x``, and the
+        # column assignment below could then overwrite the caller's data in place.
+        x = pd.DataFrame(x, copy=True)
+        returns = self._get_returns(x.iloc[:, -1], f"{self.step_size}{self.step_unit}")
+        if abs_returns:
+            returns = returns.abs()
+        x.iloc[:, -1] = returns.rolling(f"{window_size}{window_unit}", min_periods=1).std()
+        return x.iloc[:, -1] if return_series else x
+class RollingVolatility(RollingVolBase, ParametrizedOperator):
+    """Rolling volatility operator, written as ``roll_vol_<size><unit>``."""
+    # When true, the rolling std is taken over absolute returns.
+    abs_returns: bool = False
+    def to_formula(self) -> str:
+        """Render the operator as a formula string, passing it through ``_add_offset_to_formula``."""
+        return self._add_offset_to_formula(f"roll_vol_{self.window_size}{self.window_unit}")
+    @classmethod
+    def from_formula(cls, formula: str) -> Optional["RollingVolatility"]:
+        """Build an instance from ``formula``; return ``None`` when it does not match."""
+        import re
+        pattern = r"roll_vol_(\d+)([a-zA-Z])"
+        offset_kwargs, core = cls._parse_offset_from_formula(formula, pattern)
+        if core is None:
+            return None
+        parsed = re.match(f"^{pattern}$", core)
+        if not parsed:
+            return None
+        kwargs = {
+            "window_size": int(parsed.group(1)),
+            "window_unit": parsed.group(2),
+        }
+        if offset_kwargs:
+            kwargs.update(offset_kwargs)
+        return cls(**kwargs)
+    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
+        """Apply ``_rolling_vol`` to every column and keep only the last one."""
+        vol_kwargs = {
+            "window_size": self.window_size,
+            "window_unit": self.window_unit,
+            "abs_returns": self.abs_returns,
+        }
+        volatility = ts.apply(self._rolling_vol, **vol_kwargs)
+        return volatility.iloc[:, [-1]]
+class RollingVolatility2(RollingVolBase, ParametrizedOperator):
+    """
+    Volatility-of-volatility operator, written as ``roll_vol2_<size><unit>``.
+    The inner and outer volatilities both come from ``_rolling_vol`` with the same window.
+    """
+    def to_formula(self) -> str:
+        """Render the operator as a formula string, passing it through ``_add_offset_to_formula``."""
+        return self._add_offset_to_formula(f"roll_vol2_{self.window_size}{self.window_unit}")
+    @classmethod
+    def from_formula(cls, formula: str) -> Optional["RollingVolatility2"]:
+        """Build an instance from ``formula``; return ``None`` when it does not match."""
+        import re
+        pattern = r"roll_vol2_(\d+)([a-zA-Z])"
+        offset_kwargs, core = cls._parse_offset_from_formula(formula, pattern)
+        if core is None:
+            return None
+        parsed = re.match(f"^{pattern}$", core)
+        if not parsed:
+            return None
+        kwargs = {
+            "window_size": int(parsed.group(1)),
+            "window_unit": parsed.group(2),
+        }
+        if offset_kwargs:
+            kwargs.update(offset_kwargs)
+        return cls(**kwargs)
+    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
+        """Apply ``_vol_on_vol`` to every column and keep only the last one."""
+        result = ts.apply(self._vol_on_vol)
+        return result.iloc[:, [-1]]
+    def _vol_on_vol(self, x: Union[pd.DataFrame, pd.Series]) -> Union[pd.DataFrame, pd.Series]:
+        """Rolling volatility (on absolute returns) followed by rolling volatility of that result."""
+        inner = self._rolling_vol(x, self.window_size, self.window_unit, abs_returns=True)
+        return self._rolling_vol(inner, self.window_size, self.window_unit, abs_returns=False)
+class VolatilityRatio(RollingVolBase, ParametrizedOperator):
+    """
+    Ratio of short-window volatility to long-window volatility, written as
+    ``vol_ratio_<short_size><short_unit>_to_<size><unit>``.
+    Both volatilities come from ``_rolling_vol``.
+    """
+    # Short window is ``short_window_size`` followed by ``short_window_unit``.
+    short_window_size: int
+    short_window_unit: str = "D"
+    def to_formula(self) -> str:
+        """Render the operator as a formula string, passing it through ``_add_offset_to_formula``."""
+        short_part = f"{self.short_window_size}{self.short_window_unit}"
+        long_part = f"{self.window_size}{self.window_unit}"
+        return self._add_offset_to_formula(f"vol_ratio_{short_part}_to_{long_part}")
+    @classmethod
+    def from_formula(cls, formula: str) -> Optional["VolatilityRatio"]:
+        """Build an instance from ``formula``; return ``None`` when it does not match."""
+        import re
+        pattern = r"vol_ratio_(\d+)([a-zA-Z])_to_(\d+)([a-zA-Z])"
+        offset_kwargs, core = cls._parse_offset_from_formula(formula, pattern)
+        if core is None:
+            return None
+        parsed = re.match(f"^{pattern}$", core)
+        if not parsed:
+            return None
+        kwargs = {
+            "short_window_size": int(parsed.group(1)),
+            "short_window_unit": parsed.group(2),
+            "window_size": int(parsed.group(3)),
+            "window_unit": parsed.group(4),
+        }
+        if offset_kwargs:
+            kwargs.update(offset_kwargs)
+        return cls(**kwargs)
+    def get_params(self) -> Dict[str, Optional[str]]:
+        """Return the parent's parameters extended with the short-window settings."""
+        params = super().get_params()
+        params.update(
+            {
+                "short_window_size": self.short_window_size,
+                "short_window_unit": self.short_window_unit,
+            }
+        )
+        return params
+    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
+        """Apply ``_vol_ratio`` to every column and keep only the last one."""
+        ratios = ts.apply(self._vol_ratio)
+        return ratios.iloc[:, [-1]]
+    def _vol_ratio(self, x: Union[pd.DataFrame, pd.Series]) -> Union[pd.DataFrame, pd.Series]:
+        """Short-window volatility divided by long-window volatility, with division errors cleaned up."""
+        short_vol = self._rolling_vol(x, self.short_window_size, self.short_window_unit)
+        long_vol = self._rolling_vol(x, self.window_size, self.window_unit)
+        return VolatilityRatio._handle_div_errors(short_vol / long_vol)
+    @staticmethod
+    def _handle_div_errors(x: pd.Series) -> pd.Series:
+        """Replace infinities and NaN in ``x`` with 1."""
+        without_inf = x.replace([np.inf, -np.inf], np.nan)
+        return without_inf.fillna(1)

upgini/autofe/unary.py CHANGED Viewed

@@ -2,10 +2,10 @@ from typing import Dict, Optional
 import numpy as np
 import pandas as pd
-from upgini.autofe.operand import PandasOperand, VectorizableMixin
+from upgini.autofe.operator import PandasOperator, VectorizableMixin
-class Abs(PandasOperand, VectorizableMixin):
+class Abs(PandasOperator, VectorizableMixin):
     name: str = "abs"
     is_unary: bool = True
     is_vectorizable: bool = True
@@ -20,7 +20,7 @@ class Abs(PandasOperand, VectorizableMixin):
         # return data.abs()
-class Log(PandasOperand, VectorizableMixin):
+class Log(PandasOperator, VectorizableMixin):
     name: str = "log"
     is_unary: bool = True
     is_vectorizable: bool = True
@@ -34,7 +34,7 @@ class Log(PandasOperand, VectorizableMixin):
         return self._round_value(np.log(data.replace(0, np.nan).abs()), 10)
-class Sqrt(PandasOperand, VectorizableMixin):
+class Sqrt(PandasOperator, VectorizableMixin):
     name: str = "sqrt"
     is_unary: bool = True
     is_vectorizable: bool = True
@@ -48,7 +48,7 @@ class Sqrt(PandasOperand, VectorizableMixin):
         return self._round_value(np.sqrt(data.abs()))
-class Square(PandasOperand, VectorizableMixin):
+class Square(PandasOperator, VectorizableMixin):
     name: str = "square"
     is_unary: bool = True
     is_vectorizable: bool = True
@@ -61,7 +61,7 @@ class Square(PandasOperand, VectorizableMixin):
         return np.square(data)
-class Sigmoid(PandasOperand, VectorizableMixin):
+class Sigmoid(PandasOperator, VectorizableMixin):
     name: str = "sigmoid"
     is_unary: bool = True
     is_vectorizable: bool = True
@@ -75,7 +75,7 @@ class Sigmoid(PandasOperand, VectorizableMixin):
         return self._round_value(1 / (1 + np.exp(-data)))
-class Floor(PandasOperand, VectorizableMixin):
+class Floor(PandasOperator, VectorizableMixin):
     name: str = "floor"
     is_unary: bool = True
     is_vectorizable: bool = True
@@ -90,7 +90,7 @@ class Floor(PandasOperand, VectorizableMixin):
         return np.floor(data)
-class Residual(PandasOperand, VectorizableMixin):
+class Residual(PandasOperator, VectorizableMixin):
     name: str = "residual"
     is_unary: bool = True
     is_vectorizable: bool = True
@@ -104,7 +104,7 @@ class Residual(PandasOperand, VectorizableMixin):
         return data - np.floor(data)
-class Freq(PandasOperand):
+class Freq(PandasOperator):
     name: str = "freq"
     is_unary: bool = True
     output_type: Optional[str] = "float"
@@ -116,7 +116,7 @@ class Freq(PandasOperand):
         return self._loc(data, value_counts)
-class Norm(PandasOperand):
+class Norm(PandasOperator):
     name: str = "norm"
     is_unary: bool = True
     output_type: Optional[str] = "float"
@@ -148,7 +148,7 @@ class Norm(PandasOperand):
         return res
-class Embeddings(PandasOperand):
+class Embeddings(PandasOperator):
     name: str = "emb"
     is_unary: bool = True
     input_type: Optional[str] = "string"

upgini 1.2.63__py3-none-any.whl → 1.2.65a3818.dev6__py3-none-any.whl

Potentially problematic release.

upgini 1.2.63__py3-none-any.whl → 1.2.65a3818.dev6__py3-none-any.whl