PyPI - upgini - Versions diffs - 1.2.57a2__tar.gz → 1.2.57a3675.dev4__tar.gz - Mend

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (67)

{upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.57a2
+Version: 1.2.57a3675.dev4
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/

upgini-1.2.57a3675.dev4/src/upgini/__about__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "1.2.57a3675.dev4"

{upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/autofe/date.py RENAMED Viewed

@@ -64,6 +64,9 @@ class DateDiff(PandasOperand, DateDiffMixin):
         return res
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        if left.isna().all() or right.isna().all():
+            return pd.Series([None] * len(left))
         left = self._convert_to_date(left, self.left_unit)
         right = self._convert_to_date(right, self.right_unit)
         diff = self._convert_diff_to_unit(left.dt.date - right.dt.date)
@@ -142,6 +145,9 @@ class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
         return cls(aggregation=aggregation)
     def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+        if left.isna().all() or right.isna().all():
+            return pd.Series([None] * len(left), dtype=np.float64)
         left = self._convert_to_date(left, self.left_unit)
         right_mask = right.apply(lambda x: len(x) > 0)
         mask = left.notna() & right.notna() & right_mask
@@ -230,6 +236,8 @@ class DatePercentileBase(PandasOperand, abc.ABC):
         pass
     def _perc(self, f, bounds):
+        if f is None or np.isnan(f):
+            return np.nan
         hit = np.where(f >= np.array(bounds))[0]
         if hit.size > 0:
             return np.max(hit) + 1

{upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/autofe/feature.py RENAMED Viewed

@@ -26,18 +26,9 @@ class Column:
         return dict()
     def rename_columns(self, mapping: Dict[str, str]) -> "Column":
-        self.name = self._unhash(mapping.get(self.name) or self.name)
+        self.name = mapping.get(self.name) or self.name
         return self
-    def _unhash(self, feature_name: str) -> str:
-        last_component_idx = feature_name.rfind("_")
-        if not feature_name.startswith("f_"):
-            return feature_name  # etalon feature
-        elif last_component_idx == 1:
-            return feature_name[2:]  # fully hashed name, cannot unhash
-        else:
-            return feature_name[2:last_component_idx]
     def delete_data(self):
         self.data = None

{upgini-1.2.57a2 → upgini-1.2.57a3675.dev4}/src/upgini/autofe/vector.py RENAMED Viewed

@@ -2,7 +2,11 @@ import abc
 from typing import Dict, List, Optional
 import pandas as pd
-from pydantic import validator
+try:
+    from pydantic import field_validator as validator  # V2
+except ImportError:
+    from pydantic import validator  # V1
 from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
@@ -29,12 +33,16 @@ class Sum(PandasOperand, VectorizableMixin):
 class TimeSeriesBase(PandasOperand, abc.ABC):
     is_vector: bool = True
     date_unit: Optional[str] = None
+    offset_size: int = 0
+    offset_unit: str = "D"
     def get_params(self) -> Dict[str, Optional[str]]:
         res = super().get_params()
         res.update(
             {
                 "date_unit": self.date_unit,
+                "offset_size": self.offset_size,
+                "offset_unit": self.offset_unit,
             }
         )
         return res
@@ -46,13 +54,31 @@ class TimeSeriesBase(PandasOperand, abc.ABC):
         ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
         ts.set_index(date.name, inplace=True)
         ts = ts[ts.index.notna()].sort_index()
-        ts = ts.groupby([c.name for c in data[1:-1]]) if len(data) > 2 else ts
+        ts = (
+            ts.groupby([c.name for c in data[1:-1]])
+            .apply(self._shift)[data[-1].name]
+            .to_frame()
+            .reset_index()
+            .set_index(date.name)
+            .groupby([c.name for c in data[1:-1]])
+            if len(data) > 2
+            else self._shift(ts)
+        )
         ts = self._aggregate(ts)
         ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
         ts.index = date.index
         return ts.iloc[:, -1]
+    def _shift(self, ts: pd.DataFrame) -> pd.DataFrame:
+        if self.offset_size > 0:
+            return ts.iloc[:, :-1].merge(
+                ts.iloc[:, -1].shift(freq=f"{self.offset_size}{self.offset_unit}"),
+                left_index=True,
+                right_index=True,
+            )
+        return ts
     @abc.abstractmethod
     def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
         pass
@@ -67,6 +93,7 @@ class Roll(TimeSeriesBase, ParametrizedOperand):
     window_unit: str = "D"
     @validator("window_unit")
+    @classmethod
     def validate_window_unit(cls, v: str) -> str:
         try:
             pd.tseries.frequencies.to_offset(v)
@@ -77,12 +104,35 @@ class Roll(TimeSeriesBase, ParametrizedOperand):
             )
     def to_formula(self) -> str:
-        return f"roll_{self.window_size}{self.window_unit}_{self.aggregation}"
+        roll_component = f"roll_{self.window_size}{self.window_unit}"
+        if self.offset_size > 0:
+            roll_component += f"_offset_{self.offset_size}{self.offset_unit}"
+        return f"{roll_component}_{self.aggregation}"
     @classmethod
     def from_formula(cls, formula: str) -> Optional["Roll"]:
         import re
+        # Try matching pattern with offset first
+        pattern_with_offset = r"^roll_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])_(\w+)$"
+        match_with_offset = re.match(pattern_with_offset, formula)
+        if match_with_offset:
+            window_size = int(match_with_offset.group(1))
+            window_unit = match_with_offset.group(2)
+            offset_size = int(match_with_offset.group(3))
+            offset_unit = match_with_offset.group(4)
+            aggregation = match_with_offset.group(5)
+            return cls(
+                window_size=window_size,
+                window_unit=window_unit,
+                offset_size=offset_size,
+                offset_unit=offset_unit,
+                aggregation=aggregation,
+            )
+        # If no offset pattern found, try basic pattern
         pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
         match = re.match(pattern, formula)
@@ -107,7 +157,7 @@ class Roll(TimeSeriesBase, ParametrizedOperand):
         return res
     def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
-        return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=self.window_size).agg(
+        return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=1).agg(
             _roll_aggregations.get(self.aggregation, self.aggregation)
         )
@@ -117,12 +167,33 @@ class Lag(TimeSeriesBase, ParametrizedOperand):
     lag_unit: str = "D"
     def to_formula(self) -> str:
-        return f"lag_{self.lag_size}{self.lag_unit}"
+        lag_component = f"lag_{self.lag_size}{self.lag_unit}"
+        if self.offset_size > 0:
+            lag_component += f"_offset_{self.offset_size}{self.offset_unit}"
+        return lag_component
     @classmethod
     def from_formula(cls, formula: str) -> Optional["Lag"]:
         import re
+        # Try matching pattern with offset first
+        pattern_with_offset = r"^lag_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])$"
+        match_with_offset = re.match(pattern_with_offset, formula)
+        if match_with_offset:
+            lag_size = int(match_with_offset.group(1))
+            lag_unit = match_with_offset.group(2)
+            offset_size = int(match_with_offset.group(3))
+            offset_unit = match_with_offset.group(4)
+            return cls(
+                lag_size=lag_size,
+                lag_unit=lag_unit,
+                offset_size=offset_size,
+                offset_unit=offset_unit,
+            )
+        # If no offset pattern found, try basic pattern
         pattern = r"^lag_(\d+)([a-zA-Z])$"
         match = re.match(pattern, formula)
@@ -136,6 +207,12 @@ class Lag(TimeSeriesBase, ParametrizedOperand):
     def get_params(self) -> Dict[str, Optional[str]]:
         res = super().get_params()
+        res.update(
+            {
+                "lag_size": self.lag_size,
+                "lag_unit": self.lag_unit,
+            }
+        )
         return res
     def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame: