upgini 1.2.57a2__py3-none-any.whl → 1.2.57a3675.dev5__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registries.

Potentially problematic release.


This version of upgini might be problematic; see the registry's advisory page for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.57a2"
1
+ __version__ = "1.2.57a3675.dev5"
upgini/autofe/date.py CHANGED
@@ -64,6 +64,9 @@ class DateDiff(PandasOperand, DateDiffMixin):
64
64
  return res
65
65
 
66
66
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
67
+ if left.isna().all() or right.isna().all():
68
+ return pd.Series([None] * len(left))
69
+
67
70
  left = self._convert_to_date(left, self.left_unit)
68
71
  right = self._convert_to_date(right, self.right_unit)
69
72
  diff = self._convert_diff_to_unit(left.dt.date - right.dt.date)
@@ -142,6 +145,9 @@ class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
142
145
  return cls(aggregation=aggregation)
143
146
 
144
147
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
148
+ if left.isna().all() or right.isna().all():
149
+ return pd.Series([None] * len(left), dtype=np.float64)
150
+
145
151
  left = self._convert_to_date(left, self.left_unit)
146
152
  right_mask = right.apply(lambda x: len(x) > 0)
147
153
  mask = left.notna() & right.notna() & right_mask
@@ -230,6 +236,8 @@ class DatePercentileBase(PandasOperand, abc.ABC):
230
236
  pass
231
237
 
232
238
  def _perc(self, f, bounds):
239
+ if f is None or np.isnan(f):
240
+ return np.nan
233
241
  hit = np.where(f >= np.array(bounds))[0]
234
242
  if hit.size > 0:
235
243
  return np.max(hit) + 1
upgini/autofe/feature.py CHANGED
@@ -26,18 +26,9 @@ class Column:
26
26
  return dict()
27
27
 
28
28
  def rename_columns(self, mapping: Dict[str, str]) -> "Column":
29
- self.name = self._unhash(mapping.get(self.name) or self.name)
29
+ self.name = mapping.get(self.name) or self.name
30
30
  return self
31
31
 
32
- def _unhash(self, feature_name: str) -> str:
33
- last_component_idx = feature_name.rfind("_")
34
- if not feature_name.startswith("f_"):
35
- return feature_name # etalon feature
36
- elif last_component_idx == 1:
37
- return feature_name[2:] # fully hashed name, cannot unhash
38
- else:
39
- return feature_name[2:last_component_idx]
40
-
41
32
  def delete_data(self):
42
33
  self.data = None
43
34
 
upgini/autofe/vector.py CHANGED
@@ -2,7 +2,11 @@ import abc
2
2
  from typing import Dict, List, Optional
3
3
 
4
4
  import pandas as pd
5
- from pydantic import validator
5
+
6
+ try:
7
+ from pydantic import field_validator as validator # V2
8
+ except ImportError:
9
+ from pydantic import validator # V1
6
10
 
7
11
  from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
8
12
 
@@ -29,12 +33,16 @@ class Sum(PandasOperand, VectorizableMixin):
29
33
  class TimeSeriesBase(PandasOperand, abc.ABC):
30
34
  is_vector: bool = True
31
35
  date_unit: Optional[str] = None
36
+ offset_size: int = 0
37
+ offset_unit: str = "D"
32
38
 
33
39
  def get_params(self) -> Dict[str, Optional[str]]:
34
40
  res = super().get_params()
35
41
  res.update(
36
42
  {
37
43
  "date_unit": self.date_unit,
44
+ "offset_size": self.offset_size,
45
+ "offset_unit": self.offset_unit,
38
46
  }
39
47
  )
40
48
  return res
@@ -46,13 +54,31 @@ class TimeSeriesBase(PandasOperand, abc.ABC):
46
54
  ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
47
55
  ts.set_index(date.name, inplace=True)
48
56
  ts = ts[ts.index.notna()].sort_index()
49
- ts = ts.groupby([c.name for c in data[1:-1]]) if len(data) > 2 else ts
57
+ ts = (
58
+ ts.groupby([c.name for c in data[1:-1]])
59
+ .apply(self._shift)[data[-1].name]
60
+ .to_frame()
61
+ .reset_index()
62
+ .set_index(date.name)
63
+ .groupby([c.name for c in data[1:-1]])
64
+ if len(data) > 2
65
+ else self._shift(ts)
66
+ )
50
67
  ts = self._aggregate(ts)
51
68
  ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
52
69
  ts.index = date.index
53
70
 
54
71
  return ts.iloc[:, -1]
55
72
 
73
+ def _shift(self, ts: pd.DataFrame) -> pd.DataFrame:
74
+ if self.offset_size > 0:
75
+ return ts.iloc[:, :-1].merge(
76
+ ts.iloc[:, -1].shift(freq=f"{self.offset_size}{self.offset_unit}"),
77
+ left_index=True,
78
+ right_index=True,
79
+ )
80
+ return ts
81
+
56
82
  @abc.abstractmethod
57
83
  def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
58
84
  pass
@@ -67,6 +93,7 @@ class Roll(TimeSeriesBase, ParametrizedOperand):
67
93
  window_unit: str = "D"
68
94
 
69
95
  @validator("window_unit")
96
+ @classmethod
70
97
  def validate_window_unit(cls, v: str) -> str:
71
98
  try:
72
99
  pd.tseries.frequencies.to_offset(v)
@@ -77,12 +104,35 @@ class Roll(TimeSeriesBase, ParametrizedOperand):
77
104
  )
78
105
 
79
106
  def to_formula(self) -> str:
80
- return f"roll_{self.window_size}{self.window_unit}_{self.aggregation}"
107
+ roll_component = f"roll_{self.window_size}{self.window_unit}"
108
+ if self.offset_size > 0:
109
+ roll_component += f"_offset_{self.offset_size}{self.offset_unit}"
110
+ return f"{roll_component}_{self.aggregation}"
81
111
 
82
112
  @classmethod
83
113
  def from_formula(cls, formula: str) -> Optional["Roll"]:
84
114
  import re
85
115
 
116
+ # Try matching pattern with offset first
117
+ pattern_with_offset = r"^roll_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])_(\w+)$"
118
+ match_with_offset = re.match(pattern_with_offset, formula)
119
+
120
+ if match_with_offset:
121
+ window_size = int(match_with_offset.group(1))
122
+ window_unit = match_with_offset.group(2)
123
+ offset_size = int(match_with_offset.group(3))
124
+ offset_unit = match_with_offset.group(4)
125
+ aggregation = match_with_offset.group(5)
126
+
127
+ return cls(
128
+ window_size=window_size,
129
+ window_unit=window_unit,
130
+ offset_size=offset_size,
131
+ offset_unit=offset_unit,
132
+ aggregation=aggregation,
133
+ )
134
+
135
+ # If no offset pattern found, try basic pattern
86
136
  pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
87
137
  match = re.match(pattern, formula)
88
138
 
@@ -107,7 +157,7 @@ class Roll(TimeSeriesBase, ParametrizedOperand):
107
157
  return res
108
158
 
109
159
  def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
110
- return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=self.window_size).agg(
160
+ return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=1).agg(
111
161
  _roll_aggregations.get(self.aggregation, self.aggregation)
112
162
  )
113
163
 
@@ -117,12 +167,33 @@ class Lag(TimeSeriesBase, ParametrizedOperand):
117
167
  lag_unit: str = "D"
118
168
 
119
169
  def to_formula(self) -> str:
120
- return f"lag_{self.lag_size}{self.lag_unit}"
170
+ lag_component = f"lag_{self.lag_size}{self.lag_unit}"
171
+ if self.offset_size > 0:
172
+ lag_component += f"_offset_{self.offset_size}{self.offset_unit}"
173
+ return lag_component
121
174
 
122
175
  @classmethod
123
176
  def from_formula(cls, formula: str) -> Optional["Lag"]:
124
177
  import re
125
178
 
179
+ # Try matching pattern with offset first
180
+ pattern_with_offset = r"^lag_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])$"
181
+ match_with_offset = re.match(pattern_with_offset, formula)
182
+
183
+ if match_with_offset:
184
+ lag_size = int(match_with_offset.group(1))
185
+ lag_unit = match_with_offset.group(2)
186
+ offset_size = int(match_with_offset.group(3))
187
+ offset_unit = match_with_offset.group(4)
188
+
189
+ return cls(
190
+ lag_size=lag_size,
191
+ lag_unit=lag_unit,
192
+ offset_size=offset_size,
193
+ offset_unit=offset_unit,
194
+ )
195
+
196
+ # If no offset pattern found, try basic pattern
126
197
  pattern = r"^lag_(\d+)([a-zA-Z])$"
127
198
  match = re.match(pattern, formula)
128
199
 
@@ -136,6 +207,12 @@ class Lag(TimeSeriesBase, ParametrizedOperand):
136
207
 
137
208
  def get_params(self) -> Dict[str, Optional[str]]:
138
209
  res = super().get_params()
210
+ res.update(
211
+ {
212
+ "lag_size": self.lag_size,
213
+ "lag_unit": self.lag_unit,
214
+ }
215
+ )
139
216
  return res
140
217
 
141
218
  def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.57a2
3
+ Version: 1.2.57a3675.dev5
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -34,7 +34,7 @@ Requires-Dist: pydantic<3.0.0,>1.0.0
34
34
  Requires-Dist: pyjwt>=2.8.0
35
35
  Requires-Dist: python-bidi==0.4.2
36
36
  Requires-Dist: python-dateutil>=2.8.0
37
- Requires-Dist: python-json-logger>=2.0.2
37
+ Requires-Dist: python-json-logger>=3.3.0
38
38
  Requires-Dist: requests>=2.8.0
39
39
  Requires-Dist: scikit-learn>=1.3.0
40
40
  Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
@@ -1,4 +1,4 @@
1
- upgini/__about__.py,sha256=PD2lbh5FQufk15oyUAYIGJrdUHAs9qG5Btw3lTqrUtI,25
1
+ upgini/__about__.py,sha256=I0ZAa2qUeGAG8w2GcOhss1hhvV9cMS2KXnSkGWg4s0A,33
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=NP5vHqEfZQ1HWz3TcNAa_OhXG8wiMRdydm26D6UBiRU,34166
@@ -16,12 +16,12 @@ upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo
16
16
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
17
  upgini/autofe/all_operands.py,sha256=v0_NozalvvzeojSAA0d7UJ5INS654ZVaLn4S8djK6Ac,329
18
18
  upgini/autofe/binary.py,sha256=zMhtHVuGUAFLUqem-XiXqJj-GRXxS88tdz8tFuDfSNM,7659
19
- upgini/autofe/date.py,sha256=d-sijAD7dETfqIOCaZh1vhuVjsS_nqa-6dhjwkCdny4,10441
20
- upgini/autofe/feature.py,sha256=l8A8E3BH2BmYvqEC81zbcIEfH6KEEhcesJ2BH4fn0-4,15140
19
+ upgini/autofe/date.py,sha256=oykxfmny4LOr6m79IipOUCtk2JQSUdSCWHh8K9n7nek,10726
20
+ upgini/autofe/feature.py,sha256=zvRdlxCkaOsX0XiragNvh0tAPyOWut0MQTq5JGU5HtY,14749
21
21
  upgini/autofe/groupby.py,sha256=G48_sQZw016eGx3cOy8YQrEIOp95puWqYUpFWd-gdeM,3595
22
22
  upgini/autofe/operand.py,sha256=8Ttrfxv_H91dMbS7J55zxluzAJHfGXU_Y2xCh4OHwb8,4774
23
23
  upgini/autofe/unary.py,sha256=T3E7F3dA_7o_rkdCFq7JV6nHLzcoHLHQTcxO7y5Opa4,4646
24
- upgini/autofe/vector.py,sha256=bvcop9b0uFFPfQ3FLTwXT2IYfxNl4dIfR8icvnBHvOA,4358
24
+ upgini/autofe/vector.py,sha256=udkg4pP7IIeLjt0Cg6rzEKUmGaubOnqsEz3bz9R6E44,7110
25
25
  upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
26
  upgini/data_source/data_source_publisher.py,sha256=0vaYz5v3KclJnA6jAWiTUiMQO5mbBTBINWV9jr2F5xM,22591
27
27
  upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
@@ -59,7 +59,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
59
59
  upgini/utils/target_utils.py,sha256=RlpKGss9kMibVSlA8iZuO_qxmyeplqzn7X8g6hiGGGs,14341
60
60
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
61
61
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
62
- upgini-1.2.57a2.dist-info/METADATA,sha256=-dEVxWnjwc3LcSqFVJGENL07YJDvWgH8mHQ0PaE93sI,49057
63
- upgini-1.2.57a2.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
64
- upgini-1.2.57a2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
65
- upgini-1.2.57a2.dist-info/RECORD,,
62
+ upgini-1.2.57a3675.dev5.dist-info/METADATA,sha256=7bDZbjWy8pxCvyBM02xr2nvMfKLcDBh2Agf07aKc4fI,49065
63
+ upgini-1.2.57a3675.dev5.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
64
+ upgini-1.2.57a3675.dev5.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
65
+ upgini-1.2.57a3675.dev5.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.25.0
2
+ Generator: hatchling 1.24.2
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any