PyPI - upgini - Versions diffs - 1.2.63__py3-none-any.whl → 1.2.65__py3-none-any.whl - Mend

upgini 1.2.63__py3-none-any.whl → 1.2.65__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

upgini/__about__.py +1 -1
upgini/autofe/{all_operands.py → all_operators.py} +2 -2
upgini/autofe/binary.py +11 -11
upgini/autofe/date.py +6 -6
upgini/autofe/feature.py +8 -8
upgini/autofe/groupby.py +6 -6
upgini/autofe/{operand.py → operator.py} +16 -11
upgini/autofe/timeseries/__init__.py +23 -0
upgini/autofe/timeseries/base.py +105 -0
upgini/autofe/timeseries/cross.py +139 -0
upgini/autofe/timeseries/delta.py +119 -0
upgini/autofe/timeseries/lag.py +68 -0
upgini/autofe/timeseries/roll.py +92 -0
upgini/autofe/timeseries/trend.py +64 -0
upgini/autofe/timeseries/volatility.py +259 -0
upgini/autofe/unary.py +11 -11
upgini/autofe/vector.py +4 -200
upgini/features_enricher.py +2 -2
upgini/utils/sort.py +4 -2
{upgini-1.2.63.dist-info → upgini-1.2.65.dist-info}/METADATA +1 -1
{upgini-1.2.63.dist-info → upgini-1.2.65.dist-info}/RECORD +23 -15
{upgini-1.2.63.dist-info → upgini-1.2.65.dist-info}/WHEEL +1 -1
{upgini-1.2.63.dist-info → upgini-1.2.65.dist-info}/licenses/LICENSE +0 -0

upgini/autofe/vector.py CHANGED Viewed

@@ -1,17 +1,11 @@
-import abc
-from typing import Dict, List, Optional
+from typing import List, Optional
 import pandas as pd
-try:
-    from pydantic import field_validator as validator  # V2
-except ImportError:
-    from pydantic import validator  # V1
+from upgini.autofe.operator import PandasOperator, VectorizableMixin
-from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
-class Mean(PandasOperand, VectorizableMixin):
+class Mean(PandasOperator, VectorizableMixin):
     name: str = "mean"
     output_type: Optional[str] = "float"
     is_vector: bool = True
@@ -21,200 +15,10 @@ class Mean(PandasOperand, VectorizableMixin):
         return pd.DataFrame(data).T.fillna(0).mean(axis=1)
-class Sum(PandasOperand, VectorizableMixin):
+class Sum(PandasOperator, VectorizableMixin):
     name: str = "sum"
     is_vector: bool = True
     group_index: int = 0
     def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
         return pd.DataFrame(data).T.fillna(0).sum(axis=1)
-class TimeSeriesBase(PandasOperand, abc.ABC):
-    is_vector: bool = True
-    date_unit: Optional[str] = None
-    offset_size: int = 0
-    offset_unit: str = "D"
-    def get_params(self) -> Dict[str, Optional[str]]:
-        res = super().get_params()
-        res.update(
-            {
-                "date_unit": self.date_unit,
-                "offset_size": self.offset_size,
-                "offset_unit": self.offset_unit,
-            }
-        )
-        return res
-    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
-        # assuming first is date, last is value, rest is group columns
-        date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
-        ts = pd.concat([date] + data[1:], axis=1)
-        ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
-        ts.set_index(date.name, inplace=True)
-        ts = ts[ts.index.notna()].sort_index()
-        ts = (
-            ts.groupby([c.name for c in data[1:-1]])
-            .apply(self._shift)[data[-1].name]
-            .to_frame()
-            .reset_index()
-            .set_index(date.name)
-            .groupby([c.name for c in data[1:-1]])
-            if len(data) > 2
-            else self._shift(ts)
-        )
-        ts = self._aggregate(ts)
-        ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
-        ts.index = date.index
-        return ts.iloc[:, -1]
-    def _shift(self, ts: pd.DataFrame) -> pd.DataFrame:
-        if self.offset_size > 0:
-            return ts.iloc[:, :-1].merge(
-                ts.iloc[:, -1].shift(freq=f"{self.offset_size}{self.offset_unit}"),
-                left_index=True,
-                right_index=True,
-            )
-        return ts
-    @abc.abstractmethod
-    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
-        pass
-_roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean()}
-class Roll(TimeSeriesBase, ParametrizedOperand):
-    aggregation: str
-    window_size: int = 1
-    window_unit: str = "D"
-    @validator("window_unit")
-    @classmethod
-    def validate_window_unit(cls, v: str) -> str:
-        try:
-            pd.tseries.frequencies.to_offset(v)
-            return v
-        except ValueError:
-            raise ValueError(
-                f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
-            )
-    def to_formula(self) -> str:
-        roll_component = f"roll_{self.window_size}{self.window_unit}"
-        if self.offset_size > 0:
-            roll_component += f"_offset_{self.offset_size}{self.offset_unit}"
-        return f"{roll_component}_{self.aggregation}"
-    @classmethod
-    def from_formula(cls, formula: str) -> Optional["Roll"]:
-        import re
-        # Try matching pattern with offset first
-        pattern_with_offset = r"^roll_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])_(\w+)$"
-        match_with_offset = re.match(pattern_with_offset, formula)
-        if match_with_offset:
-            window_size = int(match_with_offset.group(1))
-            window_unit = match_with_offset.group(2)
-            offset_size = int(match_with_offset.group(3))
-            offset_unit = match_with_offset.group(4)
-            aggregation = match_with_offset.group(5)
-            return cls(
-                window_size=window_size,
-                window_unit=window_unit,
-                offset_size=offset_size,
-                offset_unit=offset_unit,
-                aggregation=aggregation,
-            )
-        # If no offset pattern found, try basic pattern
-        pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
-        match = re.match(pattern, formula)
-        if not match:
-            return None
-        window_size = int(match.group(1))
-        window_unit = match.group(2)
-        aggregation = match.group(3)
-        return cls(window_size=window_size, window_unit=window_unit, aggregation=aggregation)
-    def get_params(self) -> Dict[str, Optional[str]]:
-        res = super().get_params()
-        res.update(
-            {
-                "window_size": self.window_size,
-                "window_unit": self.window_unit,
-                "aggregation": self.aggregation,
-            }
-        )
-        return res
-    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
-        return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=1).agg(
-            _roll_aggregations.get(self.aggregation, self.aggregation)
-        )
-class Lag(TimeSeriesBase, ParametrizedOperand):
-    lag_size: int
-    lag_unit: str = "D"
-    def to_formula(self) -> str:
-        lag_component = f"lag_{self.lag_size}{self.lag_unit}"
-        if self.offset_size > 0:
-            lag_component += f"_offset_{self.offset_size}{self.offset_unit}"
-        return lag_component
-    @classmethod
-    def from_formula(cls, formula: str) -> Optional["Lag"]:
-        import re
-        # Try matching pattern with offset first
-        pattern_with_offset = r"^lag_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])$"
-        match_with_offset = re.match(pattern_with_offset, formula)
-        if match_with_offset:
-            lag_size = int(match_with_offset.group(1))
-            lag_unit = match_with_offset.group(2)
-            offset_size = int(match_with_offset.group(3))
-            offset_unit = match_with_offset.group(4)
-            return cls(
-                lag_size=lag_size,
-                lag_unit=lag_unit,
-                offset_size=offset_size,
-                offset_unit=offset_unit,
-            )
-        # If no offset pattern found, try basic pattern
-        pattern = r"^lag_(\d+)([a-zA-Z])$"
-        match = re.match(pattern, formula)
-        if not match:
-            return None
-        lag_size = int(match.group(1))
-        lag_unit = match.group(2)
-        return cls(lag_size=lag_size, lag_unit=lag_unit)
-    def get_params(self) -> Dict[str, Optional[str]]:
-        res = super().get_params()
-        res.update(
-            {
-                "lag_size": self.lag_size,
-                "lag_unit": self.lag_unit,
-            }
-        )
-        return res
-    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
-        lag_window = self.lag_size + 1
-        return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=lag_window).agg(lambda x: x[0])

upgini/features_enricher.py CHANGED Viewed

@@ -31,7 +31,7 @@ from sklearn.exceptions import NotFittedError
 from sklearn.model_selection import BaseCrossValidator
 from upgini.autofe.feature import Feature
-from upgini.autofe.vector import TimeSeriesBase
+from upgini.autofe.timeseries import TimeSeriesBase
 from upgini.data_source.data_source_publisher import CommercialSchema
 from upgini.dataset import Dataset
 from upgini.errors import HttpError, ValidationError
@@ -3632,7 +3632,7 @@ if response.status_code == 200:
                     )
                 do_sorting = False
             else:
-                columns_to_hash = list(search_keys.keys()) + renamed_id_columns + [target_name]
+                columns_to_hash = list(set(list(search_keys.keys()) + renamed_id_columns + [target_name]))
                 columns_to_hash = sort_columns(
                     df[columns_to_hash],
                     target_name,

upgini/utils/sort.py CHANGED Viewed

@@ -28,12 +28,13 @@ def sort_columns(
         logger = logging.getLogger(__name__)
         logger.setLevel(logging.FATAL)
     df = df.copy()  # avoid side effects
+    search_keys = {k: v for k, v in search_keys.items() if v != SearchKey.CUSTOM_KEY}
     # Check multiple search keys
     search_key_values = list(search_keys.values())
     has_duplicate_search_keys = len(search_key_values) != len(set(search_key_values))
     if has_duplicate_search_keys:
-        logging.warning(f"WARNING: Found duplicate SearchKey values in search_keys: {search_keys}")
+        logger.warning(f"WARNING: Found duplicate SearchKey values in search_keys: {search_keys}")
     sorted_keys = sorted(search_keys.keys(), key=lambda x: str(search_keys.get(x)))
     sorted_keys = [k for k in sorted_keys if k in df.columns and k not in exclude_columns]
@@ -68,8 +69,9 @@ def get_sort_columns_dict(
     if len(string_features) > 0:
         if len(df) > len(df.drop(columns=string_features).drop_duplicates()) or sort_all_columns:
             # factorize string features
+            df = df.copy()
             for c in string_features:
-                df.loc[:, c] = pd.Series(df[c].factorize(sort=True)[0], index=df.index, dtype="int")
+                df = df.assign(**{c: pd.factorize(df[c], sort=True)[0].astype(int)})
             columns_for_sort.extend(string_features)
     if len(columns_for_sort) == 0:

{upgini-1.2.63.dist-info → upgini-1.2.65.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.63
+Version: 1.2.65
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/

{upgini-1.2.63.dist-info → upgini-1.2.65.dist-info}/RECORD RENAMED Viewed

@@ -1,9 +1,9 @@
-upgini/__about__.py,sha256=jIcsQGjL4QjnLFsRkdEHc7S78dfQHi-auHwc_P5Xftc,23
+upgini/__about__.py,sha256=9LdiugHjYADPBPHXjA5mj8ce_XDBj0fp-oIlGtPl5HI,23
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=OGjpeFHbj3lWiZTOHTpWEoMMDmFY1FlNC44FKktoZvU,34956
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=2AMEXtoMrEFw3f0b0CsvkFyS1a7L4aqI2GO_fCsgWac,205336
+upgini/features_enricher.py,sha256=nXGBMC42VPAmqQKXbEqZJFIHiGj6F_G2AwhurA8LuQs,205351
 upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
 upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
 upgini/metadata.py,sha256=Jh6YTaS00m_nbaOY_owvlSyn9zgkErkqu8iTr9ZjKI8,12279
@@ -14,14 +14,22 @@ upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1
 upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
 upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
 upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-upgini/autofe/all_operands.py,sha256=v0_NozalvvzeojSAA0d7UJ5INS654ZVaLn4S8djK6Ac,329
-upgini/autofe/binary.py,sha256=zMhtHVuGUAFLUqem-XiXqJj-GRXxS88tdz8tFuDfSNM,7659
-upgini/autofe/date.py,sha256=pqwwk4_35RYXDT2fSJ9dlxGBm-R0jWBeiSb-79hZjkI,10721
-upgini/autofe/feature.py,sha256=zvRdlxCkaOsX0XiragNvh0tAPyOWut0MQTq5JGU5HtY,14749
-upgini/autofe/groupby.py,sha256=G48_sQZw016eGx3cOy8YQrEIOp95puWqYUpFWd-gdeM,3595
-upgini/autofe/operand.py,sha256=8Ttrfxv_H91dMbS7J55zxluzAJHfGXU_Y2xCh4OHwb8,4774
-upgini/autofe/unary.py,sha256=T3E7F3dA_7o_rkdCFq7JV6nHLzcoHLHQTcxO7y5Opa4,4646
-upgini/autofe/vector.py,sha256=udkg4pP7IIeLjt0Cg6rzEKUmGaubOnqsEz3bz9R6E44,7110
+upgini/autofe/all_operators.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
+upgini/autofe/binary.py,sha256=MnQuFiERpocjCPQUjOljlsq5FE-04GPfwtNjzvfNMyU,7671
+upgini/autofe/date.py,sha256=I07psJerrxOcHao91PdSCk9X6KWu61IBVyFRLjGNgK8,10730
+upgini/autofe/feature.py,sha256=xgu6bVIlUJ5PCUgoXQRNcGkcMOhj-_BdDRmkB_qRFS4,14766
+upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
+upgini/autofe/operator.py,sha256=EOffJw6vKXpEh5yymqb1RFNJPxGxmnHdFRo9dB5SCFo,4969
+upgini/autofe/unary.py,sha256=yVgPvtfnPSOhrii0YgezddmgWPwyOBCR0JutaIkdTTc,4658
+upgini/autofe/vector.py,sha256=l0KdKg-txlZxDSE4hPPfCtfGQofYbl7oaABPr830sPI,667
+upgini/autofe/timeseries/__init__.py,sha256=PGwwDAMwvkXl3el12tXVEmZUgDUvlmIPlXtROm6bD18,738
+upgini/autofe/timeseries/base.py,sha256=T9Ec8LKJbiwTUGGsd_xhM0U0NUJblqmKchkzUI1sK88,3755
+upgini/autofe/timeseries/cross.py,sha256=Sh5hAXZFWKaFRqf_JGODu9pWO2tmuV5VKyK9eX3i7-I,4931
+upgini/autofe/timeseries/delta.py,sha256=h0YhmI1TlPJnjwFpN_GQxLb6r59DQuucnG5tQAXSgjU,3520
+upgini/autofe/timeseries/lag.py,sha256=LfQtg484vuqM0mgY4Wft1swHX_Srq7OKKgZswCXoiXI,1882
+upgini/autofe/timeseries/roll.py,sha256=bNFMDszSYTWvB7EyhHbRY1DJqzSURvHlPAcBebt0y0Y,2878
+upgini/autofe/timeseries/trend.py,sha256=9p2Q5ByAi6cx9RH9teBTe8FyjSzqthznC2Lo5dsJ0ho,2051
+upgini/autofe/timeseries/volatility.py,sha256=9shUmIKjpWTHVYjj80YBsk0XheBJ9uBuLv5NW9Mchnk,7953
 upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/data_source/data_source_publisher.py,sha256=4S9qwlAklD8vg9tUU_c1pHE2_glUHAh15-wr5hMwKFw,22879
 upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
@@ -57,12 +65,12 @@ upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,1
 upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
 upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
 upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
-upgini/utils/sort.py,sha256=w-CoT33W_53ekOROpKI_VRsRmiyWNr2b3IpE5_4MLLA,6395
+upgini/utils/sort.py,sha256=GfWfCIbfK7e7BvSPZZNJD-PEtiN19DnTCEQkeefHHxI,6491
 upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,16579
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.63.dist-info/METADATA,sha256=nH5TvEpkQ7qCwZi9uFN6qThiBIe3jLgLCIeRtZeflnA,49113
-upgini-1.2.63.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-upgini-1.2.63.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.63.dist-info/RECORD,,
+upgini-1.2.65.dist-info/METADATA,sha256=GGxmpRnHQUTsCQlWPZeNL2xk27XWuEWrvECLPVEx5vU,49113
+upgini-1.2.65.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
+upgini-1.2.65.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.65.dist-info/RECORD,,

{upgini-1.2.63.dist-info → upgini-1.2.65.dist-info}/WHEEL RENAMED Viewed

@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: hatchling 1.25.0
+Generator: hatchling 1.24.2
 Root-Is-Purelib: true
 Tag: py3-none-any

{upgini-1.2.63.dist-info → upgini-1.2.65.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

upgini 1.2.63__py3-none-any.whl → 1.2.65__py3-none-any.whl

upgini 1.2.63__py3-none-any.whl → 1.2.65__py3-none-any.whl