PyPI - upgini - Versions diffs - 1.2.61__py3-none-any.whl → 1.2.62a3818.dev1__py3-none-any.whl - Mend

upgini 1.2.61__py3-none-any.whl → 1.2.62a3818.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. Click here for more details.

Files changed (18) hide show

upgini/__about__.py +1 -1
upgini/autofe/all_operands.py +2 -2
upgini/autofe/binary.py +1 -1
upgini/autofe/date.py +1 -1
upgini/autofe/feature.py +1 -1
upgini/autofe/groupby.py +1 -1
upgini/autofe/{operand.py → operator.py} +2 -2
upgini/autofe/timeseries.py +200 -0
upgini/autofe/unary.py +1 -1
upgini/autofe/vector.py +2 -198
upgini/dataset.py +17 -7
upgini/features_enricher.py +1 -1
upgini/utils/target_utils.py +54 -1
upgini/utils/ts_utils.py +41 -0
{upgini-1.2.61.dist-info → upgini-1.2.62a3818.dev1.dist-info}/METADATA +1 -1
{upgini-1.2.61.dist-info → upgini-1.2.62a3818.dev1.dist-info}/RECORD +18 -16
{upgini-1.2.61.dist-info → upgini-1.2.62a3818.dev1.dist-info}/WHEEL +1 -1
{upgini-1.2.61.dist-info → upgini-1.2.62a3818.dev1.dist-info}/licenses/LICENSE +0 -0

upgini/__about__.py CHANGED Viewed

@@ -1 +1 @@
-__version__ = "1.2.61"
+__version__ = "1.2.62a3818.dev1"

upgini/autofe/all_operands.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from upgini.autofe.operand import OperandRegistry
+from upgini.autofe.operator import OperatorRegistry
 from upgini.autofe.unary import *  # noqa
 from upgini.autofe.binary import *  # noqa
 from upgini.autofe.groupby import *  # noqa
@@ -7,4 +7,4 @@ from upgini.autofe.vector import *  # noqa
 def find_op(name):
-    return OperandRegistry.get_operand(name)
+    return OperatorRegistry.get_operand(name)

upgini/autofe/binary.py CHANGED Viewed

@@ -5,7 +5,7 @@ import numpy as np
 import pandas as pd
 from jarowinkler import jarowinkler_similarity
-from upgini.autofe.operand import PandasOperand, VectorizableMixin
+from upgini.autofe.operator import PandasOperand, VectorizableMixin
 class Min(PandasOperand):

upgini/autofe/date.py CHANGED Viewed

@@ -7,7 +7,7 @@ import pandas as pd
 from pandas.core.arrays.timedeltas import TimedeltaArray
 from pydantic import BaseModel, __version__ as pydantic_version
-from upgini.autofe.operand import PandasOperand, ParametrizedOperand
+from upgini.autofe.operator import PandasOperand, ParametrizedOperand
 def get_pydantic_version():

upgini/autofe/feature.py CHANGED Viewed

@@ -7,7 +7,7 @@ import pandas as pd
 from pandas._typing import DtypeObj
 from upgini.autofe.all_operands import find_op
-from upgini.autofe.operand import Operand, PandasOperand
+from upgini.autofe.operator import Operand, PandasOperand
 class Column:

upgini/autofe/groupby.py CHANGED Viewed

@@ -2,7 +2,7 @@ from typing import Optional
 import pandas as pd
-from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
+from upgini.autofe.operator import PandasOperand, ParametrizedOperand, VectorizableMixin
 class GroupByThenAgg(

upgini/autofe/{operand.py → operator.py} RENAMED Viewed

@@ -6,7 +6,7 @@ import pandas as pd
 from pydantic import BaseModel
-class OperandRegistry(type(BaseModel)):
+class OperatorRegistry(type(BaseModel)):
     _registry = {}
     _parametrized_registry = []
@@ -46,7 +46,7 @@ class OperandRegistry(type(BaseModel)):
         return None
-class Operand(BaseModel, metaclass=OperandRegistry):
+class Operand(BaseModel, metaclass=OperatorRegistry):
     name: Optional[str] = None
     alias: Optional[str] = None
     is_unary: bool = False

upgini/autofe/timeseries.py ADDED Viewed

@@ -0,0 +1,200 @@
+import abc
+from typing import Dict, List, Optional
+import pandas as pd
+from upgini.autofe.operator import PandasOperand, ParametrizedOperand
+try:
+    from pydantic import field_validator as validator  # V2
+except ImportError:
+    from pydantic import validator  # V1
+class TimeSeriesBase(PandasOperand, abc.ABC):
+    is_vector: bool = True
+    date_unit: Optional[str] = None
+    offset_size: int = 0
+    offset_unit: str = "D"
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = super().get_params()
+        res.update(
+            {
+                "date_unit": self.date_unit,
+                "offset_size": self.offset_size,
+                "offset_unit": self.offset_unit,
+            }
+        )
+        return res
+    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
+        # assuming first is date, last is value, rest is group columns
+        date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
+        ts = pd.concat([date] + data[1:], axis=1)
+        ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
+        ts.set_index(date.name, inplace=True)
+        ts = ts[ts.index.notna()].sort_index()
+        ts = (
+            ts.groupby([c.name for c in data[1:-1]], group_keys=True)
+            .apply(self._shift)[data[-1].name]
+            .to_frame()
+            .reset_index()
+            .set_index(date.name)
+            .groupby([c.name for c in data[1:-1]])
+            if len(data) > 2
+            else self._shift(ts)
+        )
+        ts = self._aggregate(ts)
+        ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
+        ts.index = date.index
+        return ts.iloc[:, -1]
+    def _shift(self, ts: pd.DataFrame) -> pd.DataFrame:
+        if self.offset_size > 0:
+            return ts.iloc[:, :-1].merge(
+                ts.iloc[:, -1].shift(freq=f"{self.offset_size}{self.offset_unit}"),
+                left_index=True,
+                right_index=True,
+            )
+        return ts
+    @abc.abstractmethod
+    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
+        pass
+_roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean(), "last": lambda x: x[-1]}
+class Roll(TimeSeriesBase, ParametrizedOperand):
+    aggregation: str
+    window_size: int = 1
+    window_unit: str = "D"
+    @validator("window_unit")
+    @classmethod
+    def validate_window_unit(cls, v: str) -> str:
+        try:
+            pd.tseries.frequencies.to_offset(v)
+            return v
+        except ValueError:
+            raise ValueError(
+                f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
+            )
+    def to_formula(self) -> str:
+        roll_component = f"roll_{self.window_size}{self.window_unit}"
+        if self.offset_size > 0:
+            roll_component += f"_offset_{self.offset_size}{self.offset_unit}"
+        return f"{roll_component}_{self.aggregation}"
+    @classmethod
+    def from_formula(cls, formula: str) -> Optional["Roll"]:
+        import re
+        # Try matching pattern with offset first
+        pattern_with_offset = r"^roll_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])_(\w+)$"
+        match_with_offset = re.match(pattern_with_offset, formula)
+        if match_with_offset:
+            window_size = int(match_with_offset.group(1))
+            window_unit = match_with_offset.group(2)
+            offset_size = int(match_with_offset.group(3))
+            offset_unit = match_with_offset.group(4)
+            aggregation = match_with_offset.group(5)
+            return cls(
+                window_size=window_size,
+                window_unit=window_unit,
+                offset_size=offset_size,
+                offset_unit=offset_unit,
+                aggregation=aggregation,
+            )
+        # If no offset pattern found, try basic pattern
+        pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
+        match = re.match(pattern, formula)
+        if not match:
+            return None
+        window_size = int(match.group(1))
+        window_unit = match.group(2)
+        aggregation = match.group(3)
+        return cls(window_size=window_size, window_unit=window_unit, aggregation=aggregation)
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = super().get_params()
+        res.update(
+            {
+                "window_size": self.window_size,
+                "window_unit": self.window_unit,
+                "aggregation": self.aggregation,
+            }
+        )
+        return res
+    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
+        return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=1).agg(
+            _roll_aggregations.get(self.aggregation, self.aggregation)
+        )
+class Lag(TimeSeriesBase, ParametrizedOperand):
+    lag_size: int
+    lag_unit: str = "D"
+    def to_formula(self) -> str:
+        lag_component = f"lag_{self.lag_size}{self.lag_unit}"
+        if self.offset_size > 0:
+            lag_component += f"_offset_{self.offset_size}{self.offset_unit}"
+        return lag_component
+    @classmethod
+    def from_formula(cls, formula: str) -> Optional["Lag"]:
+        import re
+        # Try matching pattern with offset first
+        pattern_with_offset = r"^lag_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])$"
+        match_with_offset = re.match(pattern_with_offset, formula)
+        if match_with_offset:
+            lag_size = int(match_with_offset.group(1))
+            lag_unit = match_with_offset.group(2)
+            offset_size = int(match_with_offset.group(3))
+            offset_unit = match_with_offset.group(4)
+            return cls(
+                lag_size=lag_size,
+                lag_unit=lag_unit,
+                offset_size=offset_size,
+                offset_unit=offset_unit,
+            )
+        # If no offset pattern found, try basic pattern
+        pattern = r"^lag_(\d+)([a-zA-Z])$"
+        match = re.match(pattern, formula)
+        if not match:
+            return None
+        lag_size = int(match.group(1))
+        lag_unit = match.group(2)
+        return cls(lag_size=lag_size, lag_unit=lag_unit)
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = super().get_params()
+        res.update(
+            {
+                "lag_size": self.lag_size,
+                "lag_unit": self.lag_unit,
+            }
+        )
+        return res
+    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
+        lag_window = self.lag_size + 1
+        return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=lag_window).agg(lambda x: x[0])

upgini/autofe/unary.py CHANGED Viewed

@@ -2,7 +2,7 @@ from typing import Dict, Optional
 import numpy as np
 import pandas as pd
-from upgini.autofe.operand import PandasOperand, VectorizableMixin
+from upgini.autofe.operator import PandasOperand, VectorizableMixin
 class Abs(PandasOperand, VectorizableMixin):

upgini/autofe/vector.py CHANGED Viewed

@@ -1,14 +1,8 @@
-import abc
-from typing import Dict, List, Optional
+from typing import List, Optional
 import pandas as pd
-try:
-    from pydantic import field_validator as validator  # V2
-except ImportError:
-    from pydantic import validator  # V1
-from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
+from upgini.autofe.operator import PandasOperand, VectorizableMixin
 class Mean(PandasOperand, VectorizableMixin):
@@ -28,193 +22,3 @@ class Sum(PandasOperand, VectorizableMixin):
     def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
         return pd.DataFrame(data).T.fillna(0).sum(axis=1)
-class TimeSeriesBase(PandasOperand, abc.ABC):
-    is_vector: bool = True
-    date_unit: Optional[str] = None
-    offset_size: int = 0
-    offset_unit: str = "D"
-    def get_params(self) -> Dict[str, Optional[str]]:
-        res = super().get_params()
-        res.update(
-            {
-                "date_unit": self.date_unit,
-                "offset_size": self.offset_size,
-                "offset_unit": self.offset_unit,
-            }
-        )
-        return res
-    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
-        # assuming first is date, last is value, rest is group columns
-        date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
-        ts = pd.concat([date] + data[1:], axis=1)
-        ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
-        ts.set_index(date.name, inplace=True)
-        ts = ts[ts.index.notna()].sort_index()
-        ts = (
-            ts.groupby([c.name for c in data[1:-1]])
-            .apply(self._shift)[data[-1].name]
-            .to_frame()
-            .reset_index()
-            .set_index(date.name)
-            .groupby([c.name for c in data[1:-1]])
-            if len(data) > 2
-            else self._shift(ts)
-        )
-        ts = self._aggregate(ts)
-        ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
-        ts.index = date.index
-        return ts.iloc[:, -1]
-    def _shift(self, ts: pd.DataFrame) -> pd.DataFrame:
-        if self.offset_size > 0:
-            return ts.iloc[:, :-1].merge(
-                ts.iloc[:, -1].shift(freq=f"{self.offset_size}{self.offset_unit}"),
-                left_index=True,
-                right_index=True,
-            )
-        return ts
-    @abc.abstractmethod
-    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
-        pass
-_roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean()}
-class Roll(TimeSeriesBase, ParametrizedOperand):
-    aggregation: str
-    window_size: int = 1
-    window_unit: str = "D"
-    @validator("window_unit")
-    @classmethod
-    def validate_window_unit(cls, v: str) -> str:
-        try:
-            pd.tseries.frequencies.to_offset(v)
-            return v
-        except ValueError:
-            raise ValueError(
-                f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
-            )
-    def to_formula(self) -> str:
-        roll_component = f"roll_{self.window_size}{self.window_unit}"
-        if self.offset_size > 0:
-            roll_component += f"_offset_{self.offset_size}{self.offset_unit}"
-        return f"{roll_component}_{self.aggregation}"
-    @classmethod
-    def from_formula(cls, formula: str) -> Optional["Roll"]:
-        import re
-        # Try matching pattern with offset first
-        pattern_with_offset = r"^roll_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])_(\w+)$"
-        match_with_offset = re.match(pattern_with_offset, formula)
-        if match_with_offset:
-            window_size = int(match_with_offset.group(1))
-            window_unit = match_with_offset.group(2)
-            offset_size = int(match_with_offset.group(3))
-            offset_unit = match_with_offset.group(4)
-            aggregation = match_with_offset.group(5)
-            return cls(
-                window_size=window_size,
-                window_unit=window_unit,
-                offset_size=offset_size,
-                offset_unit=offset_unit,
-                aggregation=aggregation,
-            )
-        # If no offset pattern found, try basic pattern
-        pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
-        match = re.match(pattern, formula)
-        if not match:
-            return None
-        window_size = int(match.group(1))
-        window_unit = match.group(2)
-        aggregation = match.group(3)
-        return cls(window_size=window_size, window_unit=window_unit, aggregation=aggregation)
-    def get_params(self) -> Dict[str, Optional[str]]:
-        res = super().get_params()
-        res.update(
-            {
-                "window_size": self.window_size,
-                "window_unit": self.window_unit,
-                "aggregation": self.aggregation,
-            }
-        )
-        return res
-    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
-        return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=1).agg(
-            _roll_aggregations.get(self.aggregation, self.aggregation)
-        )
-class Lag(TimeSeriesBase, ParametrizedOperand):
-    lag_size: int
-    lag_unit: str = "D"
-    def to_formula(self) -> str:
-        lag_component = f"lag_{self.lag_size}{self.lag_unit}"
-        if self.offset_size > 0:
-            lag_component += f"_offset_{self.offset_size}{self.offset_unit}"
-        return lag_component
-    @classmethod
-    def from_formula(cls, formula: str) -> Optional["Lag"]:
-        import re
-        # Try matching pattern with offset first
-        pattern_with_offset = r"^lag_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])$"
-        match_with_offset = re.match(pattern_with_offset, formula)
-        if match_with_offset:
-            lag_size = int(match_with_offset.group(1))
-            lag_unit = match_with_offset.group(2)
-            offset_size = int(match_with_offset.group(3))
-            offset_unit = match_with_offset.group(4)
-            return cls(
-                lag_size=lag_size,
-                lag_unit=lag_unit,
-                offset_size=offset_size,
-                offset_unit=offset_unit,
-            )
-        # If no offset pattern found, try basic pattern
-        pattern = r"^lag_(\d+)([a-zA-Z])$"
-        match = re.match(pattern, formula)
-        if not match:
-            return None
-        lag_size = int(match.group(1))
-        lag_unit = match.group(2)
-        return cls(lag_size=lag_size, lag_unit=lag_unit)
-    def get_params(self) -> Dict[str, Optional[str]]:
-        res = super().get_params()
-        res.update(
-            {
-                "lag_size": self.lag_size,
-                "lag_unit": self.lag_unit,
-            }
-        )
-        return res
-    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
-        lag_window = self.lag_size + 1
-        return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=lag_window).agg(lambda x: x[0])

upgini/dataset.py CHANGED Viewed

@@ -40,7 +40,7 @@ from upgini.utils.email_utils import EmailSearchKeyConverter
 from upgini.utils.target_utils import (
     balance_undersample,
     balance_undersample_forced,
-    balance_undersample_time_series,
+    balance_undersample_time_series_trunc,
 )
 try:
@@ -58,6 +58,8 @@ class Dataset:  # (pd.DataFrame):
     FIT_SAMPLE_THRESHOLD = 200_000
     FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
     FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
+    FIT_SAMPLE_THRESHOLD_TS = 54_000
+    FIT_SAMPLE_ROWS_TS = 54_000
     BINARY_MIN_SAMPLE_THRESHOLD = 5_000
     MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
     IMBALANCE_THESHOLD = 0.6
@@ -301,7 +303,10 @@ class Dataset:  # (pd.DataFrame):
                 )
         # Resample over fit threshold
-        if not self.imbalanced and EVAL_SET_INDEX in self.data.columns:
+        if self.cv_type is not None and self.cv_type.is_time_series():
+            sample_threshold = self.FIT_SAMPLE_THRESHOLD_TS
+            sample_rows = self.FIT_SAMPLE_ROWS_TS
+        elif not self.imbalanced and EVAL_SET_INDEX in self.data.columns:
             sample_threshold = self.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD
             sample_rows = self.FIT_SAMPLE_WITH_EVAL_SET_ROWS
         else:
@@ -314,7 +319,7 @@ class Dataset:  # (pd.DataFrame):
                 f"and will be downsampled to {sample_rows}"
             )
             if self.cv_type is not None and self.cv_type.is_time_series():
-                resampled_data = balance_undersample_time_series(
+                resampled_data = balance_undersample_time_series_trunc(
                     df=self.data,
                     id_columns=self.id_columns,
                     date_column=next(
@@ -584,10 +589,7 @@ class Dataset:  # (pd.DataFrame):
         return search_customization
     def _rename_generate_features(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
-        if (
-            runtime_parameters is not None
-            and runtime_parameters.properties is not None
-        ):
+        if runtime_parameters is not None and runtime_parameters.properties is not None:
             if "generate_features" in runtime_parameters.properties:
                 generate_features = runtime_parameters.properties["generate_features"].split(",")
                 renamed_generate_features = []
@@ -607,6 +609,13 @@ class Dataset:  # (pd.DataFrame):
         return runtime_parameters
+    def _set_sample_size(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
+        if runtime_parameters is not None and runtime_parameters.properties is not None:
+            if self.cv_type is not None and self.cv_type.is_time_series():
+                runtime_parameters.properties["sample_size"] = self.FIT_SAMPLE_ROWS_TS
+                runtime_parameters.properties["iter0_sample_size"] = self.FIT_SAMPLE_ROWS_TS
+        return runtime_parameters
     def _clean_generate_features(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
         if (
             runtime_parameters is not None
@@ -638,6 +647,7 @@ class Dataset:  # (pd.DataFrame):
         file_metrics = FileMetrics()
         runtime_parameters = self._rename_generate_features(runtime_parameters)
+        runtime_parameters = self._set_sample_size(runtime_parameters)
         file_metadata = self.__construct_metadata(exclude_features_sources)
         search_customization = self.__construct_search_customization(

upgini/features_enricher.py CHANGED Viewed

@@ -31,7 +31,7 @@ from sklearn.exceptions import NotFittedError
 from sklearn.model_selection import BaseCrossValidator
 from upgini.autofe.feature import Feature
-from upgini.autofe.vector import TimeSeriesBase
+from upgini.autofe.timeseries import TimeSeriesBase
 from upgini.data_source.data_source_publisher import CommercialSchema
 from upgini.dataset import Dataset
 from upgini.errors import HttpError, ValidationError

upgini/utils/target_utils.py CHANGED Viewed

@@ -9,6 +9,7 @@ from upgini.errors import ValidationError
 from upgini.metadata import SYSTEM_RECORD_ID, CVType, ModelTaskType
 from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
 from upgini.sampler.random_under_sampler import RandomUnderSampler
+from upgini.utils.ts_utils import get_most_frequent_time_unit, trunc_datetime
 TS_MIN_DIFFERENT_IDS_RATIO = 0.2
@@ -240,7 +241,7 @@ def balance_undersample_forced(
     df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
     if cv_type is not None and cv_type.is_time_series():
         logger.warning(f"Sampling time series dataset from {len(df)} to {sample_size}")
-        resampled_data = balance_undersample_time_series(
+        resampled_data = balance_undersample_time_series_trunc(
             df,
             id_columns=id_columns,
             date_column=date_column,
@@ -279,6 +280,58 @@ def balance_undersample_forced(
     return resampled_data
+DEFAULT_HIGH_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=2, months=6), pd.DateOffset(years=2, days=7)]
+DEFAULT_LOW_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=7), pd.DateOffset(years=5)]
+DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)
+def balance_undersample_time_series_trunc(
+    df: pd.DataFrame,
+    id_columns: List[str],
+    date_column: str,
+    sample_size: int,
+    random_state: int = 42,
+    logger: Optional[logging.Logger] = None,
+    highfreq_trunc_lengths: List[pd.DateOffset] = DEFAULT_HIGH_FREQ_TRUNC_LENGTHS,
+    lowfreq_trunc_lengths: List[pd.DateOffset] = DEFAULT_LOW_FREQ_TRUNC_LENGTHS,
+    time_unit_threshold: pd.Timedelta = DEFAULT_TIME_UNIT_THRESHOLD,
+    **kwargs,
+):
+    # Convert date column to datetime
+    dates_df = df[id_columns + [date_column]].copy()
+    dates_df[date_column] = pd.to_datetime(dates_df[date_column], unit="ms")
+    time_unit = get_most_frequent_time_unit(dates_df, id_columns, date_column)
+    if logger is not None:
+        logger.info(f"Time unit: {time_unit}")
+    if time_unit is None:
+        if logger is not None:
+            logger.info("Cannot detect time unit, returning original dataset")
+        return df
+    if time_unit < time_unit_threshold:
+        for trunc_length in highfreq_trunc_lengths:
+            sampled_df = trunc_datetime(dates_df, id_columns, date_column, trunc_length, logger=logger)
+            if len(sampled_df) <= sample_size:
+                break
+        if len(sampled_df) > sample_size:
+            sampled_df = balance_undersample_time_series(
+                sampled_df, id_columns, date_column, sample_size, random_state, logger=logger, **kwargs
+            )
+    else:
+        for trunc_length in lowfreq_trunc_lengths:
+            sampled_df = trunc_datetime(dates_df, id_columns, date_column, trunc_length, logger=logger)
+            if len(sampled_df) <= sample_size:
+                break
+        if len(sampled_df) > sample_size:
+            sampled_df = balance_undersample_time_series(
+                sampled_df, id_columns, date_column, sample_size, random_state, logger=logger, **kwargs
+            )
+    return df.loc[sampled_df.index]
 def balance_undersample_time_series(
     df: pd.DataFrame,
     id_columns: List[str],

upgini/utils/ts_utils.py ADDED Viewed

@@ -0,0 +1,41 @@
+import logging
+from typing import List, Optional
+import pandas as pd
+def get_most_frequent_time_unit(df: pd.DataFrame, id_columns: List[str], date_column: str) -> Optional[pd.DateOffset]:
+    def closest_unit(diff):
+        return pd.tseries.frequencies.to_offset(pd.Timedelta(diff, unit="s"))
+    all_diffs = []
+    groups = df.groupby(id_columns) if id_columns else [(None, df)]
+    for _, group in groups:
+        group_dates = group[date_column].sort_values().unique()
+        if len(group_dates) > 1:
+            diff_series = pd.Series(group_dates[1:] - group_dates[:-1])
+            diff_ns = diff_series.dt.total_seconds()
+            all_diffs.extend(diff_ns)
+    all_diffs = pd.Series(all_diffs)
+    most_frequent_unit = all_diffs.apply(closest_unit).mode().min()
+    return most_frequent_unit if isinstance(most_frequent_unit, pd.DateOffset) else None
+def trunc_datetime(
+    df: pd.DataFrame,
+    id_columns: List[str],
+    date_column: str,
+    length: pd.DateOffset,
+    logger: Optional[logging.Logger] = None,
+) -> pd.DataFrame:
+    if logger is not None:
+        logger.info(f"Truncating time series dataset to {length}")
+    if id_columns:
+        min_datetime = df.groupby(id_columns)[date_column].transform(lambda group: group.max() - length)
+    else:
+        min_datetime = df[date_column].max() - length
+    return df[df[date_column] > min_datetime]

{upgini-1.2.61.dist-info → upgini-1.2.62a3818.dev1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.61
+Version: 1.2.62a3818.dev1
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/

{upgini-1.2.61.dist-info → upgini-1.2.62a3818.dev1.dist-info}/RECORD RENAMED Viewed

@@ -1,9 +1,9 @@
-upgini/__about__.py,sha256=17s3XgKQ6UgMiFGNXwnQprj1EsjPUiE6QGnAzyDIfhs,23
+upgini/__about__.py,sha256=-inFSOjK0otU7oAU9xIxafvjGaGWyHQqEAz5nWw5yqI,33
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
-upgini/dataset.py,sha256=NP5vHqEfZQ1HWz3TcNAa_OhXG8wiMRdydm26D6UBiRU,34166
+upgini/dataset.py,sha256=OGjpeFHbj3lWiZTOHTpWEoMMDmFY1FlNC44FKktoZvU,34956
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=2AMEXtoMrEFw3f0b0CsvkFyS1a7L4aqI2GO_fCsgWac,205336
+upgini/features_enricher.py,sha256=cB2I5rNpbztjkYEEW5aJuKj2fCMnfxp40X4Eo63oyuQ,205340
 upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
 upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
 upgini/metadata.py,sha256=Jh6YTaS00m_nbaOY_owvlSyn9zgkErkqu8iTr9ZjKI8,12279
@@ -14,14 +14,15 @@ upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1
 upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
 upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
 upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-upgini/autofe/all_operands.py,sha256=v0_NozalvvzeojSAA0d7UJ5INS654ZVaLn4S8djK6Ac,329
-upgini/autofe/binary.py,sha256=zMhtHVuGUAFLUqem-XiXqJj-GRXxS88tdz8tFuDfSNM,7659
-upgini/autofe/date.py,sha256=pqwwk4_35RYXDT2fSJ9dlxGBm-R0jWBeiSb-79hZjkI,10721
-upgini/autofe/feature.py,sha256=zvRdlxCkaOsX0XiragNvh0tAPyOWut0MQTq5JGU5HtY,14749
-upgini/autofe/groupby.py,sha256=G48_sQZw016eGx3cOy8YQrEIOp95puWqYUpFWd-gdeM,3595
-upgini/autofe/operand.py,sha256=8Ttrfxv_H91dMbS7J55zxluzAJHfGXU_Y2xCh4OHwb8,4774
-upgini/autofe/unary.py,sha256=T3E7F3dA_7o_rkdCFq7JV6nHLzcoHLHQTcxO7y5Opa4,4646
-upgini/autofe/vector.py,sha256=udkg4pP7IIeLjt0Cg6rzEKUmGaubOnqsEz3bz9R6E44,7110
+upgini/autofe/all_operands.py,sha256=VIT5jCq5U-qypdNz1MIQ_hlIAs0ujJgRfKRUkU24nFs,332
+upgini/autofe/binary.py,sha256=jsXa_zwlNWRmQAT5qipzU2Or03qae-a1kkY9yDECkq8,7660
+upgini/autofe/date.py,sha256=bmoXU5vlDa1xsfCIFEC_VMRHOnV8Sy_KUMshqh0ARvA,10722
+upgini/autofe/feature.py,sha256=n4sNNFM9b022AGJbW14AMRuERD9bwub-RWqa6hfLID0,14750
+upgini/autofe/groupby.py,sha256=NN0T-tYbTHQDeCi2UZ06wVkDflm8DJBV4rdGrrVyVEE,3596
+upgini/autofe/operator.py,sha256=VCGDUQ5bOtwX-jzmgHDrKF3GbglDumyEkvtLWTmSGQo,4776
+upgini/autofe/timeseries.py,sha256=Pci7kNpFcViNZdIHlVTyxjoxzcMVdqUPopbPrJ3hE20,6593
+upgini/autofe/unary.py,sha256=my7AYIrWCQPFxRtcphONmwieU5HpX4fHiKllFRCsMUk,4647
+upgini/autofe/vector.py,sha256=5Lx2q_Np9PrMtZ_8O86xywq0s4XSQbooHxK3ufo3ANU,664
 upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/data_source/data_source_publisher.py,sha256=4S9qwlAklD8vg9tUU_c1pHE2_glUHAh15-wr5hMwKFw,22879
 upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
@@ -58,10 +59,11 @@ upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml
 upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
 upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
 upgini/utils/sort.py,sha256=w-CoT33W_53ekOROpKI_VRsRmiyWNr2b3IpE5_4MLLA,6395
-upgini/utils/target_utils.py,sha256=VsMdlS04_9SHlB2DPfSWTeqjc2JoXR5OPvu4qmvkmkg,14347
+upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,16579
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
+upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.61.dist-info/METADATA,sha256=hH2eL4JHq8BjVpY3ZNFYDqUtKs5psdoiVM5jiXjs0yU,49084
-upgini-1.2.61.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-upgini-1.2.61.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.61.dist-info/RECORD,,
+upgini-1.2.62a3818.dev1.dist-info/METADATA,sha256=9mRM2yQ18CeOTHQ83UgVmItZ-npsZSla3illeXSpyTQ,49094
+upgini-1.2.62a3818.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
+upgini-1.2.62a3818.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.62a3818.dev1.dist-info/RECORD,,

{upgini-1.2.61.dist-info → upgini-1.2.62a3818.dev1.dist-info}/WHEEL RENAMED Viewed

@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: hatchling 1.25.0
+Generator: hatchling 1.24.2
 Root-Is-Purelib: true
 Tag: py3-none-any

{upgini-1.2.61.dist-info → upgini-1.2.62a3818.dev1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

upgini 1.2.61__py3-none-any.whl → 1.2.62a3818.dev1__py3-none-any.whl

Potentially problematic release.

upgini 1.2.61__py3-none-any.whl → 1.2.62a3818.dev1__py3-none-any.whl