upgini-1.2.67-py3-none-any.whl → upgini-1.2.68a3818.dev1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of upgini might be problematic.

upgini/__about__.py CHANGED
@@ -1 +1 @@
-__version__ = "1.2.67"
+__version__ = "1.2.68a3818.dev1"
upgini/autofe/date.py CHANGED
@@ -8,6 +8,7 @@ from pandas.core.arrays.timedeltas import TimedeltaArray
 from pydantic import BaseModel, __version__ as pydantic_version
 
 from upgini.autofe.operator import PandasOperator, ParametrizedOperator
+from upgini.autofe.utils import pydantic_validator
 
 
 def get_pydantic_version():
@@ -209,6 +210,14 @@ class DateListDiffBounded(DateListDiff, ParametrizedOperator):
 
         return cls(diff_unit=diff_unit, lower_bound=lower_bound, upper_bound=upper_bound, aggregation=aggregation)
 
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = super().get_params()
+        if self.lower_bound is not None:
+            res["lower_bound"] = str(self.lower_bound)
+        if self.upper_bound is not None:
+            res["upper_bound"] = str(self.upper_bound)
+        return res
+
     def _agg(self, x):
        x = x[
            (x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
@@ -269,32 +278,17 @@ class DatePercentile(DatePercentileBase):
             {
                 "zero_month": self.zero_month,
                 "zero_year": self.zero_year,
-                "zero_bounds": self.zero_bounds,
+                "zero_bounds": json.dumps(self.zero_bounds),
                 "step": self.step,
             }
         )
         return res
 
-    # Check Pydantic version
-    if get_pydantic_version() >= 2:
-        # Use @field_validator for Pydantic 2.x
-        from pydantic import field_validator
-
-        @field_validator("zero_bounds", mode="before")
-        def parse_zero_bounds(cls, value):
-            if isinstance(value, str):
-                return json.loads(value)
-            return value
-
-    else:
-        # Use @validator for Pydantic 1.x
-        from pydantic import validator
-
-        @validator("zero_bounds", pre=True)
-        def parse_zero_bounds(cls, value):
-            if isinstance(value, str):
-                return json.loads(value)
-            return value
+    @pydantic_validator("zero_bounds", mode="before")
+    def parse_zero_bounds(cls, value):
+        if isinstance(value, str):
+            return json.loads(value)
+        return value
 
     def _get_bounds(self, date_col: pd.Series) -> pd.Series:
         months = date_col.dt.month
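
The `zero_bounds` change above means `get_params` now emits a JSON string while `parse_zero_bounds` accepts either a string or a list, so serialized operator params can be fed straight back into the model. A minimal sketch of that contract on a toy model (`ZeroBoundsDemo` is hypothetical; the real `DatePercentile` has more fields):

```python
import json

from pydantic import BaseModel

from upgini.autofe.utils import pydantic_validator


class ZeroBoundsDemo(BaseModel):
    # Stand-in for DatePercentile.zero_bounds (demo model, not the real class).
    zero_bounds: list = []

    @pydantic_validator("zero_bounds", mode="before")
    def parse_zero_bounds(cls, value):
        # Accept the JSON string produced by get_params() as well as a plain list.
        if isinstance(value, str):
            return json.loads(value)
        return value


serialized = json.dumps([0.1, 0.5, 0.9])           # what get_params() now emits
restored = ZeroBoundsDemo(zero_bounds=serialized)  # validator parses it back
assert restored.zero_bounds == [0.1, 0.5, 0.9]
```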
upgini/autofe/feature.py CHANGED
@@ -112,7 +112,11 @@ class Feature:
 
     def get_hash(self) -> str:
         return hashlib.sha256(
-            "_".join([self.op.get_hash_component()] + [ch.get_display_name() for ch in self.children]).encode("utf-8")
+            "_".join(
+                [self.op.get_hash_component()]
+                + [ch.op.get_hash_component() for ch in self.children if isinstance(ch, Feature)]
+                + [ch.get_display_name() for ch in self.children]
+            ).encode("utf-8")
         ).hexdigest()[:8]
 
     def set_alias(self, alias: str) -> "Feature":
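
With child operator hash components mixed in, two features that share child display names but differ in child operators now hash differently. A toy illustration of the same recipe (hashlib only; the part strings are hypothetical):

```python
import hashlib


def short_hash(parts):
    # Same construction as Feature.get_hash: join with "_", sha256, first 8 hex chars.
    return hashlib.sha256("_".join(parts).encode("utf-8")).hexdigest()[:8]


# Before: only the parent op component and child display names participated.
old = short_hash(["norm", "f1", "f2"])
# After: child operator hash components are included as well.
new = short_hash(["norm", "abs", "log", "f1", "f2"])
print(old, new)  # two different 8-char digests
```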
upgini/autofe/timeseries/cross.py CHANGED
@@ -1,16 +1,13 @@
+import json
 from typing import Dict, List, Optional
 
 import numpy as np
 import pandas as pd
 
-try:
-    from pydantic import field_validator as validator  # V2
-except ImportError:
-    from pydantic import validator  # V1
-
 from upgini.autofe.all_operators import find_op
 from upgini.autofe.operator import PandasOperator, ParametrizedOperator
 from upgini.autofe.timeseries.base import TimeSeriesBase
+from upgini.autofe.utils import pydantic_validator
 
 
 class CrossSeriesInteraction(TimeSeriesBase, ParametrizedOperator):
@@ -20,13 +17,24 @@ class CrossSeriesInteraction(TimeSeriesBase, ParametrizedOperator):
     left_descriptor: List[str] = []
     right_descriptor: List[str] = []
 
-    @validator("descriptor_indices")
-    @classmethod
+    @pydantic_validator("descriptor_indices")
     def validate_descriptor_indices(cls, v):
         if not v:
             raise ValueError("descriptor_indices cannot be empty for CrossSeriesInteraction")
         return v
 
+    @pydantic_validator("left_descriptor", "right_descriptor", mode="before")
+    def parse_descriptors(cls, v):
+        if isinstance(v, str):
+            return json.loads(v)
+        return v
+
+    @pydantic_validator("interaction_op", mode="before")
+    def validate_interaction_op(cls, v):
+        if isinstance(v, str):
+            return find_op(v)
+        return v
+
     def __init__(self, **data):
         super().__init__(**data)
         indices = self.descriptor_indices
upgini/autofe/timeseries/roll.py CHANGED
@@ -3,6 +3,7 @@ from typing import Dict, Optional
 
 from upgini.autofe.operator import ParametrizedOperator
 from upgini.autofe.timeseries.base import TimeSeriesBase
+from upgini.autofe.utils import pydantic_validator
 
 # Roll aggregation functions
 roll_aggregations = {
@@ -12,19 +13,13 @@ roll_aggregations = {
     "iqr": lambda x: x.quantile(0.75) - x.quantile(0.25),
 }
 
-try:
-    from pydantic import field_validator as validator  # V2
-except ImportError:
-    from pydantic import validator  # V1
-
 
 class Roll(TimeSeriesBase, ParametrizedOperator):
     aggregation: str
     window_size: int = 1
     window_unit: str = "D"
 
-    @validator("window_unit")
-    @classmethod
+    @pydantic_validator("window_unit")
     def validate_window_unit(cls, v: str) -> str:
         try:
             pd.tseries.frequencies.to_offset(v)
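
For reference, the `window_unit` validator simply defers to pandas offset parsing, so anything `to_offset` understands is accepted. A quick illustration of that standard pandas behavior:

```python
import pandas as pd

pd.tseries.frequencies.to_offset("D")   # <Day> - accepted as a window unit
pd.tseries.frequencies.to_offset("7D")  # multiples parse too
try:
    pd.tseries.frequencies.to_offset("not-a-unit")
except ValueError:
    print("rejected - the Roll validator would raise here")
```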
upgini/autofe/utils.py ADDED
@@ -0,0 +1,83 @@
+"""
+Utility functions for autofe module.
+"""
+
+import functools
+from typing import Callable
+
+
+def get_pydantic_version():
+    """
+    Get the major version of pydantic.
+
+    Returns:
+        int: Major version number (1 or 2)
+    """
+    try:
+        from pydantic import __version__ as pydantic_version
+
+        major_version = int(pydantic_version.split(".")[0])
+        return major_version
+    except (ImportError, ValueError):
+        # Default to version 1 if unable to determine
+        return 1
+
+
+def pydantic_validator(field_name: str, *fields, mode: str = "before", **kwargs):
+    """
+    A decorator that applies the appropriate Pydantic validator based on the installed version.
+
+    This decorator handles the differences between Pydantic v1 and v2 validator syntax,
+    making it easier to write code that works with both versions.
+
+    Args:
+        field_name (str): The name of the field to validate
+        mode (str): The validation mode, either "before" or "after" (for Pydantic v2)
+        **kwargs: Additional arguments to pass to the validator
+
+    Returns:
+        Callable: A decorator that can be applied to validator methods
+
+    Example:
+        ```python
+        class MyModel(BaseModel):
+            items: List[int]
+
+            @pydantic_validator("items")
+            def parse_items(cls, value):
+                if isinstance(value, str):
+                    return [int(x) for x in value.split(",")]
+                return value
+        ```
+    """
+    pydantic_version = get_pydantic_version()
+
+    if pydantic_version >= 2:
+        # Use field_validator for Pydantic 2.x
+        from pydantic import field_validator
+
+        def decorator(func: Callable) -> Callable:
+            @field_validator(field_name, *fields, mode=mode, **kwargs)
+            @functools.wraps(func)
+            def wrapper(cls, value, **kw):
+                return func(cls, value)
+
+            return wrapper
+
+        return decorator
+    else:
+        # Use validator for Pydantic 1.x
+        from pydantic import validator
+
+        # Map mode to Pydantic v1 parameters
+        pre = True if mode == "before" else False
+
+        def decorator(func: Callable) -> Callable:
+            @validator(field_name, *fields, pre=pre, **kwargs)
+            @functools.wraps(func)
+            def wrapper(cls, value, **kw):
+                return func(cls, value)
+
+            return wrapper
+
+        return decorator
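
Since the point of the new helper is version portability, one decorated method covers both installs: under Pydantic 2.x it expands to `field_validator(..., mode=...)`, under 1.x to `validator(..., pre=...)`. A small self-contained check, mirroring the docstring example above (`Window` is an illustrative model):

```python
from typing import List

from pydantic import BaseModel

from upgini.autofe.utils import pydantic_validator


class Window(BaseModel):
    sizes: List[int] = []

    # One definition: becomes field_validator(mode="before") on Pydantic 2.x
    # and validator(pre=True) on Pydantic 1.x.
    @pydantic_validator("sizes", mode="before")
    def parse_sizes(cls, value):
        if isinstance(value, str):
            return [int(x) for x in value.split(",")]
        return value


print(Window(sizes="7,14,28").sizes)  # [7, 14, 28] under either major version
```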
upgini/features_enricher.py CHANGED
@@ -702,6 +702,7 @@ class FeaturesEnricher(TransformerMixin):
     def transform(
         self,
         X: pd.DataFrame,
+        y: Optional[pd.Series] = None,
         *args,
         exclude_features_sources: Optional[List[str]] = None,
         keep_input: bool = True,
@@ -766,6 +767,7 @@ class FeaturesEnricher(TransformerMixin):
         result, _, _ = self.__inner_transform(
             trace_id,
             X,
+            y=y,
             exclude_features_sources=exclude_features_sources,
             importance_threshold=importance_threshold,
             max_features=max_features,
@@ -1682,7 +1684,6 @@ class FeaturesEnricher(TransformerMixin):
             validated_X,
             validated_y,
             eval_set,
-            is_demo_dataset,
             exclude_features_sources,
             trace_id,
             progress_bar,
@@ -1873,158 +1874,147 @@ class FeaturesEnricher(TransformerMixin):
         validated_X: pd.DataFrame,
         validated_y: pd.Series,
         eval_set: Optional[List[tuple]],
-        is_demo_dataset: bool,
         exclude_features_sources: Optional[List[str]],
         trace_id: str,
         progress_bar: Optional[ProgressBar],
         progress_callback: Optional[Callable[[SearchProgress], Any]],
     ) -> _SampledDataForMetrics:
-        eval_set_sampled_dict = {}
-        if eval_set is not None:
-            self.logger.info("Transform with eval_set")
-            # concatenate X and eval_set with eval_set_index
-            df = validated_X.copy()
-            df[TARGET] = validated_y
-            df[EVAL_SET_INDEX] = 0
-            for idx, eval_pair in enumerate(eval_set):
-                eval_x, eval_y = self._validate_eval_set_pair(validated_X, eval_pair)
-                eval_df_with_index = eval_x.copy()
-                eval_df_with_index[TARGET] = eval_y
-                eval_df_with_index[EVAL_SET_INDEX] = idx + 1
-                df = pd.concat([df, eval_df_with_index])
-
-            df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
-
-            # downsample if need to eval_set threshold
-            num_samples = _num_samples(df)
-            force_downsampling = (
-                not self.disable_force_downsampling
-                and self.columns_for_online_api is not None
-                and num_samples > Dataset.FORCE_SAMPLE_SIZE
-            )
-            # TODO: check that system_record_id was added before this step
-            if force_downsampling:
-                self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
-                df = balance_undersample_forced(
-                    df=df,
-                    target_column=TARGET,
-                    id_columns=self.id_columns,
-                    date_column=self._get_date_column(self.search_keys),
-                    task_type=self.model_task_type,
-                    cv_type=self.cv,
-                    random_state=self.random_state,
-                    sample_size=Dataset.FORCE_SAMPLE_SIZE,
-                    logger=self.logger,
-                    bundle=self.bundle,
-                    warning_callback=self.__log_warning,
-                )
-            elif num_samples > Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD:
-                self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS}")
-                df = df.sample(n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state)
+        has_eval_set = eval_set is not None
 
-            eval_set_sampled_dict = {}
+        self.logger.info(f"Transform {'with' if has_eval_set else 'without'} eval_set")
 
-            tmp_target_name = "__target"
-            df = df.rename(columns={TARGET: tmp_target_name})
+        # Prepare
+        df = self.__combine_train_and_eval_sets(validated_X, validated_y, eval_set)
+        df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
+        df = self.__downsample_for_metrics(df)
 
-            enriched_df, columns_renaming, generated_features = self.__inner_transform(
-                trace_id,
-                df,
-                exclude_features_sources=exclude_features_sources,
-                silent_mode=True,
-                metrics_calculation=True,
-                progress_bar=progress_bar,
-                progress_callback=progress_callback,
-                add_fit_system_record_id=True,
-                target_name=tmp_target_name,
-            )
-            if enriched_df is None:
-                return None
+        # Transform
 
-            enriched_df = enriched_df.rename(columns={tmp_target_name: TARGET})
+        enriched_df, _, _ = self.__inner_transform(
+            trace_id,
+            X=df.drop(columns=[TARGET]),
+            y=df[TARGET],
+            exclude_features_sources=exclude_features_sources,
+            silent_mode=True,
+            metrics_calculation=True,
+            progress_bar=progress_bar,
+            progress_callback=progress_callback,
+            add_fit_system_record_id=True,
+        )
+        if enriched_df is None:
+            return None
 
-            x_columns = [
-                c
-                for c in (validated_X.columns.tolist() + generated_features + [SYSTEM_RECORD_ID])
-                if c in enriched_df.columns
-            ]
+        x_columns = [
+            c
+            for c in (validated_X.columns.tolist() + self.fit_generated_features + [SYSTEM_RECORD_ID])
+            if c in enriched_df.columns
+        ]
 
-            enriched_Xy = enriched_df.query(f"{EVAL_SET_INDEX} == 0")
-            X_sampled = enriched_Xy[x_columns].copy()
-            y_sampled = enriched_Xy[TARGET].copy()
-            enriched_X = enriched_Xy.drop(columns=[TARGET, EVAL_SET_INDEX])
-            enriched_X_columns = enriched_X.columns.tolist()
+        X_sampled, y_sampled, enriched_X = self.__extract_train_data(enriched_df, x_columns)
+        eval_set_sampled_dict = self.__extract_eval_data(
+            enriched_df, x_columns, enriched_X.columns.tolist(), len(eval_set) if has_eval_set else 0
+        )
 
-            for idx in range(len(eval_set)):
-                enriched_eval_xy = enriched_df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
-                eval_x_sampled = enriched_eval_xy[x_columns].copy()
-                eval_y_sampled = enriched_eval_xy[TARGET].copy()
-                enriched_eval_x = enriched_eval_xy[enriched_X_columns].copy()
-                eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
-        else:
-            self.logger.info("Transform without eval_set")
-            df = validated_X.copy()
+        # Cache and return results
+        return self.__cache_and_return_results(
+            validated_X, validated_y, eval_set, X_sampled, y_sampled, enriched_X, eval_set_sampled_dict
+        )
 
-            df[TARGET] = validated_y
+    def __combine_train_and_eval_sets(
+        self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]]
+    ) -> pd.DataFrame:
+        df = validated_X.copy()
+        df[TARGET] = validated_y
+        if eval_set is None:
+            return df
 
-            df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
+        df[EVAL_SET_INDEX] = 0
 
-            num_samples = _num_samples(df)
-            force_downsampling = (
-                not self.disable_force_downsampling
-                and self.columns_for_online_api is not None
-                and num_samples > Dataset.FORCE_SAMPLE_SIZE
-            )
+        for idx, eval_pair in enumerate(eval_set):
+            eval_x, eval_y = self._validate_eval_set_pair(validated_X, eval_pair)
+            eval_df_with_index = eval_x.copy()
+            eval_df_with_index[TARGET] = eval_y
+            eval_df_with_index[EVAL_SET_INDEX] = idx + 1
+            df = pd.concat([df, eval_df_with_index])
 
-            if force_downsampling:
-                self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
-                df = balance_undersample_forced(
-                    df=df,
-                    target_column=TARGET,
-                    id_columns=self.id_columns,
-                    date_column=self._get_date_column(self.search_keys),
-                    task_type=self.model_task_type,
-                    cv_type=self.cv,
-                    random_state=self.random_state,
-                    sample_size=Dataset.FORCE_SAMPLE_SIZE,
-                    logger=self.logger,
-                    bundle=self.bundle,
-                    warning_callback=self.__log_warning,
-                )
-            elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
-                self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
-                df = df.sample(n=Dataset.FIT_SAMPLE_ROWS, random_state=self.random_state)
+        return df
 
-            tmp_target_name = "__target"
-            df = df.rename(columns={TARGET: tmp_target_name})
+    def __downsample_for_metrics(self, df: pd.DataFrame) -> pd.DataFrame:
+        num_samples = _num_samples(df)
+        force_downsampling = (
+            not self.disable_force_downsampling
+            and self.columns_for_online_api is not None
+            and num_samples > Dataset.FORCE_SAMPLE_SIZE
+        )
 
-            enriched_Xy, columns_renaming, generated_features = self.__inner_transform(
-                trace_id,
-                df,
-                exclude_features_sources=exclude_features_sources,
-                silent_mode=True,
-                metrics_calculation=True,
-                progress_bar=progress_bar,
-                progress_callback=progress_callback,
-                add_fit_system_record_id=True,
-                target_name=tmp_target_name,
+        if force_downsampling:
+            self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
+            return balance_undersample_forced(
+                df=df,
+                target_column=TARGET,
+                id_columns=self.id_columns,
+                date_column=self._get_date_column(self.search_keys),
+                task_type=self.model_task_type,
+                cv_type=self.cv,
+                random_state=self.random_state,
+                sample_size=Dataset.FORCE_SAMPLE_SIZE,
+                logger=self.logger,
+                bundle=self.bundle,
+                warning_callback=self.__log_warning,
             )
-            if enriched_Xy is None:
-                return None
+        elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
+            if EVAL_SET_INDEX in df.columns:
+                threshold = Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD
+                sample_size = Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS
+            else:
+                threshold = Dataset.FIT_SAMPLE_THRESHOLD
+                sample_size = Dataset.FIT_SAMPLE_ROWS
 
-            enriched_Xy = enriched_Xy.rename(columns={tmp_target_name: TARGET})
+            if num_samples > threshold:
+                self.logger.info(f"Downsampling from {num_samples} to {sample_size}")
+                return df.sample(n=sample_size, random_state=self.random_state)
 
-            x_columns = [
-                c
-                for c in (validated_X.columns.tolist() + generated_features + [SYSTEM_RECORD_ID])
-                if c in enriched_Xy.columns
-            ]
+        return df
+
+    def __extract_train_data(
+        self, enriched_df: pd.DataFrame, x_columns: List[str]
+    ) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
+        if EVAL_SET_INDEX in enriched_df.columns:
+            enriched_Xy = enriched_df.query(f"{EVAL_SET_INDEX} == 0")
+        else:
+            enriched_Xy = enriched_df
+        X_sampled = enriched_Xy[x_columns].copy()
+        y_sampled = enriched_Xy[TARGET].copy()
+        enriched_X = enriched_Xy.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
+        return X_sampled, y_sampled, enriched_X
+
+    def __extract_eval_data(
+        self, enriched_df: pd.DataFrame, x_columns: List[str], enriched_X_columns: List[str], eval_set_len: int
+    ) -> Dict[int, Tuple]:
+        eval_set_sampled_dict = {}
 
-            X_sampled = enriched_Xy[x_columns].copy()
-            y_sampled = enriched_Xy[TARGET].copy()
-            enriched_X = enriched_Xy.drop(columns=TARGET)
+        for idx in range(eval_set_len):
+            enriched_eval_xy = enriched_df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
+            eval_x_sampled = enriched_eval_xy[x_columns].copy()
+            eval_y_sampled = enriched_eval_xy[TARGET].copy()
+            enriched_eval_x = enriched_eval_xy[enriched_X_columns].copy()
+            eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
 
+        return eval_set_sampled_dict
+
+    def __cache_and_return_results(
+        self,
+        validated_X: pd.DataFrame,
+        validated_y: pd.Series,
+        eval_set: Optional[List[tuple]],
+        X_sampled: pd.DataFrame,
+        y_sampled: pd.Series,
+        enriched_X: pd.DataFrame,
+        eval_set_sampled_dict: Dict[int, Tuple],
+    ) -> _SampledDataForMetrics:
         datasets_hash = hash_input(validated_X, validated_y, eval_set)
+        columns_renaming = getattr(self, "fit_columns_renaming", {})
+
         self.__cached_sampled_datasets[datasets_hash] = (
             X_sampled,
             y_sampled,
@@ -2161,6 +2151,7 @@ if response.status_code == 200:
         trace_id: str,
         X: pd.DataFrame,
         *,
+        y: Optional[pd.Series] = None,
         exclude_features_sources: Optional[List[str]] = None,
         importance_threshold: Optional[float] = None,
         max_features: Optional[int] = None,
@@ -2179,8 +2170,14 @@ if response.status_code == 200:
         self.logger.info("Start transform")
 
         validated_X = self._validate_X(X, is_transform=True)
+        if y is not None:
+            validated_y = self._validate_y(validated_X, y)
+            df = self.__combine_train_and_eval_sets(validated_X, validated_y, eval_set=None)
+        else:
+            validated_y = None
+            df = validated_X
 
-        self.__log_debug_information(validated_X, exclude_features_sources=exclude_features_sources)
+        self.__log_debug_information(validated_X, validated_y, exclude_features_sources=exclude_features_sources)
 
         self.__validate_search_keys(self.search_keys, self.search_id)
 
@@ -2223,29 +2220,27 @@ if response.status_code == 200:
             self.logger.info(msg)
             print(msg)
 
-        is_demo_dataset = hash_input(validated_X) in DEMO_DATASET_HASHES
+        is_demo_dataset = hash_input(df) in DEMO_DATASET_HASHES
 
         columns_to_drop = [
-            c for c in validated_X.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
+            c for c in df.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
        ]
         if len(columns_to_drop) > 0:
             msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
             self.logger.warning(msg)
             print(msg)
-            validated_X = validated_X.drop(columns=columns_to_drop)
+            df = df.drop(columns=columns_to_drop)
 
         search_keys = self.search_keys.copy()
         if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
-            self.search_keys.update(
+            search_keys.update(
                 {col: SearchKey.CUSTOM_KEY for col in self.id_columns if col not in self.search_keys}
             )
 
         search_keys = self.__prepare_search_keys(
-            validated_X, search_keys, is_demo_dataset, is_transform=True, silent_mode=silent_mode
+            df, search_keys, is_demo_dataset, is_transform=True, silent_mode=silent_mode
         )
 
-        df = validated_X.copy()
-
         df = self.__handle_index_search_keys(df, search_keys)
 
         if DEFAULT_INDEX in df.columns:
@@ -2284,8 +2279,11 @@ if response.status_code == 200:
         features_for_transform = self._search_task.get_features_for_transform() or []
         if len(features_for_transform) > 0:
             missing_features_for_transform = [
-                columns_renaming.get(f) for f in features_for_transform if f not in df.columns
+                columns_renaming.get(f) or f for f in features_for_transform if f not in df.columns
             ]
+            if TARGET in missing_features_for_transform:
+                raise ValidationError(self.bundle.get("missing_target_for_transform"))
+
             if len(missing_features_for_transform) > 0:
                 raise ValidationError(
                     self.bundle.get("missing_features_for_transform").format(missing_features_for_transform)
@@ -2341,11 +2339,10 @@ if response.status_code == 200:
             converter = PostalCodeSearchKeyConverter(postal_code)
             df = converter.convert(df)
 
-        # generated_features = [f for f in generated_features if f in self.fit_generated_features]
+        meaning_types = {}
+        meaning_types.update({col: FileColumnMeaningType.FEATURE for col in features_for_transform})
+        meaning_types.update({col: key.value for col, key in search_keys.items()})
 
-        meaning_types = {col: key.value for col, key in search_keys.items()}
-        for col in features_for_transform:
-            meaning_types[col] = FileColumnMeaningType.FEATURE
         features_not_to_pass = [
             c
             for c in df.columns
@@ -2354,13 +2351,12 @@ if response.status_code == 200:
             and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
         ]
 
-        if add_fit_system_record_id and target_name is not None:
-            reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
+        if add_fit_system_record_id:
             df = self.__add_fit_system_record_id(
                 df,
                 search_keys,
                 SYSTEM_RECORD_ID,
-                reversed_columns_renaming.get(target_name, target_name),
+                TARGET,
                 columns_renaming,
                 silent=True,
             )
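
Net effect for callers: `transform` can now be given the target, and the new `missing_target_for_transform` message (added below) is raised when the fitted search produced target-dependent features but no `y` is supplied. A hedged usage sketch (enricher construction and fit omitted; `X_prod`/`y_prod` are illustrative):

```python
# enricher is a fitted FeaturesEnricher; y is the new optional second argument.
enriched = enricher.transform(X_prod, y_prod)

# Without y, a search whose features were computed on the target raises a
# ValidationError carrying the missing_target_for_transform message.
enriched_no_y = enricher.transform(X_prod)
```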
upgini/resource_bundle/strings.properties CHANGED
@@ -136,6 +136,7 @@ x_and_eval_x_diff_types=X and eval_set X has different types: {} and {}
 baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
 baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and and retry
 missing_features_for_transform=Missing some features for transform that were presented on fit: {}
+missing_target_for_transform=Search contains features on target. Please add y to the call and try again
 missing_id_column=Id column {} not found in X
 # target validation
 empty_target=Target is empty in all rows
upgini/search_task.py CHANGED
@@ -168,7 +168,13 @@ class SearchTask:
         for meta in self.provider_metadata_v2:
             if meta.features_used_for_embeddings is not None:
                 features_for_transform.update(meta.features_used_for_embeddings)
-
+            if meta.generated_features:
+                features_for_transform.update(
+                    c.original_name
+                    for f in meta.generated_features
+                    for c in f.base_columns
+                    if c.ads_definition_id is None
+                )
         return list(features_for_transform)
 
     def get_shuffle_kfold(self) -> Optional[bool]:
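
The new block widens `get_features_for_transform` to include the original client columns that generated features were built from, skipping columns sourced from provider ADS. A toy illustration of the same comprehension with stand-in metadata objects (all names here are hypothetical mirrors of the diff):

```python
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class BaseColumn:  # stand-in for the provider metadata type
    original_name: str
    ads_definition_id: Optional[str] = None


@dataclass
class GeneratedFeature:
    base_columns: List[BaseColumn] = field(default_factory=list)


generated_features = [
    GeneratedFeature([BaseColumn("age"), BaseColumn("income", ads_definition_id="ads-1")]),
    GeneratedFeature([BaseColumn("signup_date")]),
]

features_for_transform = set()
features_for_transform.update(
    c.original_name
    for f in generated_features
    for c in f.base_columns
    if c.ads_definition_id is None  # keep only client-side columns
)
print(features_for_transform)  # {'age', 'signup_date'}
```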
upgini/utils/mstats.py CHANGED
@@ -118,7 +118,7 @@ def spearmanr(
     # - dof: degrees of freedom
     # - t_stat: t-statistic
     # - alternative: 'two-sided', 'greater', 'less'
-    def compute_t_pvalue(t_stat, dof, alternative='two-sided'):
+    def compute_t_pvalue(t_stat, dof, alternative="two-sided"):
         from scipy.stats import t
 
         if alternative == "two-sided":
upgini-1.2.67.dist-info/METADATA → upgini-1.2.68a3818.dev1.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.67
+Version: 1.2.68a3818.dev1
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
upgini-1.2.67.dist-info/RECORD → upgini-1.2.68a3818.dev1.dist-info/RECORD RENAMED
@@ -1,14 +1,14 @@
-upgini/__about__.py,sha256=x83kJMBbU7xkJWmY0kKk3DvyxpVE77jHCISbZ98r0HU,23
+upgini/__about__.py,sha256=B8ku0HzP4G2N6EyFXdX43ZRi57azPbbOINogoH1dGG4,33
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=OGjpeFHbj3lWiZTOHTpWEoMMDmFY1FlNC44FKktoZvU,34956
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=YXG5uUBN1Qo-3X5EUV4Y--Pyqbvg4Gta3WIoWQMTYkU,205359
+upgini/features_enricher.py,sha256=KBTdADF7_Wj3uDROYdevukOk6R8LVQw47gJkH4M1_iQ,204435
 upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
 upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
 upgini/metadata.py,sha256=Jh6YTaS00m_nbaOY_owvlSyn9zgkErkqu8iTr9ZjKI8,12279
 upgini/metrics.py,sha256=t7uOOnlDYvP6E3DLjPMQcFBjyhJfUQY8aUlx7N0Mh-s,35477
-upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
+upgini/search_task.py,sha256=EuCGp0iCWz2fpuJgN6M47aP_CtIi3Oq9zw78w0mkKiU,17595
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
 upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
@@ -16,18 +16,19 @@ upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo
 upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/autofe/all_operators.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
 upgini/autofe/binary.py,sha256=MnQuFiERpocjCPQUjOljlsq5FE-04GPfwtNjzvfNMyU,7671
-upgini/autofe/date.py,sha256=I07psJerrxOcHao91PdSCk9X6KWu61IBVyFRLjGNgK8,10730
-upgini/autofe/feature.py,sha256=xgu6bVIlUJ5PCUgoXQRNcGkcMOhj-_BdDRmkB_qRFS4,14766
+upgini/autofe/date.py,sha256=C86F7sPiscUGq2a45UtQA9ADWBWg0kt54mePHHzjbLE,10633
+upgini/autofe/feature.py,sha256=y1x3wijhTVBmloayQAHiscqKU9Ll8kLcGm1PdvS357I,14910
 upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
 upgini/autofe/operator.py,sha256=EOffJw6vKXpEh5yymqb1RFNJPxGxmnHdFRo9dB5SCFo,4969
 upgini/autofe/unary.py,sha256=yVgPvtfnPSOhrii0YgezddmgWPwyOBCR0JutaIkdTTc,4658
+upgini/autofe/utils.py,sha256=fK1am2_tQj3fL2vDslblye8lmyfWgGIUOX1beYVBz4k,2420
 upgini/autofe/vector.py,sha256=l0KdKg-txlZxDSE4hPPfCtfGQofYbl7oaABPr830sPI,667
 upgini/autofe/timeseries/__init__.py,sha256=PGwwDAMwvkXl3el12tXVEmZUgDUvlmIPlXtROm6bD18,738
 upgini/autofe/timeseries/base.py,sha256=T9Ec8LKJbiwTUGGsd_xhM0U0NUJblqmKchkzUI1sK88,3755
-upgini/autofe/timeseries/cross.py,sha256=Sh5hAXZFWKaFRqf_JGODu9pWO2tmuV5VKyK9eX3i7-I,4931
+upgini/autofe/timeseries/cross.py,sha256=qdoMGKg0auoYKwu4Vz8V3XDs_6-5j9sE4gcwfAR41Ws,5231
 upgini/autofe/timeseries/delta.py,sha256=h0YhmI1TlPJnjwFpN_GQxLb6r59DQuucnG5tQAXSgjU,3520
 upgini/autofe/timeseries/lag.py,sha256=LfQtg484vuqM0mgY4Wft1swHX_Srq7OKKgZswCXoiXI,1882
-upgini/autofe/timeseries/roll.py,sha256=bNFMDszSYTWvB7EyhHbRY1DJqzSURvHlPAcBebt0y0Y,2878
+upgini/autofe/timeseries/roll.py,sha256=zADKXU-eYWQnQ5R3am1yEal8uU6Tm0jLAixwPb_aCHg,2794
 upgini/autofe/timeseries/trend.py,sha256=9p2Q5ByAi6cx9RH9teBTe8FyjSzqthznC2Lo5dsJ0ho,2051
 upgini/autofe/timeseries/volatility.py,sha256=9shUmIKjpWTHVYjj80YBsk0XheBJ9uBuLv5NW9Mchnk,7953
 upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -38,7 +39,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
 upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
 upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
 upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
-upgini/resource_bundle/strings.properties,sha256=3zctRNQDJ1STTvLUfryBT72wYeHYnrllV4rG1C3HtfI,27542
+upgini/resource_bundle/strings.properties,sha256=LDT-jtYlrD1IXvWjFSf-dtvapje0qSrqI9W3v7y2zVo,27646
 upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
 upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -60,7 +61,7 @@ upgini/utils/feature_info.py,sha256=m1tQcT3hTChPAiXzpk0WQcEqElj8KgeCifEJFa7-gss,
 upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
 upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
 upgini/utils/ip_utils.py,sha256=TSQ_qDsLlVnm09X1HacpabEf_HNqSWpxBF4Sdc2xs08,6580
-upgini/utils/mstats.py,sha256=dLJQr5Ak5BAoV-pDPpnfvMURZVkZ3_v250QzAsSlqY4,6286
+upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
 upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
 upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
 upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
@@ -70,7 +71,7 @@ upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.67.dist-info/METADATA,sha256=iubpRRFD4zoIH2UvaQKDU_LKtBI4GCNEoaSSAf6MeBk,49113
-upgini-1.2.67.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-upgini-1.2.67.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.67.dist-info/RECORD,,
+upgini-1.2.68a3818.dev1.dist-info/METADATA,sha256=b70LVYxQjLh3v0j-pbeT-PWuf065TUhpgQxt_prM2Oo,49123
+upgini-1.2.68a3818.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
+upgini-1.2.68a3818.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.68a3818.dev1.dist-info/RECORD,,
upgini-1.2.67.dist-info/WHEEL → upgini-1.2.68a3818.dev1.dist-info/WHEEL RENAMED
@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: hatchling 1.25.0
+Generator: hatchling 1.24.2
 Root-Is-Purelib: true
 Tag: py3-none-any