upgini-1.2.67-py3-none-any.whl → upgini-1.2.68a3818.dev1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of upgini might be problematic.

upgini/__about__.py CHANGED
@@ -1 +1 @@
-__version__ = "1.2.67"
+__version__ = "1.2.68a3818.dev1"
upgini/autofe/date.py CHANGED
@@ -8,6 +8,7 @@ from pandas.core.arrays.timedeltas import TimedeltaArray
 from pydantic import BaseModel, __version__ as pydantic_version
 
 from upgini.autofe.operator import PandasOperator, ParametrizedOperator
+from upgini.autofe.utils import pydantic_validator
 
 
 def get_pydantic_version():
@@ -209,6 +210,14 @@ class DateListDiffBounded(DateListDiff, ParametrizedOperator):
 
         return cls(diff_unit=diff_unit, lower_bound=lower_bound, upper_bound=upper_bound, aggregation=aggregation)
 
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = super().get_params()
+        if self.lower_bound is not None:
+            res["lower_bound"] = str(self.lower_bound)
+        if self.upper_bound is not None:
+            res["upper_bound"] = str(self.upper_bound)
+        return res
+
     def _agg(self, x):
        x = x[
            (x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
@@ -269,32 +278,17 @@ class DatePercentile(DatePercentileBase):
             {
                 "zero_month": self.zero_month,
                 "zero_year": self.zero_year,
-                "zero_bounds": self.zero_bounds,
+                "zero_bounds": json.dumps(self.zero_bounds),
                 "step": self.step,
             }
         )
         return res
 
-    # Check Pydantic version
-    if get_pydantic_version() >= 2:
-        # Use @field_validator for Pydantic 2.x
-        from pydantic import field_validator
-
-        @field_validator("zero_bounds", mode="before")
-        def parse_zero_bounds(cls, value):
-            if isinstance(value, str):
-                return json.loads(value)
-            return value
-
-    else:
-        # Use @validator for Pydantic 1.x
-        from pydantic import validator
-
-        @validator("zero_bounds", pre=True)
-        def parse_zero_bounds(cls, value):
-            if isinstance(value, str):
-                return json.loads(value)
-            return value
+    @pydantic_validator("zero_bounds", mode="before")
+    def parse_zero_bounds(cls, value):
+        if isinstance(value, str):
+            return json.loads(value)
+        return value
 
     def _get_bounds(self, date_col: pd.Series) -> pd.Series:
         months = date_col.dt.month
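
The `zero_bounds` change above means `get_params` now emits a JSON string while `parse_zero_bounds` accepts either a string or a list, so serialized operator params can be fed straight back into the model. A minimal sketch of that contract on a toy model (`ZeroBoundsDemo` is hypothetical; the real `DatePercentile` has more fields):

```python
import json

from pydantic import BaseModel

from upgini.autofe.utils import pydantic_validator


class ZeroBoundsDemo(BaseModel):
    # Stand-in for DatePercentile.zero_bounds (demo model, not the real class).
    zero_bounds: list = []

    @pydantic_validator("zero_bounds", mode="before")
    def parse_zero_bounds(cls, value):
        # Accept the JSON string produced by get_params() as well as a plain list.
        if isinstance(value, str):
            return json.loads(value)
        return value


serialized = json.dumps([0.1, 0.5, 0.9])           # what get_params() now emits
restored = ZeroBoundsDemo(zero_bounds=serialized)  # validator parses it back
assert restored.zero_bounds == [0.1, 0.5, 0.9]
```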
upgini/autofe/feature.py CHANGED
@@ -112,7 +112,11 @@ class Feature:
 
     def get_hash(self) -> str:
         return hashlib.sha256(
-            "_".join([self.op.get_hash_component()] + [ch.get_display_name() for ch in self.children]).encode("utf-8")
+            "_".join(
+                [self.op.get_hash_component()]
+                + [ch.op.get_hash_component() for ch in self.children if isinstance(ch, Feature)]
+                + [ch.get_display_name() for ch in self.children]
+            ).encode("utf-8")
         ).hexdigest()[:8]
 
     def set_alias(self, alias: str) -> "Feature":
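
With child operator hash components mixed in, two features that share child display names but differ in child operators now hash differently. A toy illustration of the same recipe (hashlib only; the part strings are hypothetical):

```python
import hashlib


def short_hash(parts):
    # Same construction as Feature.get_hash: join with "_", sha256, first 8 hex chars.
    return hashlib.sha256("_".join(parts).encode("utf-8")).hexdigest()[:8]


# Before: only the parent op component and child display names participated.
old = short_hash(["norm", "f1", "f2"])
# After: child operator hash components are included as well.
new = short_hash(["norm", "abs", "log", "f1", "f2"])
print(old, new)  # two different 8-char digests
```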
upgini/autofe/timeseries/cross.py CHANGED
@@ -1,16 +1,13 @@
+import json
 from typing import Dict, List, Optional
 
 import numpy as np
 import pandas as pd
 
-try:
-    from pydantic import field_validator as validator  # V2
-except ImportError:
-    from pydantic import validator  # V1
-
 from upgini.autofe.all_operators import find_op
 from upgini.autofe.operator import PandasOperator, ParametrizedOperator
 from upgini.autofe.timeseries.base import TimeSeriesBase
+from upgini.autofe.utils import pydantic_validator
 
 
 class CrossSeriesInteraction(TimeSeriesBase, ParametrizedOperator):
@@ -20,13 +17,24 @@ class CrossSeriesInteraction(TimeSeriesBase, ParametrizedOperator):
     left_descriptor: List[str] = []
     right_descriptor: List[str] = []
 
-    @validator("descriptor_indices")
-    @classmethod
+    @pydantic_validator("descriptor_indices")
     def validate_descriptor_indices(cls, v):
         if not v:
             raise ValueError("descriptor_indices cannot be empty for CrossSeriesInteraction")
         return v
 
+    @pydantic_validator("left_descriptor", "right_descriptor", mode="before")
+    def parse_descriptors(cls, v):
+        if isinstance(v, str):
+            return json.loads(v)
+        return v
+
+    @pydantic_validator("interaction_op", mode="before")
+    def validate_interaction_op(cls, v):
+        if isinstance(v, str):
+            return find_op(v)
+        return v
+
     def __init__(self, **data):
         super().__init__(**data)
         indices = self.descriptor_indices
upgini/autofe/timeseries/roll.py CHANGED
@@ -3,6 +3,7 @@ from typing import Dict, Optional
 
 from upgini.autofe.operator import ParametrizedOperator
 from upgini.autofe.timeseries.base import TimeSeriesBase
+from upgini.autofe.utils import pydantic_validator
 
 # Roll aggregation functions
 roll_aggregations = {
@@ -12,19 +13,13 @@ roll_aggregations = {
     "iqr": lambda x: x.quantile(0.75) - x.quantile(0.25),
 }
 
-try:
-    from pydantic import field_validator as validator  # V2
-except ImportError:
-    from pydantic import validator  # V1
-
 
 class Roll(TimeSeriesBase, ParametrizedOperator):
     aggregation: str
     window_size: int = 1
     window_unit: str = "D"
 
-    @validator("window_unit")
-    @classmethod
+    @pydantic_validator("window_unit")
     def validate_window_unit(cls, v: str) -> str:
         try:
             pd.tseries.frequencies.to_offset(v)
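
For reference, the `window_unit` validator simply defers to pandas offset parsing, so anything `to_offset` understands is accepted. A quick illustration of that standard pandas behavior:

```python
import pandas as pd

pd.tseries.frequencies.to_offset("D")   # <Day> - accepted as a window unit
pd.tseries.frequencies.to_offset("7D")  # multiples parse too
try:
    pd.tseries.frequencies.to_offset("not-a-unit")
except ValueError:
    print("rejected - the Roll validator would raise here")
```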
upgini/autofe/utils.py ADDED
@@ -0,0 +1,83 @@
+"""
+Utility functions for autofe module.
+"""
+
+import functools
+from typing import Callable
+
+
+def get_pydantic_version():
+    """
+    Get the major version of pydantic.
+
+    Returns:
+        int: Major version number (1 or 2)
+    """
+    try:
+        from pydantic import __version__ as pydantic_version
+
+        major_version = int(pydantic_version.split(".")[0])
+        return major_version
+    except (ImportError, ValueError):
+        # Default to version 1 if unable to determine
+        return 1
+
+
+def pydantic_validator(field_name: str, *fields, mode: str = "before", **kwargs):
+    """
+    A decorator that applies the appropriate Pydantic validator based on the installed version.
+
+    This decorator handles the differences between Pydantic v1 and v2 validator syntax,
+    making it easier to write code that works with both versions.
+
+    Args:
+        field_name (str): The name of the field to validate
+        mode (str): The validation mode, either "before" or "after" (for Pydantic v2)
+        **kwargs: Additional arguments to pass to the validator
+
+    Returns:
+        Callable: A decorator that can be applied to validator methods
+
+    Example:
+        ```python
+        class MyModel(BaseModel):
+            items: List[int]
+
+            @pydantic_validator("items")
+            def parse_items(cls, value):
+                if isinstance(value, str):
+                    return [int(x) for x in value.split(",")]
+                return value
+        ```
+    """
+    pydantic_version = get_pydantic_version()
+
+    if pydantic_version >= 2:
+        # Use field_validator for Pydantic 2.x
+        from pydantic import field_validator
+
+        def decorator(func: Callable) -> Callable:
+            @field_validator(field_name, *fields, mode=mode, **kwargs)
+            @functools.wraps(func)
+            def wrapper(cls, value, **kw):
+                return func(cls, value)
+
+            return wrapper
+
+        return decorator
+    else:
+        # Use validator for Pydantic 1.x
+        from pydantic import validator
+
+        # Map mode to Pydantic v1 parameters
+        pre = True if mode == "before" else False
+
+        def decorator(func: Callable) -> Callable:
+            @validator(field_name, *fields, pre=pre, **kwargs)
+            @functools.wraps(func)
+            def wrapper(cls, value, **kw):
+                return func(cls, value)
+
+            return wrapper
+
+        return decorator
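
Since the point of the new helper is version portability, one decorated method covers both installs: under Pydantic 2.x it expands to `field_validator(..., mode=...)`, under 1.x to `validator(..., pre=...)`. A small self-contained check, mirroring the docstring example above (`Window` is an illustrative model):

```python
from typing import List

from pydantic import BaseModel

from upgini.autofe.utils import pydantic_validator


class Window(BaseModel):
    sizes: List[int] = []

    # One definition: becomes field_validator(mode="before") on Pydantic 2.x
    # and validator(pre=True) on Pydantic 1.x.
    @pydantic_validator("sizes", mode="before")
    def parse_sizes(cls, value):
        if isinstance(value, str):
            return [int(x) for x in value.split(",")]
        return value


print(Window(sizes="7,14,28").sizes)  # [7, 14, 28] under either major version
```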
upgini/features_enricher.py CHANGED
@@ -702,6 +702,7 @@ class FeaturesEnricher(TransformerMixin):
     def transform(
         self,
         X: pd.DataFrame,
+        y: Optional[pd.Series] = None,
         *args,
         exclude_features_sources: Optional[List[str]] = None,
         keep_input: bool = True,
@@ -766,6 +767,7 @@ class FeaturesEnricher(TransformerMixin):
         result, _, _ = self.__inner_transform(
             trace_id,
             X,
+            y=y,
             exclude_features_sources=exclude_features_sources,
             importance_threshold=importance_threshold,
             max_features=max_features,
@@ -1682,7 +1684,6 @@ class FeaturesEnricher(TransformerMixin):
             validated_X,
             validated_y,
             eval_set,
-            is_demo_dataset,
             exclude_features_sources,
             trace_id,
             progress_bar,
@@ -1873,158 +1874,147 @@ class FeaturesEnricher(TransformerMixin):
         validated_X: pd.DataFrame,
         validated_y: pd.Series,
         eval_set: Optional[List[tuple]],
-        is_demo_dataset: bool,
         exclude_features_sources: Optional[List[str]],
         trace_id: str,
         progress_bar: Optional[ProgressBar],
         progress_callback: Optional[Callable[[SearchProgress], Any]],
     ) -> _SampledDataForMetrics:
-        eval_set_sampled_dict = {}
-        if eval_set is not None:
-            self.logger.info("Transform with eval_set")
-            # concatenate X and eval_set with eval_set_index
-            df = validated_X.copy()
-            df[TARGET] = validated_y
-            df[EVAL_SET_INDEX] = 0
-            for idx, eval_pair in enumerate(eval_set):
-                eval_x, eval_y = self._validate_eval_set_pair(validated_X, eval_pair)
-                eval_df_with_index = eval_x.copy()
-                eval_df_with_index[TARGET] = eval_y
-                eval_df_with_index[EVAL_SET_INDEX] = idx + 1
-                df = pd.concat([df, eval_df_with_index])
-
-            df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
-
-            # downsample if need to eval_set threshold
-            num_samples = _num_samples(df)
-            force_downsampling = (
-                not self.disable_force_downsampling
-                and self.columns_for_online_api is not None
-                and num_samples > Dataset.FORCE_SAMPLE_SIZE
-            )
-            # TODO: check that system_record_id was added before this step
-            if force_downsampling:
-                self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
-                df = balance_undersample_forced(
-                    df=df,
-                    target_column=TARGET,
-                    id_columns=self.id_columns,
-                    date_column=self._get_date_column(self.search_keys),
-                    task_type=self.model_task_type,
-                    cv_type=self.cv,
-                    random_state=self.random_state,
-                    sample_size=Dataset.FORCE_SAMPLE_SIZE,
-                    logger=self.logger,
-                    bundle=self.bundle,
-                    warning_callback=self.__log_warning,
-                )
-            elif num_samples > Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD:
-                self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS}")
-                df = df.sample(n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state)
+        has_eval_set = eval_set is not None
 
-            eval_set_sampled_dict = {}
+        self.logger.info(f"Transform {'with' if has_eval_set else 'without'} eval_set")
 
-            tmp_target_name = "__target"
-            df = df.rename(columns={TARGET: tmp_target_name})
+        # Prepare
+        df = self.__combine_train_and_eval_sets(validated_X, validated_y, eval_set)
+        df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
+        df = self.__downsample_for_metrics(df)
 
-            enriched_df, columns_renaming, generated_features = self.__inner_transform(
-                trace_id,
-                df,
-                exclude_features_sources=exclude_features_sources,
-                silent_mode=True,
-                metrics_calculation=True,
-                progress_bar=progress_bar,
-                progress_callback=progress_callback,
-                add_fit_system_record_id=True,
-                target_name=tmp_target_name,
-            )
-            if enriched_df is None:
-                return None
+        # Transform
 
-            enriched_df = enriched_df.rename(columns={tmp_target_name: TARGET})
+        enriched_df, _, _ = self.__inner_transform(
+            trace_id,
+            X=df.drop(columns=[TARGET]),
+            y=df[TARGET],
+            exclude_features_sources=exclude_features_sources,
+            silent_mode=True,
+            metrics_calculation=True,
+            progress_bar=progress_bar,
+            progress_callback=progress_callback,
+            add_fit_system_record_id=True,
+        )
+        if enriched_df is None:
+            return None
 
-            x_columns = [
-                c
-                for c in (validated_X.columns.tolist() + generated_features + [SYSTEM_RECORD_ID])
-                if c in enriched_df.columns
-            ]
+        x_columns = [
+            c
+            for c in (validated_X.columns.tolist() + self.fit_generated_features + [SYSTEM_RECORD_ID])
+            if c in enriched_df.columns
+        ]
 
-            enriched_Xy = enriched_df.query(f"{EVAL_SET_INDEX} == 0")
-            X_sampled = enriched_Xy[x_columns].copy()
-            y_sampled = enriched_Xy[TARGET].copy()
-            enriched_X = enriched_Xy.drop(columns=[TARGET, EVAL_SET_INDEX])
-            enriched_X_columns = enriched_X.columns.tolist()
+        X_sampled, y_sampled, enriched_X = self.__extract_train_data(enriched_df, x_columns)
+        eval_set_sampled_dict = self.__extract_eval_data(
+            enriched_df, x_columns, enriched_X.columns.tolist(), len(eval_set) if has_eval_set else 0
+        )
 
-            for idx in range(len(eval_set)):
-                enriched_eval_xy = enriched_df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
-                eval_x_sampled = enriched_eval_xy[x_columns].copy()
-                eval_y_sampled = enriched_eval_xy[TARGET].copy()
-                enriched_eval_x = enriched_eval_xy[enriched_X_columns].copy()
-                eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
-        else:
-            self.logger.info("Transform without eval_set")
-            df = validated_X.copy()
+        # Cache and return results
+        return self.__cache_and_return_results(
+            validated_X, validated_y, eval_set, X_sampled, y_sampled, enriched_X, eval_set_sampled_dict
+        )
 
-            df[TARGET] = validated_y
+    def __combine_train_and_eval_sets(
+        self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]]
+    ) -> pd.DataFrame:
+        df = validated_X.copy()
+        df[TARGET] = validated_y
+        if eval_set is None:
+            return df
 
-            df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
+        df[EVAL_SET_INDEX] = 0
 
-            num_samples = _num_samples(df)
-            force_downsampling = (
-                not self.disable_force_downsampling
-                and self.columns_for_online_api is not None
-                and num_samples > Dataset.FORCE_SAMPLE_SIZE
-            )
+        for idx, eval_pair in enumerate(eval_set):
+            eval_x, eval_y = self._validate_eval_set_pair(validated_X, eval_pair)
+            eval_df_with_index = eval_x.copy()
+            eval_df_with_index[TARGET] = eval_y
+            eval_df_with_index[EVAL_SET_INDEX] = idx + 1
+            df = pd.concat([df, eval_df_with_index])
 
-            if force_downsampling:
-                self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
-                df = balance_undersample_forced(
-                    df=df,
-                    target_column=TARGET,
-                    id_columns=self.id_columns,
-                    date_column=self._get_date_column(self.search_keys),
-                    task_type=self.model_task_type,
-                    cv_type=self.cv,
-                    random_state=self.random_state,
-                    sample_size=Dataset.FORCE_SAMPLE_SIZE,
-                    logger=self.logger,
-                    bundle=self.bundle,
-                    warning_callback=self.__log_warning,
-                )
-            elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
-                self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
-                df = df.sample(n=Dataset.FIT_SAMPLE_ROWS, random_state=self.random_state)
+        return df
 
-            tmp_target_name = "__target"
-            df = df.rename(columns={TARGET: tmp_target_name})
+    def __downsample_for_metrics(self, df: pd.DataFrame) -> pd.DataFrame:
+        num_samples = _num_samples(df)
+        force_downsampling = (
+            not self.disable_force_downsampling
+            and self.columns_for_online_api is not None
+            and num_samples > Dataset.FORCE_SAMPLE_SIZE
+        )
 
-            enriched_Xy, columns_renaming, generated_features = self.__inner_transform(
-                trace_id,
-                df,
-                exclude_features_sources=exclude_features_sources,
-                silent_mode=True,
-                metrics_calculation=True,
-                progress_bar=progress_bar,
-                progress_callback=progress_callback,
-                add_fit_system_record_id=True,
-                target_name=tmp_target_name,
+        if force_downsampling:
+            self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
+            return balance_undersample_forced(
+                df=df,
+                target_column=TARGET,
+                id_columns=self.id_columns,
+                date_column=self._get_date_column(self.search_keys),
+                task_type=self.model_task_type,
+                cv_type=self.cv,
+                random_state=self.random_state,
+                sample_size=Dataset.FORCE_SAMPLE_SIZE,
+                logger=self.logger,
+                bundle=self.bundle,
+                warning_callback=self.__log_warning,
             )
-            if enriched_Xy is None:
-                return None
+        elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
+            if EVAL_SET_INDEX in df.columns:
+                threshold = Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD
+                sample_size = Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS
+            else:
+                threshold = Dataset.FIT_SAMPLE_THRESHOLD
+                sample_size = Dataset.FIT_SAMPLE_ROWS
 
-            enriched_Xy = enriched_Xy.rename(columns={tmp_target_name: TARGET})
+            if num_samples > threshold:
+                self.logger.info(f"Downsampling from {num_samples} to {sample_size}")
+                return df.sample(n=sample_size, random_state=self.random_state)
 
-            x_columns = [
-                c
-                for c in (validated_X.columns.tolist() + generated_features + [SYSTEM_RECORD_ID])
-                if c in enriched_Xy.columns
-            ]
+        return df
+
+    def __extract_train_data(
+        self, enriched_df: pd.DataFrame, x_columns: List[str]
+    ) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
+        if EVAL_SET_INDEX in enriched_df.columns:
+            enriched_Xy = enriched_df.query(f"{EVAL_SET_INDEX} == 0")
+        else:
+            enriched_Xy = enriched_df
+        X_sampled = enriched_Xy[x_columns].copy()
+        y_sampled = enriched_Xy[TARGET].copy()
+        enriched_X = enriched_Xy.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
+        return X_sampled, y_sampled, enriched_X
+
+    def __extract_eval_data(
+        self, enriched_df: pd.DataFrame, x_columns: List[str], enriched_X_columns: List[str], eval_set_len: int
+    ) -> Dict[int, Tuple]:
+        eval_set_sampled_dict = {}
 
-            X_sampled = enriched_Xy[x_columns].copy()
-            y_sampled = enriched_Xy[TARGET].copy()
-            enriched_X = enriched_Xy.drop(columns=TARGET)
+        for idx in range(eval_set_len):
+            enriched_eval_xy = enriched_df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
+            eval_x_sampled = enriched_eval_xy[x_columns].copy()
+            eval_y_sampled = enriched_eval_xy[TARGET].copy()
+            enriched_eval_x = enriched_eval_xy[enriched_X_columns].copy()
+            eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
 
+        return eval_set_sampled_dict
+
+    def __cache_and_return_results(
+        self,
+        validated_X: pd.DataFrame,
+        validated_y: pd.Series,
+        eval_set: Optional[List[tuple]],
+        X_sampled: pd.DataFrame,
+        y_sampled: pd.Series,
+        enriched_X: pd.DataFrame,
+        eval_set_sampled_dict: Dict[int, Tuple],
+    ) -> _SampledDataForMetrics:
         datasets_hash = hash_input(validated_X, validated_y, eval_set)
+        columns_renaming = getattr(self, "fit_columns_renaming", {})
+
         self.__cached_sampled_datasets[datasets_hash] = (
             X_sampled,
             y_sampled,
@@ -2161,6 +2151,7 @@ if response.status_code == 200:
         trace_id: str,
         X: pd.DataFrame,
         *,
+        y: Optional[pd.Series] = None,
         exclude_features_sources: Optional[List[str]] = None,
         importance_threshold: Optional[float] = None,
         max_features: Optional[int] = None,
@@ -2179,8 +2170,14 @@ if response.status_code == 200:
         self.logger.info("Start transform")
 
         validated_X = self._validate_X(X, is_transform=True)
+        if y is not None:
+            validated_y = self._validate_y(validated_X, y)
+            df = self.__combine_train_and_eval_sets(validated_X, validated_y, eval_set=None)
+        else:
+            validated_y = None
+            df = validated_X
 
-        self.__log_debug_information(validated_X, exclude_features_sources=exclude_features_sources)
+        self.__log_debug_information(validated_X, validated_y, exclude_features_sources=exclude_features_sources)
 
         self.__validate_search_keys(self.search_keys, self.search_id)
 
@@ -2223,29 +2220,27 @@ if response.status_code == 200:
             self.logger.info(msg)
             print(msg)
 
-        is_demo_dataset = hash_input(validated_X) in DEMO_DATASET_HASHES
+        is_demo_dataset = hash_input(df) in DEMO_DATASET_HASHES
 
         columns_to_drop = [
-            c for c in validated_X.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
+            c for c in df.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
        ]
         if len(columns_to_drop) > 0:
             msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
             self.logger.warning(msg)
             print(msg)
-            validated_X = validated_X.drop(columns=columns_to_drop)
+            df = df.drop(columns=columns_to_drop)
 
         search_keys = self.search_keys.copy()
         if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
-            self.search_keys.update(
+            search_keys.update(
                 {col: SearchKey.CUSTOM_KEY for col in self.id_columns if col not in self.search_keys}
             )
 
         search_keys = self.__prepare_search_keys(
-            validated_X, search_keys, is_demo_dataset, is_transform=True, silent_mode=silent_mode
+            df, search_keys, is_demo_dataset, is_transform=True, silent_mode=silent_mode
         )
 
-        df = validated_X.copy()
-
         df = self.__handle_index_search_keys(df, search_keys)
 
         if DEFAULT_INDEX in df.columns:
@@ -2284,8 +2279,11 @@ if response.status_code == 200:
         features_for_transform = self._search_task.get_features_for_transform() or []
         if len(features_for_transform) > 0:
             missing_features_for_transform = [
-                columns_renaming.get(f) for f in features_for_transform if f not in df.columns
+                columns_renaming.get(f) or f for f in features_for_transform if f not in df.columns
             ]
+            if TARGET in missing_features_for_transform:
+                raise ValidationError(self.bundle.get("missing_target_for_transform"))
+
             if len(missing_features_for_transform) > 0:
                 raise ValidationError(
                     self.bundle.get("missing_features_for_transform").format(missing_features_for_transform)
@@ -2341,11 +2339,10 @@ if response.status_code == 200:
             converter = PostalCodeSearchKeyConverter(postal_code)
             df = converter.convert(df)
 
-        # generated_features = [f for f in generated_features if f in self.fit_generated_features]
+        meaning_types = {}
+        meaning_types.update({col: FileColumnMeaningType.FEATURE for col in features_for_transform})
+        meaning_types.update({col: key.value for col, key in search_keys.items()})
 
-        meaning_types = {col: key.value for col, key in search_keys.items()}
-        for col in features_for_transform:
-            meaning_types[col] = FileColumnMeaningType.FEATURE
         features_not_to_pass = [
             c
             for c in df.columns
@@ -2354,13 +2351,12 @@ if response.status_code == 200:
             and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
         ]
 
-        if add_fit_system_record_id and target_name is not None:
-            reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
+        if add_fit_system_record_id:
             df = self.__add_fit_system_record_id(
                 df,
                 search_keys,
                 SYSTEM_RECORD_ID,
-                reversed_columns_renaming.get(target_name, target_name),
+                TARGET,
                 columns_renaming,
                 silent=True,
             )
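
Net effect for callers: `transform` can now be given the target, and the new `missing_target_for_transform` message (added below) is raised when the fitted search produced target-dependent features but no `y` is supplied. A hedged usage sketch (enricher construction and fit omitted; `X_prod`/`y_prod` are illustrative):

```python
# enricher is a fitted FeaturesEnricher; y is the new optional second argument.
enriched = enricher.transform(X_prod, y_prod)

# Without y, a search whose features were computed on the target raises a
# ValidationError carrying the missing_target_for_transform message.
enriched_no_y = enricher.transform(X_prod)
```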
upgini/resource_bundle/strings.properties CHANGED
@@ -136,6 +136,7 @@ x_and_eval_x_diff_types=X and eval_set X has different types: {} and {}
 baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
 baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and and retry
 missing_features_for_transform=Missing some features for transform that were presented on fit: {}
+missing_target_for_transform=Search contains features on target. Please add y to the call and try again
 missing_id_column=Id column {} not found in X
 # target validation
 empty_target=Target is empty in all rows
upgini/search_task.py CHANGED
@@ -168,7 +168,13 @@ class SearchTask:
         for meta in self.provider_metadata_v2:
             if meta.features_used_for_embeddings is not None:
                 features_for_transform.update(meta.features_used_for_embeddings)
-
+            if meta.generated_features:
+                features_for_transform.update(
+                    c.original_name
+                    for f in meta.generated_features
+                    for c in f.base_columns
+                    if c.ads_definition_id is None
+                )
         return list(features_for_transform)
 
     def get_shuffle_kfold(self) -> Optional[bool]:
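
The new block widens `get_features_for_transform` to include the original client columns that generated features were built from, skipping columns sourced from provider ADS. A toy illustration of the same comprehension with stand-in metadata objects (all names here are hypothetical mirrors of the diff):

```python
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class BaseColumn:  # stand-in for the provider metadata type
    original_name: str
    ads_definition_id: Optional[str] = None


@dataclass
class GeneratedFeature:
    base_columns: List[BaseColumn] = field(default_factory=list)


generated_features = [
    GeneratedFeature([BaseColumn("age"), BaseColumn("income", ads_definition_id="ads-1")]),
    GeneratedFeature([BaseColumn("signup_date")]),
]

features_for_transform = set()
features_for_transform.update(
    c.original_name
    for f in generated_features
    for c in f.base_columns
    if c.ads_definition_id is None  # keep only client-side columns
)
print(features_for_transform)  # {'age', 'signup_date'}
```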
upgini/utils/mstats.py CHANGED
@@ -118,7 +118,7 @@ def spearmanr(
     # - dof: degrees of freedom
     # - t_stat: t-statistic
     # - alternative: 'two-sided', 'greater', 'less'
-    def compute_t_pvalue(t_stat, dof, alternative='two-sided'):
+    def compute_t_pvalue(t_stat, dof, alternative="two-sided"):
         from scipy.stats import t
 
         if alternative == "two-sided":
upgini-1.2.67.dist-info/METADATA → upgini-1.2.68a3818.dev1.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.67
+Version: 1.2.68a3818.dev1
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
upgini-1.2.67.dist-info/RECORD → upgini-1.2.68a3818.dev1.dist-info/RECORD RENAMED
@@ -1,14 +1,14 @@
-upgini/__about__.py,sha256=x83kJMBbU7xkJWmY0kKk3DvyxpVE77jHCISbZ98r0HU,23
+upgini/__about__.py,sha256=B8ku0HzP4G2N6EyFXdX43ZRi57azPbbOINogoH1dGG4,33
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=OGjpeFHbj3lWiZTOHTpWEoMMDmFY1FlNC44FKktoZvU,34956
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=YXG5uUBN1Qo-3X5EUV4Y--Pyqbvg4Gta3WIoWQMTYkU,205359
+upgini/features_enricher.py,sha256=KBTdADF7_Wj3uDROYdevukOk6R8LVQw47gJkH4M1_iQ,204435
 upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
 upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
 upgini/metadata.py,sha256=Jh6YTaS00m_nbaOY_owvlSyn9zgkErkqu8iTr9ZjKI8,12279
 upgini/metrics.py,sha256=t7uOOnlDYvP6E3DLjPMQcFBjyhJfUQY8aUlx7N0Mh-s,35477
-upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
+upgini/search_task.py,sha256=EuCGp0iCWz2fpuJgN6M47aP_CtIi3Oq9zw78w0mkKiU,17595
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
 upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
@@ -16,18 +16,19 @@ upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo
 upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/autofe/all_operators.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
 upgini/autofe/binary.py,sha256=MnQuFiERpocjCPQUjOljlsq5FE-04GPfwtNjzvfNMyU,7671
-upgini/autofe/date.py,sha256=I07psJerrxOcHao91PdSCk9X6KWu61IBVyFRLjGNgK8,10730
-upgini/autofe/feature.py,sha256=xgu6bVIlUJ5PCUgoXQRNcGkcMOhj-_BdDRmkB_qRFS4,14766
+upgini/autofe/date.py,sha256=C86F7sPiscUGq2a45UtQA9ADWBWg0kt54mePHHzjbLE,10633
+upgini/autofe/feature.py,sha256=y1x3wijhTVBmloayQAHiscqKU9Ll8kLcGm1PdvS357I,14910
 upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
 upgini/autofe/operator.py,sha256=EOffJw6vKXpEh5yymqb1RFNJPxGxmnHdFRo9dB5SCFo,4969
 upgini/autofe/unary.py,sha256=yVgPvtfnPSOhrii0YgezddmgWPwyOBCR0JutaIkdTTc,4658
+upgini/autofe/utils.py,sha256=fK1am2_tQj3fL2vDslblye8lmyfWgGIUOX1beYVBz4k,2420
 upgini/autofe/vector.py,sha256=l0KdKg-txlZxDSE4hPPfCtfGQofYbl7oaABPr830sPI,667
 upgini/autofe/timeseries/__init__.py,sha256=PGwwDAMwvkXl3el12tXVEmZUgDUvlmIPlXtROm6bD18,738
 upgini/autofe/timeseries/base.py,sha256=T9Ec8LKJbiwTUGGsd_xhM0U0NUJblqmKchkzUI1sK88,3755
-upgini/autofe/timeseries/cross.py,sha256=Sh5hAXZFWKaFRqf_JGODu9pWO2tmuV5VKyK9eX3i7-I,4931
+upgini/autofe/timeseries/cross.py,sha256=qdoMGKg0auoYKwu4Vz8V3XDs_6-5j9sE4gcwfAR41Ws,5231
 upgini/autofe/timeseries/delta.py,sha256=h0YhmI1TlPJnjwFpN_GQxLb6r59DQuucnG5tQAXSgjU,3520
 upgini/autofe/timeseries/lag.py,sha256=LfQtg484vuqM0mgY4Wft1swHX_Srq7OKKgZswCXoiXI,1882
-upgini/autofe/timeseries/roll.py,sha256=bNFMDszSYTWvB7EyhHbRY1DJqzSURvHlPAcBebt0y0Y,2878
+upgini/autofe/timeseries/roll.py,sha256=zADKXU-eYWQnQ5R3am1yEal8uU6Tm0jLAixwPb_aCHg,2794
 upgini/autofe/timeseries/trend.py,sha256=9p2Q5ByAi6cx9RH9teBTe8FyjSzqthznC2Lo5dsJ0ho,2051
 upgini/autofe/timeseries/volatility.py,sha256=9shUmIKjpWTHVYjj80YBsk0XheBJ9uBuLv5NW9Mchnk,7953
 upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -38,7 +39,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
 upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
 upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
 upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
-upgini/resource_bundle/strings.properties,sha256=3zctRNQDJ1STTvLUfryBT72wYeHYnrllV4rG1C3HtfI,27542
+upgini/resource_bundle/strings.properties,sha256=LDT-jtYlrD1IXvWjFSf-dtvapje0qSrqI9W3v7y2zVo,27646
 upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
 upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -60,7 +61,7 @@ upgini/utils/feature_info.py,sha256=m1tQcT3hTChPAiXzpk0WQcEqElj8KgeCifEJFa7-gss,
 upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
 upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
 upgini/utils/ip_utils.py,sha256=TSQ_qDsLlVnm09X1HacpabEf_HNqSWpxBF4Sdc2xs08,6580
-upgini/utils/mstats.py,sha256=dLJQr5Ak5BAoV-pDPpnfvMURZVkZ3_v250QzAsSlqY4,6286
+upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
 upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
 upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
 upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
@@ -70,7 +71,7 @@ upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.67.dist-info/METADATA,sha256=iubpRRFD4zoIH2UvaQKDU_LKtBI4GCNEoaSSAf6MeBk,49113
-upgini-1.2.67.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
-upgini-1.2.67.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.67.dist-info/RECORD,,
+upgini-1.2.68a3818.dev1.dist-info/METADATA,sha256=b70LVYxQjLh3v0j-pbeT-PWuf065TUhpgQxt_prM2Oo,49123
+upgini-1.2.68a3818.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
+upgini-1.2.68a3818.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.68a3818.dev1.dist-info/RECORD,,
upgini-1.2.67.dist-info/WHEEL → upgini-1.2.68a3818.dev1.dist-info/WHEEL RENAMED
@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: hatchling 1.25.0
+Generator: hatchling 1.24.2
 Root-Is-Purelib: true
 Tag: py3-none-any