upgini-1.2.68a3818.dev4-py3-none-any.whl → upgini-1.2.68a3832.dev2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic.

upgini/__about__.py CHANGED
@@ -1 +1 @@
-__version__ = "1.2.68a3818.dev4"
+__version__ = "1.2.68a3832.dev2"
upgini/autofe/date.py CHANGED
@@ -8,7 +8,6 @@ from pandas.core.arrays.timedeltas import TimedeltaArray
 from pydantic import BaseModel, __version__ as pydantic_version
 
 from upgini.autofe.operator import PandasOperator, ParametrizedOperator
-from upgini.autofe.utils import pydantic_validator
 
 
 def get_pydantic_version():
@@ -210,14 +209,6 @@ class DateListDiffBounded(DateListDiff, ParametrizedOperator):
 
         return cls(diff_unit=diff_unit, lower_bound=lower_bound, upper_bound=upper_bound, aggregation=aggregation)
 
-    def get_params(self) -> Dict[str, Optional[str]]:
-        res = super().get_params()
-        if self.lower_bound is not None:
-            res["lower_bound"] = str(self.lower_bound)
-        if self.upper_bound is not None:
-            res["upper_bound"] = str(self.upper_bound)
-        return res
-
     def _agg(self, x):
         x = x[
             (x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
@@ -278,17 +269,32 @@ class DatePercentile(DatePercentileBase):
             {
                 "zero_month": self.zero_month,
                 "zero_year": self.zero_year,
-                "zero_bounds": json.dumps(self.zero_bounds),
+                "zero_bounds": self.zero_bounds,
                 "step": self.step,
             }
         )
         return res
 
-    @pydantic_validator("zero_bounds", mode="before")
-    def parse_zero_bounds(cls, value):
-        if isinstance(value, str):
-            return json.loads(value)
-        return value
+    # Check Pydantic version
+    if get_pydantic_version() >= 2:
+        # Use @field_validator for Pydantic 2.x
+        from pydantic import field_validator
+
+        @field_validator("zero_bounds", mode="before")
+        def parse_zero_bounds(cls, value):
+            if isinstance(value, str):
+                return json.loads(value)
+            return value
+
+    else:
+        # Use @validator for Pydantic 1.x
+        from pydantic import validator
+
+        @validator("zero_bounds", pre=True)
+        def parse_zero_bounds(cls, value):
+            if isinstance(value, str):
+                return json.loads(value)
+            return value
 
     def _get_bounds(self, date_col: pd.Series) -> pd.Series:
         months = date_col.dt.month
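The replacement validator above picks the Pydantic 1.x or 2.x API at class-definition time instead of going through the deleted `pydantic_validator` helper. A minimal, self-contained sketch of that pattern (the `ZeroBoundsModel` below is illustrative, not upgini code):

```python
import json
from typing import List

from pydantic import BaseModel, __version__ as pydantic_version


def get_pydantic_version() -> int:
    # Major version only: 1 for Pydantic 1.x, 2 for Pydantic 2.x
    return int(pydantic_version.split(".")[0])


class ZeroBoundsModel(BaseModel):
    zero_bounds: List[float] = []

    if get_pydantic_version() >= 2:
        from pydantic import field_validator

        @field_validator("zero_bounds", mode="before")
        def parse_zero_bounds(cls, value):
            # Accept a JSON-encoded string as well as a plain list
            if isinstance(value, str):
                return json.loads(value)
            return value
    else:
        from pydantic import validator

        @validator("zero_bounds", pre=True)
        def parse_zero_bounds(cls, value):
            if isinstance(value, str):
                return json.loads(value)
            return value


print(ZeroBoundsModel(zero_bounds="[0.1, 0.5]").zero_bounds)  # [0.1, 0.5]
```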
upgini/autofe/feature.py CHANGED
@@ -112,11 +112,7 @@ class Feature:
 
     def get_hash(self) -> str:
         return hashlib.sha256(
-            "_".join(
-                [self.op.get_hash_component()]
-                + [ch.op.get_hash_component() for ch in self.children if isinstance(ch, Feature)]
-                + [ch.get_display_name() for ch in self.children]
-            ).encode("utf-8")
+            "_".join([self.op.get_hash_component()] + [ch.get_display_name() for ch in self.children]).encode("utf-8")
         ).hexdigest()[:8]
 
     def set_alias(self, alias: str) -> "Feature":
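The simplified `get_hash` now derives the 8-character feature hash only from the operator's hash component and the children's display names. A standalone sketch of that scheme (the helper name and inputs here are illustrative):

```python
import hashlib
from typing import List


def short_feature_hash(op_component: str, child_names: List[str]) -> str:
    # Join the operator component with the child display names and keep the
    # first 8 hex characters of the SHA-256 digest, as get_hash does above.
    joined = "_".join([op_component] + child_names)
    return hashlib.sha256(joined.encode("utf-8")).hexdigest()[:8]


print(short_feature_hash("norm_diff", ["feature_a", "feature_b"]))
```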
upgini/autofe/timeseries/base.py CHANGED
@@ -85,7 +85,7 @@ class TimeSeriesBase(PandasOperator, abc.ABC):
         """
         import re
 
-        offset_regex = f"{base_regex}_offset_(\\d+)([a-zA-Z])$"
+        offset_regex = f"{base_regex}_offset_(\\d+)([a-zA-Z])"
         match = re.match(offset_regex, formula)
 
         if match:
upgini/autofe/timeseries/cross.py CHANGED
@@ -1,13 +1,16 @@
-import json
 from typing import Dict, List, Optional
 
 import numpy as np
 import pandas as pd
 
+try:
+    from pydantic import field_validator as validator  # V2
+except ImportError:
+    from pydantic import validator  # V1
+
 from upgini.autofe.all_operators import find_op
 from upgini.autofe.operator import PandasOperator, ParametrizedOperator
 from upgini.autofe.timeseries.base import TimeSeriesBase
-from upgini.autofe.utils import pydantic_validator
 
 
 class CrossSeriesInteraction(TimeSeriesBase, ParametrizedOperator):
@@ -17,22 +20,11 @@ class CrossSeriesInteraction(TimeSeriesBase, ParametrizedOperator):
     left_descriptor: List[str] = []
     right_descriptor: List[str] = []
 
-    @pydantic_validator("descriptor_indices")
+    @validator("descriptor_indices")
+    @classmethod
     def validate_descriptor_indices(cls, v):
-        if isinstance(v, str):
-            return json.loads(v)
-        return v
-
-    @pydantic_validator("left_descriptor", "right_descriptor", mode="before")
-    def parse_descriptors(cls, v):
-        if isinstance(v, str):
-            return json.loads(v)
-        return v
-
-    @pydantic_validator("interaction_op", mode="before")
-    def validate_interaction_op(cls, v):
-        if isinstance(v, str):
-            return find_op(v)
+        if not v:
+            raise ValueError("descriptor_indices cannot be empty for CrossSeriesInteraction")
         return v
 
     def __init__(self, **data):
@@ -96,9 +88,9 @@ class CrossSeriesInteraction(TimeSeriesBase, ParametrizedOperator):
         res.update(
             {
                 "interaction_op": self._get_interaction_op_name(),
-                "descriptor_indices": json.dumps(self.descriptor_indices),
-                "left_descriptor": json.dumps(self.left_descriptor),
-                "right_descriptor": json.dumps(self.right_descriptor),
+                "descriptor_indices": self.descriptor_indices,
+                "left_descriptor": self.left_descriptor,
+                "right_descriptor": self.right_descriptor,
             }
         )
         return res
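Both rewritten timeseries modules alias the import so one decorator name, `validator`, resolves to `field_validator` on Pydantic 2.x and to the classic `validator` on 1.x. A minimal sketch of the aliasing together with the stacked `@classmethod`, mirroring the CrossSeriesInteraction change (the model below is illustrative):

```python
from typing import List

from pydantic import BaseModel

try:
    from pydantic import field_validator as validator  # Pydantic 2.x
except ImportError:
    from pydantic import validator  # Pydantic 1.x


class InteractionConfig(BaseModel):
    # Illustrative field; CrossSeriesInteraction applies the same non-empty
    # check to descriptor_indices.
    descriptor_indices: List[int] = []

    @validator("descriptor_indices")
    @classmethod
    def validate_descriptor_indices(cls, v):
        if not v:
            raise ValueError("descriptor_indices cannot be empty")
        return v


print(InteractionConfig(descriptor_indices=[0, 1]).descriptor_indices)  # [0, 1]
```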
upgini/autofe/timeseries/roll.py CHANGED
@@ -3,7 +3,6 @@ from typing import Dict, Optional
 
 from upgini.autofe.operator import ParametrizedOperator
 from upgini.autofe.timeseries.base import TimeSeriesBase
-from upgini.autofe.utils import pydantic_validator
 
 # Roll aggregation functions
 roll_aggregations = {
@@ -13,13 +12,19 @@ roll_aggregations = {
     "iqr": lambda x: x.quantile(0.75) - x.quantile(0.25),
 }
 
+try:
+    from pydantic import field_validator as validator  # V2
+except ImportError:
+    from pydantic import validator  # V1
+
 
 
 class Roll(TimeSeriesBase, ParametrizedOperator):
     aggregation: str
     window_size: int = 1
     window_unit: str = "D"
-    @pydantic_validator("window_unit")
+    @validator("window_unit")
+    @classmethod
     def validate_window_unit(cls, v: str) -> str:
         try:
             pd.tseries.frequencies.to_offset(v)
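Roll.validate_window_unit relies on pandas' offset parser to accept or reject a window unit. A small illustrative helper showing the same check outside the model (the error handling here is an assumption, not copied from upgini):

```python
import pandas as pd


def check_window_unit(unit: str) -> str:
    # Any alias the pandas offset parser understands ("D", "h", "15min", ...) is valid.
    try:
        pd.tseries.frequencies.to_offset(unit)
    except ValueError as e:
        raise ValueError(f"Unsupported window_unit: {unit!r}") from e
    return unit


print(check_window_unit("7D"))      # "7D"
# check_window_unit("fortnight")    # would raise ValueError
```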
upgini/autofe/timeseries/trend.py CHANGED
@@ -2,11 +2,10 @@ from typing import Dict, Optional, Union
 import numpy as np
 import pandas as pd
 
-from upgini.autofe.operator import ParametrizedOperator
 from upgini.autofe.timeseries.base import TimeSeriesBase
 
 
-class TrendCoefficient(TimeSeriesBase, ParametrizedOperator):
+class TrendCoefficient(TimeSeriesBase):
     name: str = "trend_coef"
     step_size: int = 1
     step_unit: str = "D"
upgini/features_enricher.py CHANGED
@@ -702,7 +702,6 @@ class FeaturesEnricher(TransformerMixin):
     def transform(
         self,
         X: pd.DataFrame,
-        y: Optional[pd.Series] = None,
         *args,
         exclude_features_sources: Optional[List[str]] = None,
         keep_input: bool = True,
@@ -767,7 +766,6 @@ class FeaturesEnricher(TransformerMixin):
            result, _, _ = self.__inner_transform(
                trace_id,
                X,
-                y=y,
                exclude_features_sources=exclude_features_sources,
                importance_threshold=importance_threshold,
                max_features=max_features,
@@ -1684,6 +1682,7 @@ class FeaturesEnricher(TransformerMixin):
                validated_X,
                validated_y,
                eval_set,
+                is_demo_dataset,
                exclude_features_sources,
                trace_id,
                progress_bar,
@@ -1874,147 +1873,158 @@ class FeaturesEnricher(TransformerMixin):
         validated_X: pd.DataFrame,
         validated_y: pd.Series,
         eval_set: Optional[List[tuple]],
+        is_demo_dataset: bool,
         exclude_features_sources: Optional[List[str]],
         trace_id: str,
         progress_bar: Optional[ProgressBar],
         progress_callback: Optional[Callable[[SearchProgress], Any]],
     ) -> _SampledDataForMetrics:
-        has_eval_set = eval_set is not None
-
-        self.logger.info(f"Transform {'with' if has_eval_set else 'without'} eval_set")
-
-        # Prepare
-        df = self.__combine_train_and_eval_sets(validated_X, validated_y, eval_set)
-        df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
-        df = self.__downsample_for_metrics(df)
-
-        # Transform
+        eval_set_sampled_dict = {}
+        if eval_set is not None:
+            self.logger.info("Transform with eval_set")
+            # concatenate X and eval_set with eval_set_index
+            df = validated_X.copy()
+            df[TARGET] = validated_y
+            df[EVAL_SET_INDEX] = 0
+            for idx, eval_pair in enumerate(eval_set):
+                eval_x, eval_y = self._validate_eval_set_pair(validated_X, eval_pair)
+                eval_df_with_index = eval_x.copy()
+                eval_df_with_index[TARGET] = eval_y
+                eval_df_with_index[EVAL_SET_INDEX] = idx + 1
+                df = pd.concat([df, eval_df_with_index])
+
+            df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
+
+            # downsample if need to eval_set threshold
+            num_samples = _num_samples(df)
+            force_downsampling = (
+                not self.disable_force_downsampling
+                and self.columns_for_online_api is not None
+                and num_samples > Dataset.FORCE_SAMPLE_SIZE
+            )
+            # TODO: check that system_record_id was added before this step
+            if force_downsampling:
+                self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
+                df = balance_undersample_forced(
+                    df=df,
+                    target_column=TARGET,
+                    id_columns=self.id_columns,
+                    date_column=self._get_date_column(self.search_keys),
+                    task_type=self.model_task_type,
+                    cv_type=self.cv,
+                    random_state=self.random_state,
+                    sample_size=Dataset.FORCE_SAMPLE_SIZE,
+                    logger=self.logger,
+                    bundle=self.bundle,
+                    warning_callback=self.__log_warning,
+                )
+            elif num_samples > Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD:
+                self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS}")
+                df = df.sample(n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state)
 
-        enriched_df, _, _ = self.__inner_transform(
-            trace_id,
-            X=df.drop(columns=[TARGET]),
-            y=df[TARGET],
-            exclude_features_sources=exclude_features_sources,
-            silent_mode=True,
-            metrics_calculation=True,
-            progress_bar=progress_bar,
-            progress_callback=progress_callback,
-            add_fit_system_record_id=True,
-        )
-        if enriched_df is None:
-            return None
+            eval_set_sampled_dict = {}
 
-        x_columns = [
-            c
-            for c in (validated_X.columns.tolist() + self.fit_generated_features + [SYSTEM_RECORD_ID])
-            if c in enriched_df.columns
-        ]
+            tmp_target_name = "__target"
+            df = df.rename(columns={TARGET: tmp_target_name})
 
-        X_sampled, y_sampled, enriched_X = self.__extract_train_data(enriched_df, x_columns)
-        eval_set_sampled_dict = self.__extract_eval_data(
-            enriched_df, x_columns, enriched_X.columns.tolist(), len(eval_set) if has_eval_set else 0
-        )
+            enriched_df, columns_renaming, generated_features = self.__inner_transform(
+                trace_id,
+                df,
+                exclude_features_sources=exclude_features_sources,
+                silent_mode=True,
+                metrics_calculation=True,
+                progress_bar=progress_bar,
+                progress_callback=progress_callback,
+                add_fit_system_record_id=True,
+                target_name=tmp_target_name,
+            )
+            if enriched_df is None:
+                return None
 
-        # Cache and return results
-        return self.__cache_and_return_results(
-            validated_X, validated_y, eval_set, X_sampled, y_sampled, enriched_X, eval_set_sampled_dict
-        )
+            enriched_df = enriched_df.rename(columns={tmp_target_name: TARGET})
 
-    def __combine_train_and_eval_sets(
-        self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]]
-    ) -> pd.DataFrame:
-        df = validated_X.copy()
-        df[TARGET] = validated_y
-        if eval_set is None:
-            return df
+            x_columns = [
+                c
+                for c in (validated_X.columns.tolist() + generated_features + [SYSTEM_RECORD_ID])
+                if c in enriched_df.columns
+            ]
 
-        df[EVAL_SET_INDEX] = 0
+            enriched_Xy = enriched_df.query(f"{EVAL_SET_INDEX} == 0")
+            X_sampled = enriched_Xy[x_columns].copy()
+            y_sampled = enriched_Xy[TARGET].copy()
+            enriched_X = enriched_Xy.drop(columns=[TARGET, EVAL_SET_INDEX])
+            enriched_X_columns = enriched_X.columns.tolist()
 
-        for idx, eval_pair in enumerate(eval_set):
-            eval_x, eval_y = self._validate_eval_set_pair(validated_X, eval_pair)
-            eval_df_with_index = eval_x.copy()
-            eval_df_with_index[TARGET] = eval_y
-            eval_df_with_index[EVAL_SET_INDEX] = idx + 1
-            df = pd.concat([df, eval_df_with_index])
+            for idx in range(len(eval_set)):
+                enriched_eval_xy = enriched_df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
+                eval_x_sampled = enriched_eval_xy[x_columns].copy()
+                eval_y_sampled = enriched_eval_xy[TARGET].copy()
+                enriched_eval_x = enriched_eval_xy[enriched_X_columns].copy()
+                eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
+        else:
+            self.logger.info("Transform without eval_set")
+            df = validated_X.copy()
 
-        return df
+            df[TARGET] = validated_y
 
-    def __downsample_for_metrics(self, df: pd.DataFrame) -> pd.DataFrame:
-        num_samples = _num_samples(df)
-        force_downsampling = (
-            not self.disable_force_downsampling
-            and self.columns_for_online_api is not None
-            and num_samples > Dataset.FORCE_SAMPLE_SIZE
-        )
+            df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
 
-        if force_downsampling:
-            self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
-            return balance_undersample_forced(
-                df=df,
-                target_column=TARGET,
-                id_columns=self.id_columns,
-                date_column=self._get_date_column(self.search_keys),
-                task_type=self.model_task_type,
-                cv_type=self.cv,
-                random_state=self.random_state,
-                sample_size=Dataset.FORCE_SAMPLE_SIZE,
-                logger=self.logger,
-                bundle=self.bundle,
-                warning_callback=self.__log_warning,
+            num_samples = _num_samples(df)
+            force_downsampling = (
+                not self.disable_force_downsampling
+                and self.columns_for_online_api is not None
+                and num_samples > Dataset.FORCE_SAMPLE_SIZE
            )
-        elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
-            if EVAL_SET_INDEX in df.columns:
-                threshold = Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD
-                sample_size = Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS
-            else:
-                threshold = Dataset.FIT_SAMPLE_THRESHOLD
-                sample_size = Dataset.FIT_SAMPLE_ROWS
 
-            if num_samples > threshold:
-                self.logger.info(f"Downsampling from {num_samples} to {sample_size}")
-                return df.sample(n=sample_size, random_state=self.random_state)
+            if force_downsampling:
+                self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
+                df = balance_undersample_forced(
+                    df=df,
+                    target_column=TARGET,
+                    id_columns=self.id_columns,
+                    date_column=self._get_date_column(self.search_keys),
+                    task_type=self.model_task_type,
+                    cv_type=self.cv,
+                    random_state=self.random_state,
+                    sample_size=Dataset.FORCE_SAMPLE_SIZE,
+                    logger=self.logger,
+                    bundle=self.bundle,
+                    warning_callback=self.__log_warning,
+                )
+            elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
+                self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
+                df = df.sample(n=Dataset.FIT_SAMPLE_ROWS, random_state=self.random_state)
 
-        return df
+            tmp_target_name = "__target"
+            df = df.rename(columns={TARGET: tmp_target_name})
 
-    def __extract_train_data(
-        self, enriched_df: pd.DataFrame, x_columns: List[str]
-    ) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
-        if EVAL_SET_INDEX in enriched_df.columns:
-            enriched_Xy = enriched_df.query(f"{EVAL_SET_INDEX} == 0")
-        else:
-            enriched_Xy = enriched_df
-        X_sampled = enriched_Xy[x_columns].copy()
-        y_sampled = enriched_Xy[TARGET].copy()
-        enriched_X = enriched_Xy.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
-        return X_sampled, y_sampled, enriched_X
+            enriched_Xy, columns_renaming, generated_features = self.__inner_transform(
+                trace_id,
+                df,
+                exclude_features_sources=exclude_features_sources,
+                silent_mode=True,
+                metrics_calculation=True,
+                progress_bar=progress_bar,
+                progress_callback=progress_callback,
+                add_fit_system_record_id=True,
+                target_name=tmp_target_name,
+            )
+            if enriched_Xy is None:
+                return None
 
-    def __extract_eval_data(
-        self, enriched_df: pd.DataFrame, x_columns: List[str], enriched_X_columns: List[str], eval_set_len: int
-    ) -> Dict[int, Tuple]:
-        eval_set_sampled_dict = {}
+            enriched_Xy = enriched_Xy.rename(columns={tmp_target_name: TARGET})
 
-        for idx in range(eval_set_len):
-            enriched_eval_xy = enriched_df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
-            eval_x_sampled = enriched_eval_xy[x_columns].copy()
-            eval_y_sampled = enriched_eval_xy[TARGET].copy()
-            enriched_eval_x = enriched_eval_xy[enriched_X_columns].copy()
-            eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
+            x_columns = [
+                c
+                for c in (validated_X.columns.tolist() + generated_features + [SYSTEM_RECORD_ID])
+                if c in enriched_Xy.columns
+            ]
 
-        return eval_set_sampled_dict
+            X_sampled = enriched_Xy[x_columns].copy()
+            y_sampled = enriched_Xy[TARGET].copy()
+            enriched_X = enriched_Xy.drop(columns=TARGET)
 
-    def __cache_and_return_results(
-        self,
-        validated_X: pd.DataFrame,
-        validated_y: pd.Series,
-        eval_set: Optional[List[tuple]],
-        X_sampled: pd.DataFrame,
-        y_sampled: pd.Series,
-        enriched_X: pd.DataFrame,
-        eval_set_sampled_dict: Dict[int, Tuple],
-    ) -> _SampledDataForMetrics:
         datasets_hash = hash_input(validated_X, validated_y, eval_set)
-        columns_renaming = getattr(self, "fit_columns_renaming", {})
-
         self.__cached_sampled_datasets[datasets_hash] = (
             X_sampled,
             y_sampled,
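The rewritten block above inlines the sampling decision that the previous release delegated to __downsample_for_metrics. A schematic sketch of that decision tree, with illustrative constants standing in for the Dataset thresholds and a plain sample() standing in for balance_undersample_forced:

```python
import pandas as pd

# Illustrative stand-ins; the real values are class attributes of upgini.dataset.Dataset.
FORCE_SAMPLE_SIZE = 7_000
FIT_SAMPLE_THRESHOLD, FIT_SAMPLE_ROWS = 200_000, 100_000
FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD, FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000, 100_000


def downsample_for_metrics(df: pd.DataFrame, has_eval_set: bool, force: bool, seed: int = 42) -> pd.DataFrame:
    num_samples = len(df)
    if force and num_samples > FORCE_SAMPLE_SIZE:
        # The real code calls balance_undersample_forced() here to preserve target balance.
        return df.sample(n=FORCE_SAMPLE_SIZE, random_state=seed)
    threshold = FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD if has_eval_set else FIT_SAMPLE_THRESHOLD
    rows = FIT_SAMPLE_WITH_EVAL_SET_ROWS if has_eval_set else FIT_SAMPLE_ROWS
    if num_samples > threshold:
        return df.sample(n=rows, random_state=seed)
    return df
```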
@@ -2151,7 +2161,6 @@ if response.status_code == 200:
         trace_id: str,
         X: pd.DataFrame,
         *,
-        y: Optional[pd.Series] = None,
         exclude_features_sources: Optional[List[str]] = None,
         importance_threshold: Optional[float] = None,
         max_features: Optional[int] = None,
@@ -2170,14 +2179,8 @@ if response.status_code == 200:
            self.logger.info("Start transform")
 
            validated_X = self._validate_X(X, is_transform=True)
-            if y is not None:
-                validated_y = self._validate_y(validated_X, y)
-                df = self.__combine_train_and_eval_sets(validated_X, validated_y, eval_set=None)
-            else:
-                validated_y = None
-                df = validated_X
 
-            self.__log_debug_information(validated_X, validated_y, exclude_features_sources=exclude_features_sources)
+            self.__log_debug_information(validated_X, exclude_features_sources=exclude_features_sources)
 
            self.__validate_search_keys(self.search_keys, self.search_id)
 
@@ -2220,27 +2223,29 @@ if response.status_code == 200:
                self.logger.info(msg)
                print(msg)
 
-            is_demo_dataset = hash_input(df) in DEMO_DATASET_HASHES
+            is_demo_dataset = hash_input(validated_X) in DEMO_DATASET_HASHES
 
            columns_to_drop = [
-                c for c in df.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
+                c for c in validated_X.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
            ]
            if len(columns_to_drop) > 0:
                msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
                self.logger.warning(msg)
                print(msg)
-                df = df.drop(columns=columns_to_drop)
+                validated_X = validated_X.drop(columns=columns_to_drop)
 
            search_keys = self.search_keys.copy()
            if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
-                search_keys.update(
+                self.search_keys.update(
                    {col: SearchKey.CUSTOM_KEY for col in self.id_columns if col not in self.search_keys}
                )
 
            search_keys = self.__prepare_search_keys(
-                df, search_keys, is_demo_dataset, is_transform=True, silent_mode=silent_mode
+                validated_X, search_keys, is_demo_dataset, is_transform=True, silent_mode=silent_mode
            )
 
+            df = validated_X.copy()
+
            df = self.__handle_index_search_keys(df, search_keys)
 
            if DEFAULT_INDEX in df.columns:
@@ -2279,11 +2284,8 @@ if response.status_code == 200:
            features_for_transform = self._search_task.get_features_for_transform() or []
            if len(features_for_transform) > 0:
                missing_features_for_transform = [
-                    columns_renaming.get(f) or f for f in features_for_transform if f not in df.columns
+                    columns_renaming.get(f) for f in features_for_transform if f not in df.columns
                ]
-                if TARGET in missing_features_for_transform:
-                    raise ValidationError(self.bundle.get("missing_target_for_transform"))
-
                if len(missing_features_for_transform) > 0:
                    raise ValidationError(
                        self.bundle.get("missing_features_for_transform").format(missing_features_for_transform)
@@ -2339,10 +2341,11 @@ if response.status_code == 200:
                converter = PostalCodeSearchKeyConverter(postal_code)
                df = converter.convert(df)
 
-            meaning_types = {}
-            meaning_types.update({col: FileColumnMeaningType.FEATURE for col in features_for_transform})
-            meaning_types.update({col: key.value for col, key in search_keys.items()})
+            # generated_features = [f for f in generated_features if f in self.fit_generated_features]
 
+            meaning_types = {col: key.value for col, key in search_keys.items()}
+            for col in features_for_transform:
+                meaning_types[col] = FileColumnMeaningType.FEATURE
            features_not_to_pass = [
                c
                for c in df.columns
@@ -2351,12 +2354,13 @@ if response.status_code == 200:
                and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
            ]
 
-            if add_fit_system_record_id:
+            if add_fit_system_record_id and target_name is not None:
+                reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
                df = self.__add_fit_system_record_id(
                    df,
                    search_keys,
                    SYSTEM_RECORD_ID,
-                    TARGET,
+                    reversed_columns_renaming.get(target_name, target_name),
                    columns_renaming,
                    silent=True,
                )
@@ -4066,7 +4070,10 @@ if response.status_code == 200:
            )
 
        if all(k == SearchKey.CUSTOM_KEY for k in valid_search_keys.values()):
-            msg = self.bundle.get("unregistered_only_personal_keys")
+            if self.__is_registered:
+                msg = self.bundle.get("only_custom_keys")
+            else:
+                msg = self.bundle.get("unregistered_only_personal_keys")
            self.logger.warning(msg + f" Provided search keys: {search_keys}")
            raise ValidationError(msg)
 
upgini/metrics.py CHANGED
@@ -11,7 +11,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 import catboost
 import numpy as np
 import pandas as pd
-from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor, Pool
+from lightgbm import LGBMClassifier, LGBMRegressor
 from numpy import log1p
 from pandas.api.types import is_numeric_dtype
 from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
@@ -88,11 +88,18 @@ CATBOOST_MULTICLASS_PARAMS = {
 
 LIGHTGBM_PARAMS = {
     "random_state": DEFAULT_RANDOM_STATE,
-    "num_leaves": 16,
+    # "num_leaves": 16,
+    # "n_estimators": 150,
+    # "min_child_weight": 1,
     "max_depth": 4,
-    "n_estimators": 150,
+    "max_cat_threshold": 80,
+    "min_data_per_group": 25,
+    "num_boost_round": 150,
+    "cat_l2": 10,
+    "cat_smooth": 12,
     "learning_rate": 0.05,
-    "min_child_weight": 1,
+    "feature_fraction": 1.0,
+    "min_sum_hessian_in_leaf": 0.01,
 }
 
 N_FOLDS = 5
@@ -211,6 +218,14 @@ SUPPORTED_CATBOOST_METRICS = {
 }
 
 
+def is_catboost_estimator(estimator):
+    try:
+        from catboost import CatBoostClassifier, CatBoostRegressor
+        return isinstance(estimator, (CatBoostClassifier, CatBoostRegressor))
+    except ImportError:
+        return False
+
+
 @dataclass
 class _CrossValResults:
     metric: Optional[float]
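is_catboost_estimator makes catboost an optional import: the metrics code only touches it when an actual CatBoost model is passed in. A quick usage sketch (assumes this build of upgini plus lightgbm are installed; catboost may or may not be):

```python
from lightgbm import LGBMRegressor

from upgini.metrics import is_catboost_estimator

print(is_catboost_estimator(LGBMRegressor()))  # False: routed to LightGBMWrapper instead

try:
    from catboost import CatBoostRegressor
    print(is_catboost_estimator(CatBoostRegressor()))  # True when catboost is importable
except ImportError:
    print("catboost not installed; is_catboost_estimator() simply returns False")
```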
@@ -351,7 +366,7 @@ class EstimatorWrapper:
            if shaps is not None:
                for feature, shap_value in shaps.items():
                    # shap_values_all_folds[feature] = shap_values_all_folds.get(feature, []) + shap_value.tolist()
-                    shap_values_all_folds[feature].extend(shap_value.tolist())
+                    shap_values_all_folds[feature].append(shap_value)
 
        if shap_values_all_folds:
            average_shap_values = {
@@ -431,17 +446,26 @@ class EstimatorWrapper:
            # if metric_name.upper() in SUPPORTED_CATBOOST_METRICS:
            #     params["eval_metric"] = SUPPORTED_CATBOOST_METRICS[metric_name.upper()]
            if target_type == ModelTaskType.MULTICLASS:
-                params = _get_add_params(params, CATBOOST_MULTICLASS_PARAMS)
+                # params = _get_add_params(params, CATBOOST_MULTICLASS_PARAMS)
+                # params = _get_add_params(params, add_params)
+                # estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
+                params = _get_add_params(params, LIGHTGBM_PARAMS)
                params = _get_add_params(params, add_params)
-                estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
+                estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
            elif target_type == ModelTaskType.BINARY:
-                params = _get_add_params(params, CATBOOST_BINARY_PARAMS)
+                # params = _get_add_params(params, CATBOOST_BINARY_PARAMS)
+                # params = _get_add_params(params, add_params)
+                # estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
+                params = _get_add_params(params, LIGHTGBM_PARAMS)
                params = _get_add_params(params, add_params)
-                estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
+                estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
            elif target_type == ModelTaskType.REGRESSION:
-                params = _get_add_params(params, CATBOOST_REGRESSION_PARAMS)
+                # params = _get_add_params(params, CATBOOST_REGRESSION_PARAMS)
+                # params = _get_add_params(params, add_params)
+                # estimator = CatBoostWrapper(CatBoostRegressor(**params), **kwargs)
+                params = _get_add_params(params, LIGHTGBM_PARAMS)
                params = _get_add_params(params, add_params)
-                estimator = CatBoostWrapper(CatBoostRegressor(**params), **kwargs)
+                estimator = LightGBMWrapper(LGBMRegressor(**params), **kwargs)
            else:
                raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
        else:
@@ -450,7 +474,7 @@ class EstimatorWrapper:
        else:
            estimator_copy = deepcopy(estimator)
            kwargs["estimator"] = estimator_copy
-            if isinstance(estimator, (CatBoostClassifier, CatBoostRegressor)):
+            if is_catboost_estimator(estimator):
                if cat_features is not None:
                    for cat_feature in cat_features:
                        if cat_feature not in x.columns:
@@ -458,23 +482,13 @@ class EstimatorWrapper:
                                f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
                            )
                estimator_copy.set_params(
-                    # cat_features=[x.columns.get_loc(cat_feature) for cat_feature in cat_features]
                    cat_features=cat_features
                )
                estimator = CatBoostWrapper(**kwargs)
            else:
-                try:
-                    from lightgbm import LGBMClassifier, LGBMRegressor
-
-                    if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
-                        estimator = LightGBMWrapper(**kwargs)
-                    else:
-                        logger.warning(
-                            f"Unexpected estimator is used for metrics: {estimator}. "
-                            "Default strategy for category features will be used"
-                        )
-                        estimator = OtherEstimatorWrapper(**kwargs)
-                except ModuleNotFoundError:
+                if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
+                    estimator = LightGBMWrapper(**kwargs)
+                else:
                    logger.warning(
                        f"Unexpected estimator is used for metrics: {estimator}. "
                        "Default strategy for category features will be used"
@@ -487,7 +501,7 @@ class EstimatorWrapper:
 class CatBoostWrapper(EstimatorWrapper):
     def __init__(
         self,
-        estimator: Union[CatBoostClassifier, CatBoostRegressor],
+        estimator,
         scorer: Callable,
         metric_name: str,
         multiplier: int,
@@ -517,6 +531,7 @@ class CatBoostWrapper(EstimatorWrapper):
        x, y, groups, params = super()._prepare_to_fit(x, y)
 
        # Find embeddings
+        from catboost import CatBoostClassifier
        if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
            emb_pattern = r"(.+)_emb\d+"
            self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
@@ -637,8 +652,9 @@ class CatBoostWrapper(EstimatorWrapper):
        else:
            raise e
 
-    def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator: CatBoost) -> Optional[Dict[str, float]]:
+    def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
        try:
+            from catboost import Pool
            # Create Pool for fold data, if need (for example, when categorical features are present)
            fold_pool = Pool(
                x,
@@ -715,6 +731,34 @@ class LightGBMWrapper(EstimatorWrapper):
            y = correct_string_target(y)
        return x, y, params
 
+    def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
+        try:
+            import shap
+            import lightgbm as lgb
+
+            if not isinstance(estimator, (lgb.LGBMRegressor, lgb.LGBMClassifier)):
+                return None
+
+            explainer = shap.TreeExplainer(estimator)
+
+            shap_values = explainer.shap_values(x)
+
+            # For classification, shap_values is returned as a list for each class
+            # Take values for the positive class
+            if isinstance(shap_values, list):
+                shap_values = shap_values[1]
+
+            # Calculate mean absolute SHAP value for each feature
+            feature_importance = {}
+            for i, col in enumerate(x.columns):
+                feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
+
+            return feature_importance
+
+        except Exception as e:
+            self.logger.warning(f"Failed to calculate SHAP values: {str(e)}")
+            return None
+
 
 class OtherEstimatorWrapper(EstimatorWrapper):
    def __init__(
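LightGBMWrapper.calculate_shap reduces SHAP values to a mean absolute contribution per feature. A standalone sketch of the same computation on a toy regressor (data and model are illustrative; shap>=0.44.0 and lightgbm are assumed installed, matching the new dependency list):

```python
import numpy as np
import pandas as pd
import shap
from lightgbm import LGBMRegressor

rng = np.random.default_rng(0)
X = pd.DataFrame({"f1": rng.normal(size=200), "f2": rng.normal(size=200)})
y = 3 * X["f1"] - X["f2"] + rng.normal(scale=0.1, size=200)

model = LGBMRegressor(n_estimators=50).fit(X, y)

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)  # shape (n_samples, n_features) for regression

# Mean absolute SHAP value per feature, as in calculate_shap above
importance = {col: float(np.mean(np.abs(shap_values[:, i]))) for i, col in enumerate(X.columns)}
print(importance)  # f1 should dominate f2
```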
upgini/resource_bundle/strings.properties CHANGED
@@ -80,6 +80,7 @@ email_and_hem_simultanious=EMAIL and HEM search keys cannot be used simultaneous
 postal_code_without_country=COUNTRY search key required if POSTAL_CODE is present
 multiple_search_key=Search key {} passed multiple times
 unregistered_only_personal_keys=Only personal search keys used. Api_key from profile.upgini.com required for EMAIL/HEM, PHONE NUMBER or IPv4/IPv6 search keys\nSee docs https://github.com/upgini/upgini#-open-up-all-capabilities-of-upgini
+only_custom_keys=Only CUSTOM_KEY search keys were provided. At least one of DATE, COUNTRY, POSTAL_CODE, PHONE, EMAIL, HEM, IP should be provided
 search_key_not_found=Column `{}` from search_keys was not found in X dataframe: {}
 numeric_search_key_not_found=Index {} in search_keys is out of bounds for {} columns of X dataframe
 unsupported_search_key_type=Unsupported type of key in search_keys: {}
@@ -136,7 +137,6 @@ x_and_eval_x_diff_types=X and eval_set X has different types: {} and {}
 baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
 baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and and retry
 missing_features_for_transform=Missing some features for transform that were presented on fit: {}
-missing_target_for_transform=Search contains features on target. Please add y to the call and try again
 missing_id_column=Id column {} not found in X
 # target validation
 empty_target=Target is empty in all rows
upgini/search_task.py CHANGED
@@ -168,13 +168,7 @@ class SearchTask:
         for meta in self.provider_metadata_v2:
             if meta.features_used_for_embeddings is not None:
                 features_for_transform.update(meta.features_used_for_embeddings)
-            if meta.generated_features:
-                features_for_transform.update(
-                    c.original_name
-                    for f in meta.generated_features
-                    for c in f.base_columns
-                    if c.ads_definition_id is None
-                )
+
         return list(features_for_transform)
 
     def get_shuffle_kfold(self) -> Optional[bool]:
upgini/utils/mstats.py CHANGED
@@ -118,7 +118,7 @@ def spearmanr(
     # - dof: degrees of freedom
     # - t_stat: t-statistic
     # - alternative: 'two-sided', 'greater', 'less'
-    def compute_t_pvalue(t_stat, dof, alternative="two-sided"):
+    def compute_t_pvalue(t_stat, dof, alternative='two-sided'):
        from scipy.stats import t
 
        if alternative == "two-sided":
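compute_t_pvalue itself only changed quoting style here; for reference, the standard two-sided p-value from a t-statistic and its degrees of freedom is computed like this (a generic formula, not copied from mstats.py):

```python
from scipy.stats import t


def two_sided_t_pvalue(t_stat: float, dof: int) -> float:
    # P(|T| >= |t_stat|) under a Student-t distribution with `dof` degrees of freedom
    return 2 * t.sf(abs(t_stat), dof)


print(two_sided_t_pvalue(2.1, 28))  # roughly 0.045
```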
upgini-1.2.68a3832.dev2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.68a3818.dev4
+Version: 1.2.68a3832.dev2
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
@@ -23,12 +23,12 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Requires-Python: <3.12,>=3.8
-Requires-Dist: catboost>=1.0.3
 Requires-Dist: fastparquet>=0.8.1
 Requires-Dist: ipywidgets>=8.1.0
 Requires-Dist: jarowinkler>=2.0.0
 Requires-Dist: levenshtein>=0.25.1
-Requires-Dist: numpy<=1.26.4,>=1.19.0
+Requires-Dist: lightgbm>=4.6.0
+Requires-Dist: numpy<3.0.0,>=1.19.0
 Requires-Dist: pandas<3.0.0,>=1.1.0
 Requires-Dist: psutil>=6.0.0
 Requires-Dist: pydantic<3.0.0,>1.0.0
@@ -39,6 +39,7 @@ Requires-Dist: python-json-logger>=3.3.0
 Requires-Dist: requests>=2.8.0
 Requires-Dist: scikit-learn>=1.3.0
 Requires-Dist: scipy>=1.10.0
+Requires-Dist: shap>=0.44.0
 Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
 Description-Content-Type: text/markdown
 
upgini-1.2.68a3832.dev2.dist-info/RECORD CHANGED
@@ -1,14 +1,14 @@
-upgini/__about__.py,sha256=pr0OsTaI3yNNQF9UIlaDHZZ2gJf_aMlqsKQJLQbnalw,33
+upgini/__about__.py,sha256=CkeEtpLS48GUJZm5YngYZIgkl2XmylbHJZDvIw0AP1M,33
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=OGjpeFHbj3lWiZTOHTpWEoMMDmFY1FlNC44FKktoZvU,34956
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=KBTdADF7_Wj3uDROYdevukOk6R8LVQw47gJkH4M1_iQ,204435
+upgini/features_enricher.py,sha256=GXXx14jwf3F26_KrfJ6O40Vcu1hRx5iBjUB_jxy3Xvg,205476
 upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
 upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
 upgini/metadata.py,sha256=Jh6YTaS00m_nbaOY_owvlSyn9zgkErkqu8iTr9ZjKI8,12279
-upgini/metrics.py,sha256=t7uOOnlDYvP6E3DLjPMQcFBjyhJfUQY8aUlx7N0Mh-s,35477
-upgini/search_task.py,sha256=EuCGp0iCWz2fpuJgN6M47aP_CtIi3Oq9zw78w0mkKiU,17595
+upgini/metrics.py,sha256=OW2a3UWdMEkhRv7XDJvgBsc3iU6RLC5mtqvT1fLURwk,36983
+upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
 upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
@@ -16,20 +16,19 @@ upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo
 upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/autofe/all_operators.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
 upgini/autofe/binary.py,sha256=MnQuFiERpocjCPQUjOljlsq5FE-04GPfwtNjzvfNMyU,7671
-upgini/autofe/date.py,sha256=C86F7sPiscUGq2a45UtQA9ADWBWg0kt54mePHHzjbLE,10633
-upgini/autofe/feature.py,sha256=y1x3wijhTVBmloayQAHiscqKU9Ll8kLcGm1PdvS357I,14910
+upgini/autofe/date.py,sha256=I07psJerrxOcHao91PdSCk9X6KWu61IBVyFRLjGNgK8,10730
+upgini/autofe/feature.py,sha256=xgu6bVIlUJ5PCUgoXQRNcGkcMOhj-_BdDRmkB_qRFS4,14766
 upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
 upgini/autofe/operator.py,sha256=EOffJw6vKXpEh5yymqb1RFNJPxGxmnHdFRo9dB5SCFo,4969
 upgini/autofe/unary.py,sha256=yVgPvtfnPSOhrii0YgezddmgWPwyOBCR0JutaIkdTTc,4658
-upgini/autofe/utils.py,sha256=fK1am2_tQj3fL2vDslblye8lmyfWgGIUOX1beYVBz4k,2420
 upgini/autofe/vector.py,sha256=l0KdKg-txlZxDSE4hPPfCtfGQofYbl7oaABPr830sPI,667
 upgini/autofe/timeseries/__init__.py,sha256=PGwwDAMwvkXl3el12tXVEmZUgDUvlmIPlXtROm6bD18,738
-upgini/autofe/timeseries/base.py,sha256=-bzVQ3YgZ5FkRZ64Pm1_fZugaKCixBIDPxOvo6pI370,3756
-upgini/autofe/timeseries/cross.py,sha256=M3aKc_yKSG3Q1xpoTUd51K8kuLYLBJepw9lcvWZCF5Y,5219
+upgini/autofe/timeseries/base.py,sha256=T9Ec8LKJbiwTUGGsd_xhM0U0NUJblqmKchkzUI1sK88,3755
+upgini/autofe/timeseries/cross.py,sha256=Sh5hAXZFWKaFRqf_JGODu9pWO2tmuV5VKyK9eX3i7-I,4931
 upgini/autofe/timeseries/delta.py,sha256=h0YhmI1TlPJnjwFpN_GQxLb6r59DQuucnG5tQAXSgjU,3520
 upgini/autofe/timeseries/lag.py,sha256=LfQtg484vuqM0mgY4Wft1swHX_Srq7OKKgZswCXoiXI,1882
-upgini/autofe/timeseries/roll.py,sha256=zADKXU-eYWQnQ5R3am1yEal8uU6Tm0jLAixwPb_aCHg,2794
-upgini/autofe/timeseries/trend.py,sha256=K1_iw2ko_LIUU8YCUgrvN3n0MkHtsi7-63-8x9er1k4,2129
+upgini/autofe/timeseries/roll.py,sha256=bNFMDszSYTWvB7EyhHbRY1DJqzSURvHlPAcBebt0y0Y,2878
+upgini/autofe/timeseries/trend.py,sha256=9p2Q5ByAi6cx9RH9teBTe8FyjSzqthznC2Lo5dsJ0ho,2051
 upgini/autofe/timeseries/volatility.py,sha256=9shUmIKjpWTHVYjj80YBsk0XheBJ9uBuLv5NW9Mchnk,7953
 upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/data_source/data_source_publisher.py,sha256=4S9qwlAklD8vg9tUU_c1pHE2_glUHAh15-wr5hMwKFw,22879
@@ -39,7 +38,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
 upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
 upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
 upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
-upgini/resource_bundle/strings.properties,sha256=LDT-jtYlrD1IXvWjFSf-dtvapje0qSrqI9W3v7y2zVo,27646
+upgini/resource_bundle/strings.properties,sha256=XU5ulr5ZDQfGbFk9QdFDzl3oDMaw0eDYCPoEq3ZvIkw,27687
 upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
 upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -61,7 +60,7 @@ upgini/utils/feature_info.py,sha256=m1tQcT3hTChPAiXzpk0WQcEqElj8KgeCifEJFa7-gss,
 upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
 upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
 upgini/utils/ip_utils.py,sha256=TSQ_qDsLlVnm09X1HacpabEf_HNqSWpxBF4Sdc2xs08,6580
-upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
+upgini/utils/mstats.py,sha256=dLJQr5Ak5BAoV-pDPpnfvMURZVkZ3_v250QzAsSlqY4,6286
 upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
 upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
 upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
@@ -71,7 +70,7 @@ upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.68a3818.dev4.dist-info/METADATA,sha256=2_w7FmOcbRz6S74tyelXCUiF1A77KUXsDZmA3eZDQw0,49123
-upgini-1.2.68a3818.dev4.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
-upgini-1.2.68a3818.dev4.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.68a3818.dev4.dist-info/RECORD,,
+upgini-1.2.68a3832.dev2.dist-info/METADATA,sha256=P4ETW0O44yMgZ2Yr-Q44ngLPzagOuuLhOPN16qsCysE,49149
+upgini-1.2.68a3832.dev2.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.68a3832.dev2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.68a3832.dev2.dist-info/RECORD,,
upgini-1.2.68a3832.dev2.dist-info/WHEEL CHANGED
@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: hatchling 1.24.2
+Generator: hatchling 1.25.0
 Root-Is-Purelib: true
 Tag: py3-none-any
upgini/autofe/utils.py DELETED
@@ -1,83 +0,0 @@
-"""
-Utility functions for autofe module.
-"""
-
-import functools
-from typing import Callable
-
-
-def get_pydantic_version():
-    """
-    Get the major version of pydantic.
-
-    Returns:
-        int: Major version number (1 or 2)
-    """
-    try:
-        from pydantic import __version__ as pydantic_version
-
-        major_version = int(pydantic_version.split(".")[0])
-        return major_version
-    except (ImportError, ValueError):
-        # Default to version 1 if unable to determine
-        return 1
-
-
-def pydantic_validator(field_name: str, *fields, mode: str = "before", **kwargs):
-    """
-    A decorator that applies the appropriate Pydantic validator based on the installed version.
-
-    This decorator handles the differences between Pydantic v1 and v2 validator syntax,
-    making it easier to write code that works with both versions.
-
-    Args:
-        field_name (str): The name of the field to validate
-        mode (str): The validation mode, either "before" or "after" (for Pydantic v2)
-        **kwargs: Additional arguments to pass to the validator
-
-    Returns:
-        Callable: A decorator that can be applied to validator methods
-
-    Example:
-        ```python
-        class MyModel(BaseModel):
-            items: List[int]
-
-            @pydantic_validator("items")
-            def parse_items(cls, value):
-                if isinstance(value, str):
-                    return [int(x) for x in value.split(",")]
-                return value
-        ```
-    """
-    pydantic_version = get_pydantic_version()
-
-    if pydantic_version >= 2:
-        # Use field_validator for Pydantic 2.x
-        from pydantic import field_validator
-
-        def decorator(func: Callable) -> Callable:
-            @field_validator(field_name, *fields, mode=mode, **kwargs)
-            @functools.wraps(func)
-            def wrapper(cls, value, **kw):
-                return func(cls, value)
-
-            return wrapper
-
-        return decorator
-    else:
-        # Use validator for Pydantic 1.x
-        from pydantic import validator
-
-        # Map mode to Pydantic v1 parameters
-        pre = True if mode == "before" else False
-
-        def decorator(func: Callable) -> Callable:
-            @validator(field_name, *fields, pre=pre, **kwargs)
-            @functools.wraps(func)
-            def wrapper(cls, value, **kw):
-                return func(cls, value)
-
-            return wrapper
-
-        return decorator