upgini 1.2.66a3818.dev1__py3-none-any.whl → 1.2.68a3818.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of upgini might be problematic.

upgini/__about__.py CHANGED
@@ -1 +1 @@
- __version__ = "1.2.66a3818.dev1"
+ __version__ = "1.2.68a3818.dev1"
upgini/autofe/date.py CHANGED
@@ -8,6 +8,7 @@ from pandas.core.arrays.timedeltas import TimedeltaArray
  from pydantic import BaseModel, __version__ as pydantic_version

  from upgini.autofe.operator import PandasOperator, ParametrizedOperator
+ from upgini.autofe.utils import pydantic_validator


  def get_pydantic_version():
@@ -209,6 +210,14 @@ class DateListDiffBounded(DateListDiff, ParametrizedOperator):

  return cls(diff_unit=diff_unit, lower_bound=lower_bound, upper_bound=upper_bound, aggregation=aggregation)

+ def get_params(self) -> Dict[str, Optional[str]]:
+ res = super().get_params()
+ if self.lower_bound is not None:
+ res["lower_bound"] = str(self.lower_bound)
+ if self.upper_bound is not None:
+ res["upper_bound"] = str(self.upper_bound)
+ return res
+
  def _agg(self, x):
  x = x[
  (x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
@@ -269,32 +278,17 @@ class DatePercentile(DatePercentileBase):
  {
  "zero_month": self.zero_month,
  "zero_year": self.zero_year,
- "zero_bounds": self.zero_bounds,
+ "zero_bounds": json.dumps(self.zero_bounds),
  "step": self.step,
  }
  )
  return res

- # Check Pydantic version
- if get_pydantic_version() >= 2:
- # Use @field_validator for Pydantic 2.x
- from pydantic import field_validator
-
- @field_validator("zero_bounds", mode="before")
- def parse_zero_bounds(cls, value):
- if isinstance(value, str):
- return json.loads(value)
- return value
-
- else:
- # Use @validator for Pydantic 1.x
- from pydantic import validator
-
- @validator("zero_bounds", pre=True)
- def parse_zero_bounds(cls, value):
- if isinstance(value, str):
- return json.loads(value)
- return value
+ @pydantic_validator("zero_bounds", mode="before")
+ def parse_zero_bounds(cls, value):
+ if isinstance(value, str):
+ return json.loads(value)
+ return value

  def _get_bounds(self, date_col: pd.Series) -> pd.Series:
  months = date_col.dt.month
upgini/autofe/timeseries/cross.py CHANGED
@@ -1,16 +1,13 @@
+ import json
  from typing import Dict, List, Optional

  import numpy as np
  import pandas as pd

- try:
- from pydantic import field_validator as validator # V2
- except ImportError:
- from pydantic import validator # V1
-
  from upgini.autofe.all_operators import find_op
  from upgini.autofe.operator import PandasOperator, ParametrizedOperator
  from upgini.autofe.timeseries.base import TimeSeriesBase
+ from upgini.autofe.utils import pydantic_validator


  class CrossSeriesInteraction(TimeSeriesBase, ParametrizedOperator):
@@ -20,13 +17,24 @@ class CrossSeriesInteraction(TimeSeriesBase, ParametrizedOperator):
  left_descriptor: List[str] = []
  right_descriptor: List[str] = []

- @validator("descriptor_indices")
- @classmethod
+ @pydantic_validator("descriptor_indices")
  def validate_descriptor_indices(cls, v):
  if not v:
  raise ValueError("descriptor_indices cannot be empty for CrossSeriesInteraction")
  return v

+ @pydantic_validator("left_descriptor", "right_descriptor", mode="before")
+ def parse_descriptors(cls, v):
+ if isinstance(v, str):
+ return json.loads(v)
+ return v
+
+ @pydantic_validator("interaction_op", mode="before")
+ def validate_interaction_op(cls, v):
+ if isinstance(v, str):
+ return find_op(v)
+ return v
+
  def __init__(self, **data):
  super().__init__(**data)
  indices = self.descriptor_indices
upgini/autofe/timeseries/roll.py CHANGED
@@ -3,6 +3,7 @@ from typing import Dict, Optional

  from upgini.autofe.operator import ParametrizedOperator
  from upgini.autofe.timeseries.base import TimeSeriesBase
+ from upgini.autofe.utils import pydantic_validator

  # Roll aggregation functions
  roll_aggregations = {
@@ -12,19 +13,13 @@ roll_aggregations = {
  "iqr": lambda x: x.quantile(0.75) - x.quantile(0.25),
  }

- try:
- from pydantic import field_validator as validator # V2
- except ImportError:
- from pydantic import validator # V1
-

  class Roll(TimeSeriesBase, ParametrizedOperator):
  aggregation: str
  window_size: int = 1
  window_unit: str = "D"

- @validator("window_unit")
- @classmethod
+ @pydantic_validator("window_unit")
  def validate_window_unit(cls, v: str) -> str:
  try:
  pd.tseries.frequencies.to_offset(v)
upgini/autofe/utils.py ADDED
@@ -0,0 +1,83 @@
+ """
+ Utility functions for autofe module.
+ """
+
+ import functools
+ from typing import Callable
+
+
+ def get_pydantic_version():
+     """
+     Get the major version of pydantic.
+
+     Returns:
+         int: Major version number (1 or 2)
+     """
+     try:
+         from pydantic import __version__ as pydantic_version
+
+         major_version = int(pydantic_version.split(".")[0])
+         return major_version
+     except (ImportError, ValueError):
+         # Default to version 1 if unable to determine
+         return 1
+
+
+ def pydantic_validator(field_name: str, *fields, mode: str = "before", **kwargs):
+     """
+     A decorator that applies the appropriate Pydantic validator based on the installed version.
+
+     This decorator handles the differences between Pydantic v1 and v2 validator syntax,
+     making it easier to write code that works with both versions.
+
+     Args:
+         field_name (str): The name of the field to validate
+         mode (str): The validation mode, either "before" or "after" (for Pydantic v2)
+         **kwargs: Additional arguments to pass to the validator
+
+     Returns:
+         Callable: A decorator that can be applied to validator methods
+
+     Example:
+         ```python
+         class MyModel(BaseModel):
+             items: List[int]
+
+             @pydantic_validator("items")
+             def parse_items(cls, value):
+                 if isinstance(value, str):
+                     return [int(x) for x in value.split(",")]
+                 return value
+         ```
+     """
+     pydantic_version = get_pydantic_version()
+
+     if pydantic_version >= 2:
+         # Use field_validator for Pydantic 2.x
+         from pydantic import field_validator
+
+         def decorator(func: Callable) -> Callable:
+             @field_validator(field_name, *fields, mode=mode, **kwargs)
+             @functools.wraps(func)
+             def wrapper(cls, value, **kw):
+                 return func(cls, value)
+
+             return wrapper
+
+         return decorator
+     else:
+         # Use validator for Pydantic 1.x
+         from pydantic import validator
+
+         # Map mode to Pydantic v1 parameters
+         pre = True if mode == "before" else False
+
+         def decorator(func: Callable) -> Callable:
+             @validator(field_name, *fields, pre=pre, **kwargs)
+             @functools.wraps(func)
+             def wrapper(cls, value, **kw):
+                 return func(cls, value)
+
+             return wrapper
+
+         return decorator
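The docstring above already sketches the intended usage. For a quick sanity check, here is a minimal self-contained sketch (the `Bounds` model and its field are illustrative only, not part of upgini) showing the same decorator parsing a JSON-encoded field under either Pydantic major version:

```python
import json
from typing import List

from pydantic import BaseModel

from upgini.autofe.utils import pydantic_validator


class Bounds(BaseModel):
    # Hypothetical model, used here only to exercise the decorator
    zero_bounds: List[float] = []

    # mode="before" maps to pre=True on Pydantic 1.x, so the raw string
    # reaches the validator before type coercion on both major versions.
    @pydantic_validator("zero_bounds", mode="before")
    def parse_zero_bounds(cls, value):
        if isinstance(value, str):
            return json.loads(value)
        return value


print(Bounds(zero_bounds="[0.1, 0.5, 0.9]").zero_bounds)  # [0.1, 0.5, 0.9]
```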
upgini/features_enricher.py CHANGED
@@ -308,7 +308,8 @@ class FeaturesEnricher(TransformerMixin):
  self._search_task = search_task.poll_result(trace_id, quiet=True, check_fit=True)
  file_metadata = self._search_task.get_file_metadata(trace_id)
  x_columns = [c.originalName or c.name for c in file_metadata.columns]
- self.__prepare_feature_importances(trace_id, x_columns, silent=True)
+ df = pd.DataFrame(columns=x_columns)
+ self.__prepare_feature_importances(trace_id, df, silent=True)
  # TODO validate search_keys with search_keys from file_metadata
  print(self.bundle.get("search_by_task_id_finish"))
  self.logger.debug(f"Successfully initialized with search_id: {search_id}")
@@ -701,6 +702,7 @@ class FeaturesEnricher(TransformerMixin):
  def transform(
  self,
  X: pd.DataFrame,
+ y: Optional[pd.Series] = None,
  *args,
  exclude_features_sources: Optional[List[str]] = None,
  keep_input: bool = True,
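With this change `transform` accepts an optional target series. As the new `missing_target_for_transform` message further down suggests, `y` has to be supplied when the fitted search produced features computed on the target. A hedged usage sketch (the frames, keys and values are invented for illustration; only the call shape is taken from the diff):

```python
import pandas as pd

from upgini import FeaturesEnricher, SearchKey

# Illustrative data only
X_new = pd.DataFrame({"phone": ["+14083007077"], "reg_date": ["2024-05-01"]})
y_new = pd.Series([1])

enricher = FeaturesEnricher(
    search_keys={"phone": SearchKey.PHONE, "reg_date": SearchKey.DATE},
)
# ... enricher.fit(X_train, y_train) elided ...

# y is now an optional second argument of transform(); pass it when the
# search contains features generated on the target, otherwise omit it.
enriched_df = enricher.transform(X_new, y_new)
```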
@@ -765,6 +767,7 @@
  result, _, _ = self.__inner_transform(
  trace_id,
  X,
+ y=y,
  exclude_features_sources=exclude_features_sources,
  importance_threshold=importance_threshold,
  max_features=max_features,
@@ -1087,7 +1090,7 @@
  enriched_shaps = enriched_cv_result.shap_values

  if enriched_shaps is not None:
- self._update_shap_values(trace_id, validated_X.columns.to_list(), enriched_shaps)
+ self._update_shap_values(trace_id, fitting_X, enriched_shaps)

  if enriched_metric is None:
  self.logger.warning(
@@ -1255,14 +1258,14 @@
  finally:
  self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")

- def _update_shap_values(self, trace_id: str, x_columns: List[str], new_shaps: Dict[str, float]):
+ def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float]):
  renaming = self.fit_columns_renaming or {}
  new_shaps = {
  renaming.get(feature, feature): _round_shap_value(shap)
  for feature, shap in new_shaps.items()
  if feature in self.feature_names_ or renaming.get(feature, feature) in self.feature_names_
  }
- self.__prepare_feature_importances(trace_id, x_columns, new_shaps)
+ self.__prepare_feature_importances(trace_id, df, new_shaps)

  if self.features_info_display_handle is not None:
  try:
@@ -1681,7 +1684,6 @@
  validated_X,
  validated_y,
  eval_set,
- is_demo_dataset,
  exclude_features_sources,
  trace_id,
  progress_bar,
@@ -1872,158 +1874,147 @@
  validated_X: pd.DataFrame,
  validated_y: pd.Series,
  eval_set: Optional[List[tuple]],
- is_demo_dataset: bool,
  exclude_features_sources: Optional[List[str]],
  trace_id: str,
  progress_bar: Optional[ProgressBar],
  progress_callback: Optional[Callable[[SearchProgress], Any]],
  ) -> _SampledDataForMetrics:
- eval_set_sampled_dict = {}
- if eval_set is not None:
- self.logger.info("Transform with eval_set")
- # concatenate X and eval_set with eval_set_index
- df = validated_X.copy()
- df[TARGET] = validated_y
- df[EVAL_SET_INDEX] = 0
- for idx, eval_pair in enumerate(eval_set):
- eval_x, eval_y = self._validate_eval_set_pair(validated_X, eval_pair)
- eval_df_with_index = eval_x.copy()
- eval_df_with_index[TARGET] = eval_y
- eval_df_with_index[EVAL_SET_INDEX] = idx + 1
- df = pd.concat([df, eval_df_with_index])
-
- df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
-
- # downsample if need to eval_set threshold
- num_samples = _num_samples(df)
- force_downsampling = (
- not self.disable_force_downsampling
- and self.columns_for_online_api is not None
- and num_samples > Dataset.FORCE_SAMPLE_SIZE
- )
- # TODO: check that system_record_id was added before this step
- if force_downsampling:
- self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
- df = balance_undersample_forced(
- df=df,
- target_column=TARGET,
- id_columns=self.id_columns,
- date_column=self._get_date_column(self.search_keys),
- task_type=self.model_task_type,
- cv_type=self.cv,
- random_state=self.random_state,
- sample_size=Dataset.FORCE_SAMPLE_SIZE,
- logger=self.logger,
- bundle=self.bundle,
- warning_callback=self.__log_warning,
- )
- elif num_samples > Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD:
- self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS}")
- df = df.sample(n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state)
+ has_eval_set = eval_set is not None

- eval_set_sampled_dict = {}
+ self.logger.info(f"Transform {'with' if has_eval_set else 'without'} eval_set")

- tmp_target_name = "__target"
- df = df.rename(columns={TARGET: tmp_target_name})
+ # Prepare
+ df = self.__combine_train_and_eval_sets(validated_X, validated_y, eval_set)
+ df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
+ df = self.__downsample_for_metrics(df)

- enriched_df, columns_renaming, generated_features = self.__inner_transform(
- trace_id,
- df,
- exclude_features_sources=exclude_features_sources,
- silent_mode=True,
- metrics_calculation=True,
- progress_bar=progress_bar,
- progress_callback=progress_callback,
- add_fit_system_record_id=True,
- target_name=tmp_target_name,
- )
- if enriched_df is None:
- return None
+ # Transform

- enriched_df = enriched_df.rename(columns={tmp_target_name: TARGET})
+ enriched_df, _, _ = self.__inner_transform(
+ trace_id,
+ X=df.drop(columns=[TARGET]),
+ y=df[TARGET],
+ exclude_features_sources=exclude_features_sources,
+ silent_mode=True,
+ metrics_calculation=True,
+ progress_bar=progress_bar,
+ progress_callback=progress_callback,
+ add_fit_system_record_id=True,
+ )
+ if enriched_df is None:
+ return None

- x_columns = [
- c
- for c in (validated_X.columns.tolist() + generated_features + [SYSTEM_RECORD_ID])
- if c in enriched_df.columns
- ]
+ x_columns = [
+ c
+ for c in (validated_X.columns.tolist() + self.fit_generated_features + [SYSTEM_RECORD_ID])
+ if c in enriched_df.columns
+ ]

- enriched_Xy = enriched_df.query(f"{EVAL_SET_INDEX} == 0")
- X_sampled = enriched_Xy[x_columns].copy()
- y_sampled = enriched_Xy[TARGET].copy()
- enriched_X = enriched_Xy.drop(columns=[TARGET, EVAL_SET_INDEX])
- enriched_X_columns = enriched_X.columns.tolist()
+ X_sampled, y_sampled, enriched_X = self.__extract_train_data(enriched_df, x_columns)
+ eval_set_sampled_dict = self.__extract_eval_data(
+ enriched_df, x_columns, enriched_X.columns.tolist(), len(eval_set) if has_eval_set else 0
+ )

- for idx in range(len(eval_set)):
- enriched_eval_xy = enriched_df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
- eval_x_sampled = enriched_eval_xy[x_columns].copy()
- eval_y_sampled = enriched_eval_xy[TARGET].copy()
- enriched_eval_x = enriched_eval_xy[enriched_X_columns].copy()
- eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
- else:
- self.logger.info("Transform without eval_set")
- df = validated_X.copy()
+ # Cache and return results
+ return self.__cache_and_return_results(
+ validated_X, validated_y, eval_set, X_sampled, y_sampled, enriched_X, eval_set_sampled_dict
+ )

- df[TARGET] = validated_y
+ def __combine_train_and_eval_sets(
+ self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]]
+ ) -> pd.DataFrame:
+ df = validated_X.copy()
+ df[TARGET] = validated_y
+ if eval_set is None:
+ return df

- df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
+ df[EVAL_SET_INDEX] = 0

- num_samples = _num_samples(df)
- force_downsampling = (
- not self.disable_force_downsampling
- and self.columns_for_online_api is not None
- and num_samples > Dataset.FORCE_SAMPLE_SIZE
- )
+ for idx, eval_pair in enumerate(eval_set):
+ eval_x, eval_y = self._validate_eval_set_pair(validated_X, eval_pair)
+ eval_df_with_index = eval_x.copy()
+ eval_df_with_index[TARGET] = eval_y
+ eval_df_with_index[EVAL_SET_INDEX] = idx + 1
+ df = pd.concat([df, eval_df_with_index])

- if force_downsampling:
- self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
- df = balance_undersample_forced(
- df=df,
- target_column=TARGET,
- id_columns=self.id_columns,
- date_column=self._get_date_column(self.search_keys),
- task_type=self.model_task_type,
- cv_type=self.cv,
- random_state=self.random_state,
- sample_size=Dataset.FORCE_SAMPLE_SIZE,
- logger=self.logger,
- bundle=self.bundle,
- warning_callback=self.__log_warning,
- )
- elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
- self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
- df = df.sample(n=Dataset.FIT_SAMPLE_ROWS, random_state=self.random_state)
+ return df

- tmp_target_name = "__target"
- df = df.rename(columns={TARGET: tmp_target_name})
+ def __downsample_for_metrics(self, df: pd.DataFrame) -> pd.DataFrame:
+ num_samples = _num_samples(df)
+ force_downsampling = (
+ not self.disable_force_downsampling
+ and self.columns_for_online_api is not None
+ and num_samples > Dataset.FORCE_SAMPLE_SIZE
+ )

- enriched_Xy, columns_renaming, generated_features = self.__inner_transform(
- trace_id,
- df,
- exclude_features_sources=exclude_features_sources,
- silent_mode=True,
- metrics_calculation=True,
- progress_bar=progress_bar,
- progress_callback=progress_callback,
- add_fit_system_record_id=True,
- target_name=tmp_target_name,
+ if force_downsampling:
+ self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
+ return balance_undersample_forced(
+ df=df,
+ target_column=TARGET,
+ id_columns=self.id_columns,
+ date_column=self._get_date_column(self.search_keys),
+ task_type=self.model_task_type,
+ cv_type=self.cv,
+ random_state=self.random_state,
+ sample_size=Dataset.FORCE_SAMPLE_SIZE,
+ logger=self.logger,
+ bundle=self.bundle,
+ warning_callback=self.__log_warning,
  )
- if enriched_Xy is None:
- return None
+ elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
+ if EVAL_SET_INDEX in df.columns:
+ threshold = Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD
+ sample_size = Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS
+ else:
+ threshold = Dataset.FIT_SAMPLE_THRESHOLD
+ sample_size = Dataset.FIT_SAMPLE_ROWS

- enriched_Xy = enriched_Xy.rename(columns={tmp_target_name: TARGET})
+ if num_samples > threshold:
+ self.logger.info(f"Downsampling from {num_samples} to {sample_size}")
+ return df.sample(n=sample_size, random_state=self.random_state)

- x_columns = [
- c
- for c in (validated_X.columns.tolist() + generated_features + [SYSTEM_RECORD_ID])
- if c in enriched_Xy.columns
- ]
+ return df

- X_sampled = enriched_Xy[x_columns].copy()
- y_sampled = enriched_Xy[TARGET].copy()
- enriched_X = enriched_Xy.drop(columns=TARGET)
+ def __extract_train_data(
+ self, enriched_df: pd.DataFrame, x_columns: List[str]
+ ) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
+ if EVAL_SET_INDEX in enriched_df.columns:
+ enriched_Xy = enriched_df.query(f"{EVAL_SET_INDEX} == 0")
+ else:
+ enriched_Xy = enriched_df
+ X_sampled = enriched_Xy[x_columns].copy()
+ y_sampled = enriched_Xy[TARGET].copy()
+ enriched_X = enriched_Xy.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
+ return X_sampled, y_sampled, enriched_X
+
+ def __extract_eval_data(
+ self, enriched_df: pd.DataFrame, x_columns: List[str], enriched_X_columns: List[str], eval_set_len: int
+ ) -> Dict[int, Tuple]:
+ eval_set_sampled_dict = {}
+
+ for idx in range(eval_set_len):
+ enriched_eval_xy = enriched_df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
+ eval_x_sampled = enriched_eval_xy[x_columns].copy()
+ eval_y_sampled = enriched_eval_xy[TARGET].copy()
+ enriched_eval_x = enriched_eval_xy[enriched_X_columns].copy()
+ eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
+
+ return eval_set_sampled_dict

+ def __cache_and_return_results(
+ self,
+ validated_X: pd.DataFrame,
+ validated_y: pd.Series,
+ eval_set: Optional[List[tuple]],
+ X_sampled: pd.DataFrame,
+ y_sampled: pd.Series,
+ enriched_X: pd.DataFrame,
+ eval_set_sampled_dict: Dict[int, Tuple],
+ ) -> _SampledDataForMetrics:
  datasets_hash = hash_input(validated_X, validated_y, eval_set)
+ columns_renaming = getattr(self, "fit_columns_renaming", {})
+
  self.__cached_sampled_datasets[datasets_hash] = (
  X_sampled,
  y_sampled,
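The two former copies of the downsampling logic are funnelled through the new `__downsample_for_metrics` helper, which picks the eval-set-aware threshold by checking for the `EVAL_SET_INDEX` column. A condensed standalone sketch of that selection (the constants are hypothetical stand-ins for the real `Dataset.*` values, and the force-downsampling branch is omitted):

```python
import pandas as pd

# Hypothetical stand-ins for the Dataset.* constants and the marker column
EVAL_SET_INDEX = "eval_set_index"
FIT_SAMPLE_THRESHOLD, FIT_SAMPLE_ROWS = 200_000, 100_000
FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD, FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000, 100_000


def downsample_for_metrics(df: pd.DataFrame, random_state: int = 42) -> pd.DataFrame:
    num_samples = len(df)
    if num_samples > FIT_SAMPLE_THRESHOLD:
        # Single code path: the threshold pair depends on whether an eval set
        # was concatenated into the frame (EVAL_SET_INDEX present).
        if EVAL_SET_INDEX in df.columns:
            threshold, sample_size = FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD, FIT_SAMPLE_WITH_EVAL_SET_ROWS
        else:
            threshold, sample_size = FIT_SAMPLE_THRESHOLD, FIT_SAMPLE_ROWS
        if num_samples > threshold:
            return df.sample(n=sample_size, random_state=random_state)
    return df
```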
@@ -2160,6 +2151,7 @@ if response.status_code == 200:
  trace_id: str,
  X: pd.DataFrame,
  *,
+ y: Optional[pd.Series] = None,
  exclude_features_sources: Optional[List[str]] = None,
  importance_threshold: Optional[float] = None,
  max_features: Optional[int] = None,
@@ -2178,8 +2170,14 @@ if response.status_code == 200:
  self.logger.info("Start transform")

  validated_X = self._validate_X(X, is_transform=True)
+ if y is not None:
+ validated_y = self._validate_y(validated_X, y)
+ df = self.__combine_train_and_eval_sets(validated_X, validated_y, eval_set=None)
+ else:
+ validated_y = None
+ df = validated_X

- self.__log_debug_information(validated_X, exclude_features_sources=exclude_features_sources)
+ self.__log_debug_information(validated_X, validated_y, exclude_features_sources=exclude_features_sources)

  self.__validate_search_keys(self.search_keys, self.search_id)

@@ -2222,29 +2220,27 @@ if response.status_code == 200:
  self.logger.info(msg)
  print(msg)

- is_demo_dataset = hash_input(validated_X) in DEMO_DATASET_HASHES
+ is_demo_dataset = hash_input(df) in DEMO_DATASET_HASHES

  columns_to_drop = [
- c for c in validated_X.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
+ c for c in df.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
  ]
  if len(columns_to_drop) > 0:
  msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
  self.logger.warning(msg)
  print(msg)
- validated_X = validated_X.drop(columns=columns_to_drop)
+ df = df.drop(columns=columns_to_drop)

  search_keys = self.search_keys.copy()
  if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
- self.search_keys.update(
+ search_keys.update(
  {col: SearchKey.CUSTOM_KEY for col in self.id_columns if col not in self.search_keys}
  )

  search_keys = self.__prepare_search_keys(
- validated_X, search_keys, is_demo_dataset, is_transform=True, silent_mode=silent_mode
+ df, search_keys, is_demo_dataset, is_transform=True, silent_mode=silent_mode
  )

- df = validated_X.copy()
-
  df = self.__handle_index_search_keys(df, search_keys)

  if DEFAULT_INDEX in df.columns:
@@ -2283,8 +2279,11 @@ if response.status_code == 200:
  features_for_transform = self._search_task.get_features_for_transform() or []
  if len(features_for_transform) > 0:
  missing_features_for_transform = [
- columns_renaming.get(f) for f in features_for_transform if f not in df.columns
+ columns_renaming.get(f) or f for f in features_for_transform if f not in df.columns
  ]
+ if TARGET in missing_features_for_transform:
+ raise ValidationError(self.bundle.get("missing_target_for_transform"))
+
  if len(missing_features_for_transform) > 0:
  raise ValidationError(
  self.bundle.get("missing_features_for_transform").format(missing_features_for_transform)
@@ -2340,11 +2339,10 @@ if response.status_code == 200:
  converter = PostalCodeSearchKeyConverter(postal_code)
  df = converter.convert(df)

- # generated_features = [f for f in generated_features if f in self.fit_generated_features]
+ meaning_types = {}
+ meaning_types.update({col: FileColumnMeaningType.FEATURE for col in features_for_transform})
+ meaning_types.update({col: key.value for col, key in search_keys.items()})

- meaning_types = {col: key.value for col, key in search_keys.items()}
- for col in features_for_transform:
- meaning_types[col] = FileColumnMeaningType.FEATURE
  features_not_to_pass = [
  c
  for c in df.columns
@@ -2353,13 +2351,12 @@ if response.status_code == 200:
  and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
  ]

- if add_fit_system_record_id and target_name is not None:
- reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
+ if add_fit_system_record_id:
  df = self.__add_fit_system_record_id(
  df,
  search_keys,
  SYSTEM_RECORD_ID,
- reversed_columns_renaming.get(target_name, target_name),
+ TARGET,
  columns_renaming,
  silent=True,
  )
@@ -3021,7 +3018,7 @@ if response.status_code == 200:
  msg = self.bundle.get("features_not_generated").format(unused_features_for_generation)
  self.__log_warning(msg)

- self.__prepare_feature_importances(trace_id, validated_X.columns.to_list() + self.fit_generated_features)
+ self.__prepare_feature_importances(trace_id, df)

  self.__show_selected_features(self.fit_search_keys)

@@ -3796,7 +3793,7 @@ if response.status_code == 200:
  return result_train, result_eval_sets

  def __prepare_feature_importances(
- self, trace_id: str, x_columns: List[str], updated_shaps: Optional[Dict[str, float]] = None, silent=False
+ self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
  ):
  if self._search_task is None:
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
@@ -3807,6 +3804,8 @@ if response.status_code == 200:
  original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
  features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)

+ df = df.rename(columns=original_names_dict)
+
  self.feature_names_ = []
  self.dropped_client_feature_names_ = []
  self.feature_importances_ = []
@@ -3825,7 +3824,7 @@ if response.status_code == 200:
  if feature_meta.name in original_names_dict.keys():
  feature_meta.name = original_names_dict[feature_meta.name]

- is_client_feature = feature_meta.name in x_columns
+ is_client_feature = feature_meta.name in df.columns

  # TODO make a decision about selected features based on special flag from mlb
  if original_shaps.get(feature_meta.name, 0.0) == 0.0:
@@ -3845,7 +3844,7 @@ if response.status_code == 200:
  self.feature_names_.append(feature_meta.name)
  self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))

- df_for_sample = features_df if feature_meta.name in features_df.columns else self.X
+ df_for_sample = features_df if feature_meta.name in features_df.columns else df
  feature_info = FeatureInfo.from_metadata(feature_meta, df_for_sample, is_client_feature)
  features_info.append(feature_info.to_row(self.bundle))
  features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
upgini/resource_bundle/strings.properties CHANGED
@@ -136,6 +136,7 @@ x_and_eval_x_diff_types=X and eval_set X has different types: {} and {}
  baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
  baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and and retry
  missing_features_for_transform=Missing some features for transform that were presented on fit: {}
+ missing_target_for_transform=Search contains features on target. Please add y to the call and try again
  missing_id_column=Id column {} not found in X
  # target validation
  empty_target=Target is empty in all rows
upgini/search_task.py CHANGED
@@ -168,7 +168,13 @@ class SearchTask:
  for meta in self.provider_metadata_v2:
  if meta.features_used_for_embeddings is not None:
  features_for_transform.update(meta.features_used_for_embeddings)
-
+ if meta.generated_features:
+ features_for_transform.update(
+ c.original_name
+ for f in meta.generated_features
+ for c in f.base_columns
+ if c.ads_definition_id is None
+ )

  return list(features_for_transform)
  def get_shuffle_kfold(self) -> Optional[bool]:
upgini/utils/feature_info.py CHANGED
@@ -88,8 +88,11 @@ class FeatureInfo:


  def _get_feature_sample(feature_meta: FeaturesMetadataV2, data: Optional[pd.DataFrame]) -> str:
- if data is not None and feature_meta.name in data.columns:
- feature_sample = np.random.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
+ if data is not None and len(data) > 0 and feature_meta.name in data.columns:
+ if len(data) > 3:
+ feature_sample = np.random.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
+ else:
+ feature_sample = data[feature_meta.name].dropna().unique().tolist()
  if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
  feature_sample = [round(f, 4) for f in feature_sample]
  feature_sample = [str(f) for f in feature_sample]
@@ -123,7 +126,11 @@ def _get_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) ->


  def _get_internal_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
- return "" if is_client_feature else (feature_meta.data_provider or "Upgini")
+ providers = _list_or_single(feature_meta.data_providers, feature_meta.data_provider)
+ if providers:
+ return ", ".join(providers)
+ else:
+ return "" if is_client_feature else (feature_meta.data_provider or "Upgini")


  def _get_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
@@ -137,13 +144,17 @@ def _get_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> st


  def _get_internal_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
- return feature_meta.data_source or (
- LLM_SOURCE
- if not feature_meta.name.endswith("_country")
- and not feature_meta.name.endswith("_postal_code")
- and not is_client_feature
- else ""
- )
+ sources = _list_or_single(feature_meta.data_sources, feature_meta.data_source)
+ if sources:
+ return ", ".join(sources)
+ else:
+ return feature_meta.data_source or (
+ LLM_SOURCE
+ if not feature_meta.name.endswith("_country")
+ and not feature_meta.name.endswith("_postal_code")
+ and not is_client_feature
+ else ""
+ )


  def _list_or_single(lst: List[str], single: str):
@@ -161,7 +172,7 @@ def _to_anchor(link: str, value: str) -> str:
  return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"


- def _make_links(names: List[str], links: List[str]):
+ def _make_links(names: List[str], links: List[str]) -> str:
  all_links = [_to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
  return ",".join(all_links)
upgini/utils/mstats.py CHANGED
@@ -118,7 +118,7 @@ def spearmanr(
  # - dof: degrees of freedom
  # - t_stat: t-statistic
  # - alternative: 'two-sided', 'greater', 'less'
- def compute_t_pvalue(t_stat, dof, alternative='two-sided'):
+ def compute_t_pvalue(t_stat, dof, alternative="two-sided"):
  from scipy.stats import t

  if alternative == "two-sided":
upgini/utils/sort.py CHANGED
@@ -49,7 +49,7 @@ def sort_columns(
  target = target_column if isinstance(target_column, pd.Series) else df[target_column]
  target = prepare_target(target, model_task_type)
  sort_dict = get_sort_columns_dict(
- df[sorted_keys + other_columns], target, sorted_keys, omit_nan=True, sort_all_columns=sort_all_columns
+ df[sorted_keys + other_columns], target, sorted_keys, sort_all_columns=sort_all_columns
  )
  other_columns = [c for c in other_columns if c in sort_dict]
  columns_for_sort = sorted_keys + sorted(other_columns, key=lambda e: sort_dict[e], reverse=True)
@@ -60,7 +60,6 @@ def get_sort_columns_dict(
  df: pd.DataFrame,
  target: pd.Series,
  sorted_keys: List[str],
- omit_nan: bool,
  n_jobs: Optional[int] = None,
  sort_all_columns: bool = False,
  ) -> Dict[str, Any]:
@@ -78,6 +77,13 @@ def get_sort_columns_dict(
  return {}

  df = df[columns_for_sort]
+ df_with_target = pd.concat([df, target], axis=1)
+ # Drop rows where target is NaN
+ df_with_target = df_with_target.loc[~target.isna()]
+ df = df_with_target.iloc[:, :-1]
+ target = df_with_target.iloc[:, -1]
+ df = df.fillna(df.mean())
+ omit_nan = False
  hashes = [hash_series(df[col]) for col in columns_for_sort]
  df = np.asarray(df, dtype=np.float32)
  correlations = get_sort_columns_correlations(df, target, omit_nan, n_jobs)
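`get_sort_columns_dict` no longer takes `omit_nan`; instead it drops rows with a missing target and mean-imputes the remaining feature NaNs before the correlations are computed. A tiny standalone sketch of that preprocessing (illustrative data only):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, np.nan, 3.0, 4.0], "b": [10.0, 20.0, 30.0, 40.0]})
target = pd.Series([0.0, 1.0, np.nan, 1.0])

mask = ~target.isna()        # drop rows where the target is NaN
df, target = df.loc[mask], target.loc[mask]
df = df.fillna(df.mean())    # mean-impute the remaining feature NaNs

print(df.to_string())        # NaN-free frame, ready for the correlation step
```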
upgini-1.2.68a3818.dev1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: upgini
- Version: 1.2.66a3818.dev1
+ Version: 1.2.68a3818.dev1
  Summary: Intelligent data search & enrichment for Machine Learning
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
  Project-URL: Homepage, https://upgini.com/
upgini-1.2.68a3818.dev1.dist-info/RECORD CHANGED
@@ -1,14 +1,14 @@
- upgini/__about__.py,sha256=NbAl7_TAPRLWAiByFYGbEOi4eRvu1Erxk-b19Z5nTRs,33
+ upgini/__about__.py,sha256=B8ku0HzP4G2N6EyFXdX43ZRi57azPbbOINogoH1dGG4,33
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
  upgini/dataset.py,sha256=OGjpeFHbj3lWiZTOHTpWEoMMDmFY1FlNC44FKktoZvU,34956
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
- upgini/features_enricher.py,sha256=nXGBMC42VPAmqQKXbEqZJFIHiGj6F_G2AwhurA8LuQs,205351
+ upgini/features_enricher.py,sha256=KBTdADF7_Wj3uDROYdevukOk6R8LVQw47gJkH4M1_iQ,204435
  upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
  upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
  upgini/metadata.py,sha256=Jh6YTaS00m_nbaOY_owvlSyn9zgkErkqu8iTr9ZjKI8,12279
  upgini/metrics.py,sha256=t7uOOnlDYvP6E3DLjPMQcFBjyhJfUQY8aUlx7N0Mh-s,35477
- upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
+ upgini/search_task.py,sha256=EuCGp0iCWz2fpuJgN6M47aP_CtIi3Oq9zw78w0mkKiU,17595
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
  upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
  upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
@@ -16,18 +16,19 @@ upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  upgini/autofe/all_operators.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
  upgini/autofe/binary.py,sha256=MnQuFiERpocjCPQUjOljlsq5FE-04GPfwtNjzvfNMyU,7671
- upgini/autofe/date.py,sha256=I07psJerrxOcHao91PdSCk9X6KWu61IBVyFRLjGNgK8,10730
+ upgini/autofe/date.py,sha256=C86F7sPiscUGq2a45UtQA9ADWBWg0kt54mePHHzjbLE,10633
  upgini/autofe/feature.py,sha256=y1x3wijhTVBmloayQAHiscqKU9Ll8kLcGm1PdvS357I,14910
  upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
  upgini/autofe/operator.py,sha256=EOffJw6vKXpEh5yymqb1RFNJPxGxmnHdFRo9dB5SCFo,4969
  upgini/autofe/unary.py,sha256=yVgPvtfnPSOhrii0YgezddmgWPwyOBCR0JutaIkdTTc,4658
+ upgini/autofe/utils.py,sha256=fK1am2_tQj3fL2vDslblye8lmyfWgGIUOX1beYVBz4k,2420
  upgini/autofe/vector.py,sha256=l0KdKg-txlZxDSE4hPPfCtfGQofYbl7oaABPr830sPI,667
  upgini/autofe/timeseries/__init__.py,sha256=PGwwDAMwvkXl3el12tXVEmZUgDUvlmIPlXtROm6bD18,738
  upgini/autofe/timeseries/base.py,sha256=T9Ec8LKJbiwTUGGsd_xhM0U0NUJblqmKchkzUI1sK88,3755
- upgini/autofe/timeseries/cross.py,sha256=Sh5hAXZFWKaFRqf_JGODu9pWO2tmuV5VKyK9eX3i7-I,4931
+ upgini/autofe/timeseries/cross.py,sha256=qdoMGKg0auoYKwu4Vz8V3XDs_6-5j9sE4gcwfAR41Ws,5231
  upgini/autofe/timeseries/delta.py,sha256=h0YhmI1TlPJnjwFpN_GQxLb6r59DQuucnG5tQAXSgjU,3520
  upgini/autofe/timeseries/lag.py,sha256=LfQtg484vuqM0mgY4Wft1swHX_Srq7OKKgZswCXoiXI,1882
- upgini/autofe/timeseries/roll.py,sha256=bNFMDszSYTWvB7EyhHbRY1DJqzSURvHlPAcBebt0y0Y,2878
+ upgini/autofe/timeseries/roll.py,sha256=zADKXU-eYWQnQ5R3am1yEal8uU6Tm0jLAixwPb_aCHg,2794
  upgini/autofe/timeseries/trend.py,sha256=9p2Q5ByAi6cx9RH9teBTe8FyjSzqthznC2Lo5dsJ0ho,2051
  upgini/autofe/timeseries/volatility.py,sha256=9shUmIKjpWTHVYjj80YBsk0XheBJ9uBuLv5NW9Mchnk,7953
  upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -38,7 +39,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
  upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
- upgini/resource_bundle/strings.properties,sha256=3zctRNQDJ1STTvLUfryBT72wYeHYnrllV4rG1C3HtfI,27542
+ upgini/resource_bundle/strings.properties,sha256=LDT-jtYlrD1IXvWjFSf-dtvapje0qSrqI9W3v7y2zVo,27646
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -56,21 +57,21 @@ upgini/utils/deduplicate_utils.py,sha256=SMZx9IKIhWI5HqXepfKiQb3uDJrogQZtG6jcWuM
  upgini/utils/display_utils.py,sha256=DsBjJ8jEYAh8BPgfAbzq5imoGFV6IACP20PQ78BQCX0,11964
  upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
  upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
- upgini/utils/feature_info.py,sha256=0rOXSyCj-sw-8migWP0ge8qrOzGU50dQvH0JUJUrDfQ,6766
+ upgini/utils/feature_info.py,sha256=m1tQcT3hTChPAiXzpk0WQcEqElj8KgeCifEJFa7-gss,7247
  upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
  upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
  upgini/utils/ip_utils.py,sha256=TSQ_qDsLlVnm09X1HacpabEf_HNqSWpxBF4Sdc2xs08,6580
- upgini/utils/mstats.py,sha256=dLJQr5Ak5BAoV-pDPpnfvMURZVkZ3_v250QzAsSlqY4,6286
+ upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
  upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
  upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
  upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
- upgini/utils/sort.py,sha256=GfWfCIbfK7e7BvSPZZNJD-PEtiN19DnTCEQkeefHHxI,6491
+ upgini/utils/sort.py,sha256=VDXgZObIVAuGzXlAEejlKCNQcHmN5pN2bMou58sDKFI,6729
  upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,16579
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
- upgini-1.2.66a3818.dev1.dist-info/METADATA,sha256=RlvM_n0dDfEJ6-4PCEiyh7bXHCDZjjdTOOP7uGjQd-M,49123
- upgini-1.2.66a3818.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
- upgini-1.2.66a3818.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
- upgini-1.2.66a3818.dev1.dist-info/RECORD,,
+ upgini-1.2.68a3818.dev1.dist-info/METADATA,sha256=b70LVYxQjLh3v0j-pbeT-PWuf065TUhpgQxt_prM2Oo,49123
+ upgini-1.2.68a3818.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
+ upgini-1.2.68a3818.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+ upgini-1.2.68a3818.dev1.dist-info/RECORD,,