upgini-1.2.91a3884.dev4-py3-none-any.whl → upgini-1.2.91a3906.dev1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -30,7 +30,6 @@ from scipy.stats import ks_2samp
 from sklearn.base import TransformerMixin
 from sklearn.exceptions import NotFittedError
 from sklearn.model_selection import BaseCrossValidator, TimeSeriesSplit
-from sklearn.preprocessing import OrdinalEncoder
 
 from upgini.autofe.feature import Feature
 from upgini.autofe.timeseries import TimeSeriesBase
@@ -119,9 +118,9 @@ except Exception:
         CustomFallbackProgressBar as ProgressBar,
     )
 
-from upgini.utils.sample_utils import SampleColumns, SampleConfig, _num_samples, sample
 from upgini.utils.sort import sort_columns
 from upgini.utils.target_utils import (
+    balance_undersample_forced,
     calculate_psi,
     define_task,
 )
@@ -243,7 +242,6 @@ class FeaturesEnricher(TransformerMixin):
         disable_force_downsampling: bool = False,
         id_columns: Optional[List[str]] = None,
         generate_search_key_features: bool = True,
-        sample_config: Optional[SampleConfig] = None,
         **kwargs,
     ):
         self.bundle = get_custom_bundle(custom_bundle_config)
@@ -288,7 +286,6 @@ class FeaturesEnricher(TransformerMixin):
 
         self.search_keys = search_keys or {}
         self.id_columns = id_columns
-        self.id_columns_encoder = None
         self.country_code = country_code
         self.__validate_search_keys(search_keys, search_id)
 
@@ -362,8 +359,10 @@ class FeaturesEnricher(TransformerMixin):
         self.columns_for_online_api = columns_for_online_api
         if columns_for_online_api is not None:
             self.runtime_parameters.properties["columns_for_online_api"] = ",".join(columns_for_online_api)
-
-        self.sample_config = self._get_sample_config(sample_config)
+        maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
+        if maybe_downsampling_limit is not None:
+            Dataset.FIT_SAMPLE_THRESHOLD = int(maybe_downsampling_limit)
+            Dataset.FIT_SAMPLE_ROWS = int(maybe_downsampling_limit)
 
         self.raise_validation_error = raise_validation_error
         self.exclude_columns = exclude_columns
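Note: together with the removal of `SampleConfig` (see the import hunk above) and of `_get_sample_config` (next hunk), the new `__init__` branch applies a server-provided `downsampling_limit` by mutating `Dataset` class attributes directly. A minimal sketch of that pattern, with assumed default values rather than the library's actual constants:

```python
# Sketch of the class-attribute override used above. `Dataset` stands in for
# upgini.dataset.Dataset; the default values are illustrative assumptions.
class Dataset:
    FIT_SAMPLE_THRESHOLD = 200_000  # assumed default
    FIT_SAMPLE_ROWS = 100_000       # assumed default


def apply_downsampling_limit(runtime_properties: dict) -> None:
    maybe_limit = runtime_properties.get("downsampling_limit")
    if maybe_limit is not None:
        # Mutating class attributes makes the override global: every enricher
        # instance in the process sees the new limits, not just this one.
        Dataset.FIT_SAMPLE_THRESHOLD = int(maybe_limit)
        Dataset.FIT_SAMPLE_ROWS = int(maybe_limit)


apply_downsampling_limit({"downsampling_limit": "50000"})
assert Dataset.FIT_SAMPLE_ROWS == 50_000
```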
@@ -376,16 +375,6 @@ class FeaturesEnricher(TransformerMixin):
         self.autofe_features_display_handle = None
         self.report_button_handle = None
 
-    def _get_sample_config(self, sample_config: Optional[SampleConfig] = None):
-        sample_config = sample_config or SampleConfig(force_sample_size=Dataset.FORCE_SAMPLE_SIZE)
-
-        maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
-        if maybe_downsampling_limit is not None:
-            sample_config.fit_sample_rows = int(maybe_downsampling_limit)
-            sample_config.fit_sample_threshold = int(maybe_downsampling_limit)
-
-        return sample_config
-
     def _get_api_key(self):
         return self._api_key
 
@@ -939,15 +928,16 @@ class FeaturesEnricher(TransformerMixin):
         ):
             raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
 
-        validated_X, validated_y, validated_eval_set = self._validate_train_eval(
-            effective_X, effective_y, effective_eval_set
+        validated_X = self._validate_X(effective_X)
+        validated_y = self._validate_y(validated_X, effective_y)
+        validated_eval_set = (
+            [self._validate_eval_set_pair(validated_X, eval_pair) for eval_pair in effective_eval_set]
+            if effective_eval_set is not None
+            else None
         )
 
         if self.X is None:
             self.X = X
-            self.id_columns_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1).fit(
-                X[self.id_columns or []]
-            )
         if self.y is None:
             self.y = y
         if self.eval_set is None:
@@ -981,19 +971,6 @@ class FeaturesEnricher(TransformerMixin):
             client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
                 estimator, validated_X, self.search_keys
             )
-            if self.id_columns_encoder is not None:
-                if cat_features_from_backend:
-                    cat_features_from_backend = [
-                        c
-                        for c in cat_features_from_backend
-                        if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
-                    ]
-                if client_cat_features:
-                    client_cat_features = [
-                        c
-                        for c in client_cat_features
-                        if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
-                    ]
             for cat_feature in cat_features_from_backend:
                 original_cat_feature = self.fit_columns_renaming.get(cat_feature)
                 if original_cat_feature in self.search_keys:
@@ -1268,8 +1245,7 @@ class FeaturesEnricher(TransformerMixin):
                     metrics.append(eval_metrics)
 
             if updating_shaps is not None:
-                decoded_X = self._decode_id_columns(fitting_X, columns_renaming)
-                self._update_shap_values(trace_id, decoded_X, updating_shaps, silent=not internal_call)
+                self._update_shap_values(trace_id, fitting_X, updating_shaps, silent=not internal_call)
 
             metrics_df = pd.DataFrame(metrics)
             mean_target_hdr = self.bundle.get("quality_metrics_mean_target_header")
@@ -1523,10 +1499,16 @@ class FeaturesEnricher(TransformerMixin):
     ):
         is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
         is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
+        validated_X = self._validate_X(X)
+        validated_y = self._validate_y(validated_X, y)
         checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
-        validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set)
+        validated_eval_set = (
+            [self._validate_eval_set_pair(validated_X, eval_set_pair) for eval_set_pair in checked_eval_set]
+            if checked_eval_set
+            else None
+        )
 
-        sampled_data = self._get_enriched_for_metrics(
+        sampled_data = self._sample_data_for_metrics(
             trace_id,
             validated_X,
             validated_y,
@@ -1600,11 +1582,7 @@ class FeaturesEnricher(TransformerMixin):
             fitting_enriched_X = fitting_enriched_X.drop(columns=columns_with_high_cardinality, errors="ignore")
 
         # Detect and drop constant columns
-        constant_columns = [
-            c
-            for c in FeaturesValidator.find_constant_features(fitting_X)
-            if self.fit_columns_renaming.get(c, c) not in (self.id_columns or [])
-        ]
+        constant_columns = FeaturesValidator.find_constant_features(fitting_X)
        if len(constant_columns) > 0:
             self.logger.warning(f"Constant columns {constant_columns} will be dropped for metrics calculation")
             fitting_X = fitting_X.drop(columns=constant_columns, errors="ignore")
@@ -1647,7 +1625,6 @@ class FeaturesEnricher(TransformerMixin):
             fitting_X, y_sorted, search_keys, self.model_task_type, sort_all_columns=True, logger=self.logger
         )
         fitting_X = fitting_X[fitting_x_columns]
-        fitting_X, _ = self._encode_id_columns(fitting_X, self.fit_columns_renaming)
         self.logger.info(f"Final sorted list of fitting X columns: {fitting_x_columns}")
         fitting_enriched_x_columns = fitting_enriched_X.columns.to_list()
         fitting_enriched_x_columns = sort_columns(
@@ -1659,7 +1636,6 @@ class FeaturesEnricher(TransformerMixin):
             logger=self.logger,
         )
         fitting_enriched_X = fitting_enriched_X[fitting_enriched_x_columns]
-        fitting_enriched_X, _ = self._encode_id_columns(fitting_enriched_X, self.fit_columns_renaming)
         self.logger.info(f"Final sorted list of fitting enriched X columns: {fitting_enriched_x_columns}")
         for idx, eval_tuple in eval_set_sampled_dict.items():
             eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
@@ -1687,12 +1663,6 @@ class FeaturesEnricher(TransformerMixin):
                     .astype(np.float64)
                 )
 
-            fitting_eval_X, unknown_dict = self._encode_id_columns(fitting_eval_X, self.fit_columns_renaming)
-            fitting_enriched_eval_X, _ = self._encode_id_columns(fitting_enriched_eval_X, self.fit_columns_renaming)
-
-            if len(unknown_dict) > 0:
-                print(self.bundle.get("unknown_id_column_value_in_eval_set").format(unknown_dict))
-
             fitting_eval_set_dict[idx] = (
                 fitting_eval_X,
                 eval_y_sorted,
@@ -1714,7 +1684,7 @@ class FeaturesEnricher(TransformerMixin):
         )
 
     @dataclass
-    class _EnrichedDataForMetrics:
+    class _SampledDataForMetrics:
         X_sampled: pd.DataFrame
         y_sampled: pd.Series
         enriched_X: pd.DataFrame
@@ -1722,7 +1692,7 @@ class FeaturesEnricher(TransformerMixin):
         search_keys: Dict[str, SearchKey]
         columns_renaming: Dict[str, str]
 
-    def _get_enriched_for_metrics(
+    def _sample_data_for_metrics(
         self,
         trace_id: str,
         validated_X: Union[pd.DataFrame, pd.Series, np.ndarray, None],
@@ -1734,7 +1704,7 @@ class FeaturesEnricher(TransformerMixin):
         remove_outliers_calc_metrics: Optional[bool],
         progress_bar: Optional[ProgressBar],
         progress_callback: Optional[Callable[[SearchProgress], Any]],
-    ) -> _EnrichedDataForMetrics:
+    ) -> _SampledDataForMetrics:
         datasets_hash = hash_input(validated_X, validated_y, eval_set)
         cached_sampled_datasets = self.__cached_sampled_datasets.get(datasets_hash)
         if cached_sampled_datasets is not None and is_input_same_as_fit and remove_outliers_calc_metrics is None:
@@ -1742,7 +1712,7 @@ class FeaturesEnricher(TransformerMixin):
             return self.__get_sampled_cached_enriched(datasets_hash, exclude_features_sources)
         elif len(self.feature_importances_) == 0:
             self.logger.info("No external features selected. So use only input datasets for metrics calculation")
-            return self.__get_enriched_as_input(validated_X, validated_y, eval_set, is_demo_dataset)
+            return self.__sample_only_input(validated_X, validated_y, eval_set, is_demo_dataset)
         # TODO save and check if dataset was deduplicated - use imbalance branch for such case
         elif (
             not self.imbalanced
@@ -1751,14 +1721,14 @@ class FeaturesEnricher(TransformerMixin):
             and self.df_with_original_index is not None
         ):
             self.logger.info("Dataset is not imbalanced, so use enriched_X from fit")
-            return self.__get_enriched_from_fit(eval_set, trace_id, remove_outliers_calc_metrics)
+            return self.__sample_balanced(eval_set, trace_id, remove_outliers_calc_metrics)
         else:
             self.logger.info(
                 "Dataset is imbalanced or exclude_features_sources or X was passed or this is saved search."
                 " Run transform"
             )
             print(self.bundle.get("prepare_data_for_metrics"))
-            return self.__get_enriched_from_transform(
+            return self.__sample_imbalanced(
                 validated_X,
                 validated_y,
                 eval_set,
@@ -1770,7 +1740,7 @@ class FeaturesEnricher(TransformerMixin):
 
     def __get_sampled_cached_enriched(
         self, datasets_hash: str, exclude_features_sources: Optional[List[str]]
-    ) -> _EnrichedDataForMetrics:
+    ) -> _SampledDataForMetrics:
         X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = (
             self.__cached_sampled_datasets[datasets_hash]
         )
@@ -1787,9 +1757,9 @@ class FeaturesEnricher(TransformerMixin):
             search_keys,
         )
 
-    def __get_enriched_as_input(
+    def __sample_only_input(
         self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
-    ) -> _EnrichedDataForMetrics:
+    ) -> _SampledDataForMetrics:
         eval_set_sampled_dict = {}
 
         df = validated_X.copy()
@@ -1831,13 +1801,24 @@ class FeaturesEnricher(TransformerMixin):
         normalizer = Normalizer(self.bundle, self.logger)
         df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
         columns_renaming = normalizer.columns_renaming
+        # columns_renaming = {c: c for c in df.columns}
 
         df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
-        df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID, TARGET, columns_renaming, silent=True)
 
+        num_samples = _num_samples(df)
+        sample_threshold, sample_rows = (
+            (Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD, Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS)
+            if eval_set is not None
+            else (Dataset.FIT_SAMPLE_THRESHOLD, Dataset.FIT_SAMPLE_ROWS)
+        )
+
+        df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID, TARGET, columns_renaming, silent=True)
         # Sample after sorting by system_record_id for idempotency
         df.sort_values(by=SYSTEM_RECORD_ID, inplace=True)
-        df = self.__downsample_for_metrics(df)
+
+        if num_samples > sample_threshold:
+            self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
+            df = df.sample(n=sample_rows, random_state=self.random_state)
 
         if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
             df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
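Note: the hunk above samples only after sorting by `SYSTEM_RECORD_ID`, per its own comment about idempotency. A minimal sketch of why that ordering matters (toy data; only the constant name is taken from the diff):

```python
# pd.DataFrame.sample with a fixed random_state is only reproducible when the
# row order is deterministic, so the code sorts before sampling.
import pandas as pd

SYSTEM_RECORD_ID = "system_record_id"

df = pd.DataFrame({SYSTEM_RECORD_ID: [3, 1, 2], "feature": ["c", "a", "b"]})
df = df.sort_values(by=SYSTEM_RECORD_ID)   # deterministic order first
sampled = df.sample(n=2, random_state=42)  # then a reproducible sample
print(sampled)
```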
@@ -1866,12 +1847,12 @@ class FeaturesEnricher(TransformerMixin):
             search_keys,
         )
 
-    def __get_enriched_from_fit(
+    def __sample_balanced(
         self,
         eval_set: Optional[List[tuple]],
         trace_id: str,
         remove_outliers_calc_metrics: Optional[bool],
-    ) -> _EnrichedDataForMetrics:
+    ) -> _SampledDataForMetrics:
         eval_set_sampled_dict = {}
         search_keys = self.fit_search_keys
 
@@ -1970,7 +1951,7 @@ class FeaturesEnricher(TransformerMixin):
             search_keys,
         )
 
-    def __get_enriched_from_transform(
+    def __sample_imbalanced(
         self,
         validated_X: pd.DataFrame,
         validated_y: pd.Series,
@@ -1979,7 +1960,7 @@ class FeaturesEnricher(TransformerMixin):
         trace_id: str,
         progress_bar: Optional[ProgressBar],
         progress_callback: Optional[Callable[[SearchProgress], Any]],
-    ) -> _EnrichedDataForMetrics:
+    ) -> _SampledDataForMetrics:
         has_eval_set = eval_set is not None
 
         self.logger.info(f"Transform {'with' if has_eval_set else 'without'} eval_set")
@@ -2036,58 +2017,61 @@ class FeaturesEnricher(TransformerMixin):
         )
 
     def __combine_train_and_eval_sets(
-        self, X: pd.DataFrame, y: Optional[pd.Series] = None, eval_set: Optional[List[tuple]] = None
+        self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]]
     ) -> pd.DataFrame:
-        df = X.copy()
-        if y is not None:
-            df[TARGET] = y
-        if not eval_set:
+        df = validated_X.copy()
+        df[TARGET] = validated_y
+        if eval_set is None:
             return df
 
         df[EVAL_SET_INDEX] = 0
 
         for idx, eval_pair in enumerate(eval_set):
-            eval_x, eval_y = eval_pair
+            eval_x, eval_y = self._validate_eval_set_pair(validated_X, eval_pair)
             eval_df_with_index = eval_x.copy()
-            if eval_y is not None:
-                eval_df_with_index[TARGET] = eval_y
+            eval_df_with_index[TARGET] = eval_y
             eval_df_with_index[EVAL_SET_INDEX] = idx + 1
             df = pd.concat([df, eval_df_with_index])
 
         return df
 
     def __downsample_for_metrics(self, df: pd.DataFrame) -> pd.DataFrame:
-        force_downsampling = self.__use_force_downsampling(df)
-
-        sample_columns = SampleColumns(
-            ids=self.id_columns,
-            date=self._get_date_column(self.search_keys),
-            target=TARGET,
-            eval_set_index=EVAL_SET_INDEX,
-        )
-
-        return sample(
-            df,
-            self.model_task_type,
-            self.cv,
-            self.sample_config,
-            sample_columns,
-            self.random_state,
-            force_downsampling=force_downsampling,
-            balance=False,
-            logger=self.logger,
-            bundle=self.bundle,
-            warning_callback=self.__log_warning,
-        )
-
-    def __use_force_downsampling(self, df: pd.DataFrame) -> bool:
         num_samples = _num_samples(df)
-        return (
+        force_downsampling = (
             not self.disable_force_downsampling
             and self.columns_for_online_api is not None
            and num_samples > Dataset.FORCE_SAMPLE_SIZE
         )
 
+        if force_downsampling:
+            self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
+            return balance_undersample_forced(
+                df=df,
+                target_column=TARGET,
+                id_columns=self.id_columns,
+                date_column=self._get_date_column(self.search_keys),
+                task_type=self.model_task_type,
+                cv_type=self.cv,
+                random_state=self.random_state,
+                sample_size=Dataset.FORCE_SAMPLE_SIZE,
+                logger=self.logger,
+                bundle=self.bundle,
+                warning_callback=self.__log_warning,
+            )
+        elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
+            if EVAL_SET_INDEX in df.columns:
+                threshold = Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD
+                sample_size = Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS
+            else:
+                threshold = Dataset.FIT_SAMPLE_THRESHOLD
+                sample_size = Dataset.FIT_SAMPLE_ROWS
+
+            if num_samples > threshold:
+                self.logger.info(f"Downsampling from {num_samples} to {sample_size}")
+                return df.sample(n=sample_size, random_state=self.random_state)
+
+        return df
+
     def __extract_train_data(
         self, enriched_df: pd.DataFrame, x_columns: List[str]
     ) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
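Note: the rewritten `__downsample_for_metrics` above now encodes its decision order inline: forced undersampling (for searches with `columns_for_online_api`) wins, otherwise a plain threshold-based sample. A condensed sketch of that flow, with assumed threshold values and a plain `sample` standing in for `balance_undersample_forced` (which additionally respects id/date columns and the CV type):

```python
import pandas as pd

FORCE_SAMPLE_SIZE = 7_000       # per the "Force downsampling to 7000" comment in this diff
FIT_SAMPLE_THRESHOLD = 200_000  # assumed value, for illustration
FIT_SAMPLE_ROWS = 100_000       # assumed value, for illustration


def downsample_for_metrics(df: pd.DataFrame, online_api: bool, random_state: int = 42) -> pd.DataFrame:
    n = len(df)
    if online_api and n > FORCE_SAMPLE_SIZE:
        # The real code delegates to balance_undersample_forced() here.
        return df.sample(n=FORCE_SAMPLE_SIZE, random_state=random_state)
    if n > FIT_SAMPLE_THRESHOLD:
        return df.sample(n=FIT_SAMPLE_ROWS, random_state=random_state)
    return df
```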
@@ -2123,7 +2107,7 @@ class FeaturesEnricher(TransformerMixin):
         eval_set_sampled_dict: Dict[int, Tuple],
         columns_renaming: Dict[str, str],
         search_keys: Dict[str, SearchKey],
-    ) -> _EnrichedDataForMetrics:
+    ) -> _SampledDataForMetrics:
 
         self.__cached_sampled_datasets[datasets_hash] = (
             X_sampled,
@@ -2154,7 +2138,7 @@ class FeaturesEnricher(TransformerMixin):
             for k, v in search_keys.items()
             if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
         }
-        return FeaturesEnricher._EnrichedDataForMetrics(
+        return FeaturesEnricher._SampledDataForMetrics(
            X_sampled=X_sampled,
             y_sampled=y_sampled,
             enriched_X=enriched_X,
@@ -2302,10 +2286,13 @@ if response.status_code == 200:
         with MDC(trace_id=trace_id, search_id=search_id):
             self.logger.info("Start transform")
 
-            validated_X, validated_y, validated_eval_set = self._validate_train_eval(
-                X, y, eval_set=None, is_transform=True
-            )
-            df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
+            validated_X = self._validate_X(X, is_transform=True)
+            if y is not None:
+                validated_y = self._validate_y(validated_X, y)
+                df = self.__combine_train_and_eval_sets(validated_X, validated_y, eval_set=None)
+            else:
+                validated_y = None
+                df = validated_X
 
             validated_Xy = df.copy()
 
@@ -2359,7 +2346,7 @@ if response.status_code == 200:
 
             is_demo_dataset = hash_input(df) in DEMO_DATASET_HASHES
 
-            columns_to_drop = [c for c in df.columns if c in self.feature_names_ and c not in (self.id_columns or [])]
+            columns_to_drop = [c for c in df.columns if c in self.feature_names_]
             if len(columns_to_drop) > 0:
                 msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
                 self.logger.warning(msg)
@@ -2563,7 +2550,6 @@ if response.status_code == 200:
                 id_columns=self.__get_renamed_id_columns(columns_renaming),
                 date_column=self._get_date_column(search_keys),
                 date_format=self.date_format,
-                sample_config=self.sample_config,
                 rest_client=self.rest_client,
                 logger=self.logger,
                 bundle=self.bundle,
@@ -2667,7 +2653,7 @@ if response.status_code == 200:
             selecting_columns = [
                 c
                 for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
-                if c not in self.zero_shap_client_features or c in (self.id_columns or [])
+                if c not in self.zero_shap_client_features
             ]
             selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
             if add_fit_system_record_id:
@@ -2815,8 +2801,13 @@ if response.status_code == 200:
         self.fit_dropped_features = set()
         self.fit_generated_features = []
 
-        validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, eval_set)
-
+        validated_X = self._validate_X(X)
+        validated_y = self._validate_y(validated_X, y)
+        validated_eval_set = (
+            [self._validate_eval_set_pair(validated_X, eval_pair) for eval_pair in eval_set]
+            if eval_set is not None
+            else None
+        )
         is_demo_dataset = hash_input(validated_X, validated_y, validated_eval_set) in DEMO_DATASET_HASHES
         if is_demo_dataset:
             msg = self.bundle.get("demo_dataset_info")
@@ -2861,8 +2852,14 @@ if response.status_code == 200:
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
         )
 
-        df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
-        self.id_columns_encoder = OrdinalEncoder().fit(df[self.id_columns or []])
+        df = pd.concat([validated_X, validated_y], axis=1)
+
+        if validated_eval_set is not None and len(validated_eval_set) > 0:
+            df[EVAL_SET_INDEX] = 0
+            for idx, (eval_X, eval_y) in enumerate(validated_eval_set):
+                eval_df = pd.concat([eval_X, eval_y], axis=1)
+                eval_df[EVAL_SET_INDEX] = idx + 1
+                df = pd.concat([df, eval_df])
 
         self.fit_search_keys = self.search_keys.copy()
         df = self.__handle_index_search_keys(df, self.fit_search_keys)
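Note: the fit path above now builds the combined frame directly instead of calling `__combine_train_and_eval_sets`: train rows get `EVAL_SET_INDEX == 0` and each eval pair gets `1, 2, ...`. A runnable sketch of the resulting layout (toy data; only the column-name constant is taken from the diff):

```python
import pandas as pd

EVAL_SET_INDEX = "eval_set_index"

X = pd.DataFrame({"f": [1, 2]})
y = pd.Series([0, 1], name="target")
eval_set = [(pd.DataFrame({"f": [3]}), pd.Series([1], name="target"))]

df = pd.concat([X, y], axis=1)
df[EVAL_SET_INDEX] = 0
for idx, (eval_X, eval_y) in enumerate(eval_set):
    eval_df = pd.concat([eval_X, eval_y], axis=1)
    eval_df[EVAL_SET_INDEX] = idx + 1
    df = pd.concat([df, eval_df])

print(df)  # train rows carry index 0, the eval pair carries 1
```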
@@ -2973,8 +2970,47 @@ if response.status_code == 200:
         # TODO check maybe need to drop _time column from df_with_original_index
 
         df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys, self.fit_columns_renaming)
-        # Convert EMAIL to HEM etc after unnesting to do it only with one column
-        df = self.__convert_unnestable_keys(df, unnest_search_keys)
+
+        # Convert EMAIL to HEM after unnesting to do it only with one column
+        email_column = self._get_email_column(self.fit_search_keys)
+        hem_column = self._get_hem_column(self.fit_search_keys)
+        if email_column:
+            converter = EmailSearchKeyConverter(
+                email_column,
+                hem_column,
+                self.fit_search_keys,
+                self.fit_columns_renaming,
+                list(unnest_search_keys.keys()),
+                self.bundle,
+                self.logger,
+            )
+            df = converter.convert(df)
+
+        ip_column = self._get_ip_column(self.fit_search_keys)
+        if ip_column:
+            converter = IpSearchKeyConverter(
+                ip_column,
+                self.fit_search_keys,
+                self.fit_columns_renaming,
+                list(unnest_search_keys.keys()),
+                self.bundle,
+                self.logger,
+            )
+            df = converter.convert(df)
+        phone_column = self._get_phone_column(self.fit_search_keys)
+        country_column = self._get_country_column(self.fit_search_keys)
+        if phone_column:
+            converter = PhoneSearchKeyConverter(phone_column, country_column)
+            df = converter.convert(df)
+
+        if country_column:
+            converter = CountrySearchKeyConverter(country_column)
+            df = converter.convert(df)
+
+        postal_code = self._get_postal_column(self.fit_search_keys)
+        if postal_code:
+            converter = PostalCodeSearchKeyConverter(postal_code)
+            df = converter.convert(df)
 
         non_feature_columns = [
             self.TARGET_NAME,
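Note: the block inlined above (previously the `__convert_unnestable_keys` helper, deleted in a later hunk) applies a chain of optional search-key converters that all expose a `convert(df)` method. A minimal sketch of that pattern; the `Protocol` class and the lowercase-email converter are illustrative stand-ins, not upgini's implementations:

```python
from typing import Optional, Protocol

import pandas as pd


class SearchKeyConverter(Protocol):
    def convert(self, df: pd.DataFrame) -> pd.DataFrame: ...


class LowercaseEmailConverter:
    """Toy converter standing in for the real EmailSearchKeyConverter."""

    def __init__(self, email_column: str):
        self.email_column = email_column

    def convert(self, df: pd.DataFrame) -> pd.DataFrame:
        df = df.copy()
        df[self.email_column] = df[self.email_column].str.lower()
        return df


def apply_converters(df: pd.DataFrame, email_column: Optional[str]) -> pd.DataFrame:
    # Mirrors the pattern above: build a converter only when its key is present,
    # then rebind df to the converted result.
    if email_column:
        df = LowercaseEmailConverter(email_column).convert(df)
    return df
```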
@@ -3025,7 +3061,11 @@ if response.status_code == 200:
         runtime_parameters = self._get_copy_of_runtime_parameters()
 
         # Force downsampling to 7000 for API features generation
-        force_downsampling = self.__use_force_downsampling(df)
+        force_downsampling = (
+            not self.disable_force_downsampling
+            and self.columns_for_online_api is not None
+            and len(df) > Dataset.FORCE_SAMPLE_SIZE
+        )
         if force_downsampling:
             runtime_parameters.properties["fast_fit"] = True
 
@@ -3045,7 +3085,6 @@ if response.status_code == 200:
             logger=self.logger,
             bundle=self.bundle,
             warning_callback=self.__log_warning,
-            sample_config=self.sample_config,
         )
         dataset.columns_renaming = self.fit_columns_renaming
 
@@ -3201,49 +3240,6 @@ if response.status_code == 200:
             if not self.warning_counter.has_warnings():
                 self.__display_support_link(self.bundle.get("all_ok_community_invite"))
 
-    def __convert_unnestable_keys(self, df: pd.DataFrame, unnest_search_keys: Dict[str, str]):
-        email_column = self._get_email_column(self.fit_search_keys)
-        hem_column = self._get_hem_column(self.fit_search_keys)
-        if email_column:
-            converter = EmailSearchKeyConverter(
-                email_column,
-                hem_column,
-                self.fit_search_keys,
-                self.fit_columns_renaming,
-                list(unnest_search_keys.keys()),
-                self.bundle,
-                self.logger,
-            )
-            df = converter.convert(df)
-
-        ip_column = self._get_ip_column(self.fit_search_keys)
-        if ip_column:
-            converter = IpSearchKeyConverter(
-                ip_column,
-                self.fit_search_keys,
-                self.fit_columns_renaming,
-                list(unnest_search_keys.keys()),
-                self.bundle,
-                self.logger,
-            )
-            df = converter.convert(df)
-        phone_column = self._get_phone_column(self.fit_search_keys)
-        country_column = self._get_country_column(self.fit_search_keys)
-        if phone_column:
-            converter = PhoneSearchKeyConverter(phone_column, country_column)
-            df = converter.convert(df)
-
-        if country_column:
-            converter = CountrySearchKeyConverter(country_column)
-            df = converter.convert(df)
-
-        postal_code = self._get_postal_column(self.fit_search_keys)
-        if postal_code:
-            converter = PostalCodeSearchKeyConverter(postal_code)
-            df = converter.convert(df)
-
-        return df
-
     def __should_add_date_column(self):
         return self.add_date_if_missing or (self.cv is not None and self.cv.is_time_series())
 
@@ -3286,57 +3282,6 @@ if response.status_code == 200:
         search_keys_with_autodetection = {**self.search_keys, **self.autodetected_search_keys}
         return [c for c, v in search_keys_with_autodetection.items() if v.value.value in keys]
 
-    def _validate_train_eval(
-        self,
-        X: pd.DataFrame,
-        y: Optional[pd.Series] = None,
-        eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]] = None,
-        is_transform: bool = False,
-    ) -> Tuple[pd.DataFrame, pd.Series, Optional[List[Tuple[pd.DataFrame, pd.Series]]]]:
-        validated_X = self._validate_X(X, is_transform)
-        validated_y = self._validate_y(validated_X, y)
-        validated_eval_set = self._validate_eval_set(validated_X, eval_set)
-        return validated_X, validated_y, validated_eval_set
-
-    def _encode_id_columns(
-        self,
-        X: pd.DataFrame,
-        columns_renaming: Optional[Dict[str, str]] = None,
-    ) -> Tuple[pd.DataFrame, Dict[str, List[Any]]]:
-        columns_renaming = columns_renaming or {}
-        unknown_dict = {}
-
-        if self.id_columns and self.id_columns_encoder is not None:
-            inverse_columns_renaming = {v: k for k, v in columns_renaming.items()}
-            renamed_id_columns = [
-                inverse_columns_renaming.get(col, col) for col in self.id_columns_encoder.feature_names_in_
-            ]
-            self.logger.info(f"Convert id columns to int: {renamed_id_columns}")
-            encoded = self.id_columns_encoder.transform(X[renamed_id_columns].rename(columns=columns_renaming))
-            for i, c in enumerate(renamed_id_columns):
-                unknown_values = X[encoded[:, i] == -1][c].unique().tolist()
-                if len(unknown_values) > 0:
-                    unknown_dict[c] = unknown_values
-            X[renamed_id_columns] = encoded
-            X = X.loc[(X[renamed_id_columns] != -1).all(axis=1)]
-
-        if len(unknown_dict) > 0:
-            self.logger.warning(f"Unknown values in id columns: {unknown_dict}")
-
-        return X, unknown_dict
-
-    def _decode_id_columns(self, X: pd.DataFrame, columns_renaming: Dict[str, str]):
-        columns_renaming = columns_renaming or {}
-        if self.id_columns and self.id_columns_encoder is not None:
-            inverse_columns_renaming = {v: k for k, v in columns_renaming.items()}
-            renamed_id_columns = [
-                inverse_columns_renaming.get(col, col) for col in self.id_columns_encoder.feature_names_in_
-            ]
-            decoded = self.id_columns_encoder.inverse_transform(X[renamed_id_columns].rename(columns=columns_renaming))
-            X[renamed_id_columns] = decoded
-
-        return X
-
     def _validate_X(self, X, is_transform=False) -> pd.DataFrame:
         if isinstance(X, pd.DataFrame):
             if isinstance(X.columns, pd.MultiIndex) or isinstance(X.index, pd.MultiIndex):
@@ -3378,9 +3323,7 @@ if response.status_code == 200:
 
         return validated_X
 
-    def _validate_y(self, X: pd.DataFrame, y) -> Optional[pd.Series]:
-        if y is None:
-            return None
+    def _validate_y(self, X: pd.DataFrame, y) -> pd.Series:
         if (
             not isinstance(y, pd.Series)
             and not isinstance(y, pd.DataFrame)
@@ -3427,11 +3370,6 @@ if response.status_code == 200:
 
         return validated_y
 
-    def _validate_eval_set(self, X: pd.DataFrame, eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]]):
-        if eval_set is None:
-            return None
-        return [self._validate_eval_set_pair(X, eval_pair) for eval_pair in eval_set]
-
     def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
         if len(eval_pair) != 2:
             raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
@@ -3512,7 +3450,7 @@ if response.status_code == 200:
             raise ValidationError(self.bundle.get("binary_target_eval_unique_count_not_2").format(eval_y_nunique))
 
         # Check for duplicates between train and eval sets by comparing all values
-        train_eval_intersection = pd.merge(X, validated_eval_X, how="inner")
+        train_eval_intersection = pd.merge(X, validated_eval_X, how='inner')
         if len(train_eval_intersection) > 0:
             raise ValidationError(self.bundle.get("eval_x_has_train_samples"))
 
@@ -4042,7 +3980,7 @@ if response.status_code == 200:
         if features_meta is None:
             raise Exception(self.bundle.get("missing_features_meta"))
 
-        return [f.name for f in features_meta if f.type == "categorical" and f.name not in (self.id_columns or [])]
+        return [f.name for f in features_meta if f.type == "categorical"]
 
     def __prepare_feature_importances(
         self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
@@ -4685,6 +4623,35 @@ if response.status_code == 200:
             self.logger.warning("Failed to dump input files", exc_info=True)
 
 
+def _num_samples(x):
+    """Return number of samples in array-like x."""
+    if x is None:
+        return 0
+    message = "Expected sequence or array-like, got %s" % type(x)
+    if hasattr(x, "fit") and callable(x.fit):
+        # Don't get num_samples from an ensembles length!
+        raise TypeError(message)
+
+    if not hasattr(x, "__len__") and not hasattr(x, "shape"):
+        if hasattr(x, "__array__"):
+            x = np.asarray(x)
+        else:
+            raise TypeError(message)
+
+    if hasattr(x, "shape") and x.shape is not None:
+        if len(x.shape) == 0:
+            raise TypeError("Singleton array %r cannot be considered a valid collection." % x)
+        # Check that shape is returning an integer or default to len
+        # Dask dataframes may not return numeric shape[0] value
+        if isinstance(x.shape[0], numbers.Integral):
+            return x.shape[0]
+
+    try:
+        return len(x)
+    except TypeError as type_error:
+        raise TypeError(message) from type_error
+
+
 def is_frames_equal(first, second, bundle: ResourceBundle) -> bool:
     if (isinstance(first, pd.DataFrame) and isinstance(second, pd.DataFrame)) or (
         isinstance(first, pd.Series) and isinstance(second, pd.Series)
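Note: the `_num_samples` added in the last hunk replaces the one previously imported from `upgini.utils.sample_utils` (removed in the import hunk at the top) and closely mirrors scikit-learn's private `sklearn.utils.validation._num_samples`, with an extra `None -> 0` case. A usage sketch, assuming the function above is in scope along with the module-level `numbers` and `numpy` imports it relies on:

```python
import numpy as np
import pandas as pd

assert _num_samples(None) == 0                         # the upgini-specific addition
assert _num_samples([1, 2, 3]) == 3                    # falls through to len()
assert _num_samples(np.zeros((5, 2))) == 5             # first axis of shape
assert _num_samples(pd.DataFrame({"a": [1, 2]})) == 2  # DataFrame shape[0]
```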