upgini 1.2.90__py3-none-any.whl → 1.2.91a3884.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -30,6 +30,7 @@ from scipy.stats import ks_2samp
 from sklearn.base import TransformerMixin
 from sklearn.exceptions import NotFittedError
 from sklearn.model_selection import BaseCrossValidator, TimeSeriesSplit
+from sklearn.preprocessing import OrdinalEncoder
 
 from upgini.autofe.feature import Feature
 from upgini.autofe.timeseries import TimeSeriesBase
@@ -118,9 +119,9 @@ except Exception:
         CustomFallbackProgressBar as ProgressBar,
     )
 
+from upgini.utils.sample_utils import SampleColumns, SampleConfig, _num_samples, sample
 from upgini.utils.sort import sort_columns
 from upgini.utils.target_utils import (
-    balance_undersample_forced,
     calculate_psi,
     define_task,
 )
@@ -242,6 +243,7 @@ class FeaturesEnricher(TransformerMixin):
         disable_force_downsampling: bool = False,
         id_columns: Optional[List[str]] = None,
         generate_search_key_features: bool = True,
+        sample_config: Optional[SampleConfig] = None,
         **kwargs,
     ):
         self.bundle = get_custom_bundle(custom_bundle_config)
@@ -286,6 +288,7 @@ class FeaturesEnricher(TransformerMixin):
 
         self.search_keys = search_keys or {}
         self.id_columns = id_columns
+        self.id_columns_encoder = None
         self.country_code = country_code
         self.__validate_search_keys(search_keys, search_id)
 
@@ -359,10 +362,8 @@ class FeaturesEnricher(TransformerMixin):
         self.columns_for_online_api = columns_for_online_api
         if columns_for_online_api is not None:
             self.runtime_parameters.properties["columns_for_online_api"] = ",".join(columns_for_online_api)
-        maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
-        if maybe_downsampling_limit is not None:
-            Dataset.FIT_SAMPLE_THRESHOLD = int(maybe_downsampling_limit)
-            Dataset.FIT_SAMPLE_ROWS = int(maybe_downsampling_limit)
+
+        self.sample_config = self._get_sample_config(sample_config)
 
         self.raise_validation_error = raise_validation_error
         self.exclude_columns = exclude_columns
@@ -375,6 +376,16 @@ class FeaturesEnricher(TransformerMixin):
         self.autofe_features_display_handle = None
         self.report_button_handle = None
 
+    def _get_sample_config(self, sample_config: Optional[SampleConfig] = None):
+        sample_config = sample_config or SampleConfig(force_sample_size=Dataset.FORCE_SAMPLE_SIZE)
+
+        maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
+        if maybe_downsampling_limit is not None:
+            sample_config.fit_sample_rows = int(maybe_downsampling_limit)
+            sample_config.fit_sample_threshold = int(maybe_downsampling_limit)
+
+        return sample_config
+
     def _get_api_key(self):
         return self._api_key
 
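Note: the new `sample_config` parameter replaces the previous pattern of mutating `Dataset` class attributes, and a server-provided `downsampling_limit` runtime property still overrides both the threshold and the row count. A hypothetical caller-side sketch, assuming `SampleConfig` is a dataclass accepting these fields as keyword arguments (only `force_sample_size`, `fit_sample_rows`, and `fit_sample_threshold` are visible in this diff):

    from upgini import FeaturesEnricher, SearchKey
    from upgini.utils.sample_utils import SampleConfig

    enricher = FeaturesEnricher(
        search_keys={"order_date": SearchKey.DATE},
        # fit_sample_threshold / fit_sample_rows are the fields this diff assigns;
        # any other fields would be assumptions.
        sample_config=SampleConfig(fit_sample_threshold=100_000, fit_sample_rows=50_000),
    )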
@@ -928,16 +939,15 @@ class FeaturesEnricher(TransformerMixin):
             ):
                 raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
 
-            validated_X = self._validate_X(effective_X)
-            validated_y = self._validate_y(validated_X, effective_y)
-            validated_eval_set = (
-                [self._validate_eval_set_pair(validated_X, eval_pair) for eval_pair in effective_eval_set]
-                if effective_eval_set is not None
-                else None
+            validated_X, validated_y, validated_eval_set = self._validate_train_eval(
+                effective_X, effective_y, effective_eval_set
             )
 
             if self.X is None:
                 self.X = X
+                self.id_columns_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1).fit(
+                    X[self.id_columns or []]
+                )
             if self.y is None:
                 self.y = y
             if self.eval_set is None:
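Note: the encoder is fitted with `handle_unknown="use_encoded_value"` and `unknown_value=-1`, so id values seen at metrics time but absent at fit time encode to -1 instead of raising. A minimal standalone sklearn illustration of that behavior:

    import pandas as pd
    from sklearn.preprocessing import OrdinalEncoder

    train_ids = pd.DataFrame({"store_id": ["a", "b", "c"]})
    enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1).fit(train_ids)

    new_ids = pd.DataFrame({"store_id": ["b", "d"]})
    print(enc.transform(new_ids))  # [[ 1.], [-1.]] -- "d" was never seen at fit time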
@@ -1245,7 +1255,8 @@ class FeaturesEnricher(TransformerMixin):
                 metrics.append(eval_metrics)
 
             if updating_shaps is not None:
-                self._update_shap_values(trace_id, fitting_X, updating_shaps, silent=not internal_call)
+                decoded_X = self._decode_id_columns(fitting_X, columns_renaming)
+                self._update_shap_values(trace_id, decoded_X, updating_shaps, silent=not internal_call)
 
             metrics_df = pd.DataFrame(metrics)
             mean_target_hdr = self.bundle.get("quality_metrics_mean_target_header")
@@ -1499,16 +1510,10 @@ class FeaturesEnricher(TransformerMixin):
         ):
             is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
             is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
-            validated_X = self._validate_X(X)
-            validated_y = self._validate_y(validated_X, y)
             checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
-            validated_eval_set = (
-                [self._validate_eval_set_pair(validated_X, eval_set_pair) for eval_set_pair in checked_eval_set]
-                if checked_eval_set
-                else None
-            )
+            validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set)
 
-            sampled_data = self._sample_data_for_metrics(
+            sampled_data = self._get_enriched_for_metrics(
                 trace_id,
                 validated_X,
                 validated_y,
@@ -1582,7 +1587,11 @@ class FeaturesEnricher(TransformerMixin):
             fitting_enriched_X = fitting_enriched_X.drop(columns=columns_with_high_cardinality, errors="ignore")
 
         # Detect and drop constant columns
-        constant_columns = FeaturesValidator.find_constant_features(fitting_X)
+        constant_columns = [
+            c
+            for c in FeaturesValidator.find_constant_features(fitting_X)
+            if self.fit_columns_renaming.get(c, c) not in (self.id_columns or [])
+        ]
         if len(constant_columns) > 0:
             self.logger.warning(f"Constant columns {constant_columns} will be dropped for metrics calculation")
             fitting_X = fitting_X.drop(columns=constant_columns, errors="ignore")
@@ -1625,6 +1634,7 @@ class FeaturesEnricher(TransformerMixin):
             fitting_X, y_sorted, search_keys, self.model_task_type, sort_all_columns=True, logger=self.logger
         )
         fitting_X = fitting_X[fitting_x_columns]
+        fitting_X, _ = self._encode_id_columns(fitting_X, self.fit_columns_renaming)
         self.logger.info(f"Final sorted list of fitting X columns: {fitting_x_columns}")
         fitting_enriched_x_columns = fitting_enriched_X.columns.to_list()
         fitting_enriched_x_columns = sort_columns(
@@ -1636,6 +1646,7 @@ class FeaturesEnricher(TransformerMixin):
             logger=self.logger,
         )
         fitting_enriched_X = fitting_enriched_X[fitting_enriched_x_columns]
+        fitting_enriched_X, _ = self._encode_id_columns(fitting_enriched_X, self.fit_columns_renaming)
         self.logger.info(f"Final sorted list of fitting enriched X columns: {fitting_enriched_x_columns}")
         for idx, eval_tuple in eval_set_sampled_dict.items():
             eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
@@ -1663,6 +1674,12 @@ class FeaturesEnricher(TransformerMixin):
                 .astype(np.float64)
             )
 
+            fitting_eval_X, unknown_dict = self._encode_id_columns(fitting_eval_X, self.fit_columns_renaming)
+            fitting_enriched_eval_X, _ = self._encode_id_columns(fitting_enriched_eval_X, self.fit_columns_renaming)
+
+            if len(unknown_dict) > 0:
+                print(self.bundle.get("unknown_id_column_value_in_eval_set").format(unknown_dict))
+
             fitting_eval_set_dict[idx] = (
                 fitting_eval_X,
                 eval_y_sorted,
@@ -1684,7 +1701,7 @@ class FeaturesEnricher(TransformerMixin):
         )
 
     @dataclass
-    class _SampledDataForMetrics:
+    class _EnrichedDataForMetrics:
         X_sampled: pd.DataFrame
         y_sampled: pd.Series
         enriched_X: pd.DataFrame
@@ -1692,7 +1709,7 @@ class FeaturesEnricher(TransformerMixin):
         search_keys: Dict[str, SearchKey]
         columns_renaming: Dict[str, str]
 
-    def _sample_data_for_metrics(
+    def _get_enriched_for_metrics(
         self,
         trace_id: str,
         validated_X: Union[pd.DataFrame, pd.Series, np.ndarray, None],
@@ -1704,7 +1721,7 @@ class FeaturesEnricher(TransformerMixin):
         remove_outliers_calc_metrics: Optional[bool],
         progress_bar: Optional[ProgressBar],
         progress_callback: Optional[Callable[[SearchProgress], Any]],
-    ) -> _SampledDataForMetrics:
+    ) -> _EnrichedDataForMetrics:
         datasets_hash = hash_input(validated_X, validated_y, eval_set)
         cached_sampled_datasets = self.__cached_sampled_datasets.get(datasets_hash)
         if cached_sampled_datasets is not None and is_input_same_as_fit and remove_outliers_calc_metrics is None:
@@ -1712,7 +1729,7 @@ class FeaturesEnricher(TransformerMixin):
             return self.__get_sampled_cached_enriched(datasets_hash, exclude_features_sources)
         elif len(self.feature_importances_) == 0:
             self.logger.info("No external features selected. So use only input datasets for metrics calculation")
-            return self.__sample_only_input(validated_X, validated_y, eval_set, is_demo_dataset)
+            return self.__get_enriched_as_input(validated_X, validated_y, eval_set, is_demo_dataset)
         # TODO save and check if dataset was deduplicated - use imbalance branch for such case
         elif (
             not self.imbalanced
@@ -1721,14 +1738,14 @@ class FeaturesEnricher(TransformerMixin):
             and self.df_with_original_index is not None
         ):
             self.logger.info("Dataset is not imbalanced, so use enriched_X from fit")
-            return self.__sample_balanced(eval_set, trace_id, remove_outliers_calc_metrics)
+            return self.__get_enriched_from_fit(eval_set, trace_id, remove_outliers_calc_metrics)
         else:
             self.logger.info(
                 "Dataset is imbalanced or exclude_features_sources or X was passed or this is saved search."
                 " Run transform"
             )
             print(self.bundle.get("prepare_data_for_metrics"))
-            return self.__sample_imbalanced(
+            return self.__get_enriched_from_transform(
                 validated_X,
                 validated_y,
                 eval_set,
@@ -1740,7 +1757,7 @@ class FeaturesEnricher(TransformerMixin):
 
     def __get_sampled_cached_enriched(
         self, datasets_hash: str, exclude_features_sources: Optional[List[str]]
-    ) -> _SampledDataForMetrics:
+    ) -> _EnrichedDataForMetrics:
         X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = (
             self.__cached_sampled_datasets[datasets_hash]
         )
@@ -1757,9 +1774,9 @@ class FeaturesEnricher(TransformerMixin):
             search_keys,
         )
 
-    def __sample_only_input(
+    def __get_enriched_as_input(
         self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
-    ) -> _SampledDataForMetrics:
+    ) -> _EnrichedDataForMetrics:
         eval_set_sampled_dict = {}
 
         df = validated_X.copy()
@@ -1801,24 +1818,13 @@ class FeaturesEnricher(TransformerMixin):
             normalizer = Normalizer(self.bundle, self.logger)
             df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
             columns_renaming = normalizer.columns_renaming
-            # columns_renaming = {c: c for c in df.columns}
 
         df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
-
-        num_samples = _num_samples(df)
-        sample_threshold, sample_rows = (
-            (Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD, Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS)
-            if eval_set is not None
-            else (Dataset.FIT_SAMPLE_THRESHOLD, Dataset.FIT_SAMPLE_ROWS)
-        )
-
         df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID, TARGET, columns_renaming, silent=True)
+
         # Sample after sorting by system_record_id for idempotency
         df.sort_values(by=SYSTEM_RECORD_ID, inplace=True)
-
-        if num_samples > sample_threshold:
-            self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
-            df = df.sample(n=sample_rows, random_state=self.random_state)
+        df = self.__downsample_for_metrics(df)
 
         if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
             df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
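Note: the inline threshold logic is gone; `__downsample_for_metrics` (see the `@@ -2017` hunk below) now runs after the frame is sorted by `SYSTEM_RECORD_ID`, which is what makes the draw idempotent. The property in isolation:

    import pandas as pd

    df = pd.DataFrame({"system_record_id": [3, 1, 2], "x": [30, 10, 20]})

    # Sorting by a stable unique key first fixes row positions, so a fixed
    # random_state selects the same records no matter how the input was ordered.
    sampled = df.sort_values(by="system_record_id").sample(n=2, random_state=42)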
@@ -1847,12 +1853,12 @@ class FeaturesEnricher(TransformerMixin):
             search_keys,
         )
 
-    def __sample_balanced(
+    def __get_enriched_from_fit(
         self,
         eval_set: Optional[List[tuple]],
         trace_id: str,
         remove_outliers_calc_metrics: Optional[bool],
-    ) -> _SampledDataForMetrics:
+    ) -> _EnrichedDataForMetrics:
         eval_set_sampled_dict = {}
         search_keys = self.fit_search_keys
 
@@ -1951,7 +1957,7 @@ class FeaturesEnricher(TransformerMixin):
             search_keys,
         )
 
-    def __sample_imbalanced(
+    def __get_enriched_from_transform(
         self,
         validated_X: pd.DataFrame,
         validated_y: pd.Series,
@@ -1960,7 +1966,7 @@ class FeaturesEnricher(TransformerMixin):
         trace_id: str,
         progress_bar: Optional[ProgressBar],
         progress_callback: Optional[Callable[[SearchProgress], Any]],
-    ) -> _SampledDataForMetrics:
+    ) -> _EnrichedDataForMetrics:
         has_eval_set = eval_set is not None
 
         self.logger.info(f"Transform {'with' if has_eval_set else 'without'} eval_set")
@@ -2017,61 +2023,58 @@ class FeaturesEnricher(TransformerMixin):
         )
 
     def __combine_train_and_eval_sets(
-        self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]]
+        self, X: pd.DataFrame, y: Optional[pd.Series] = None, eval_set: Optional[List[tuple]] = None
     ) -> pd.DataFrame:
-        df = validated_X.copy()
-        df[TARGET] = validated_y
-        if eval_set is None:
+        df = X.copy()
+        if y is not None:
+            df[TARGET] = y
+        if not eval_set:
             return df
 
         df[EVAL_SET_INDEX] = 0
 
         for idx, eval_pair in enumerate(eval_set):
-            eval_x, eval_y = self._validate_eval_set_pair(validated_X, eval_pair)
+            eval_x, eval_y = eval_pair
             eval_df_with_index = eval_x.copy()
-            eval_df_with_index[TARGET] = eval_y
+            if eval_y is not None:
+                eval_df_with_index[TARGET] = eval_y
             eval_df_with_index[EVAL_SET_INDEX] = idx + 1
             df = pd.concat([df, eval_df_with_index])
 
         return df
 
     def __downsample_for_metrics(self, df: pd.DataFrame) -> pd.DataFrame:
+        force_downsampling = self.__use_force_downsampling(df)
+
+        sample_columns = SampleColumns(
+            ids=self.id_columns,
+            date=self._get_date_column(self.search_keys),
+            target=TARGET,
+            eval_set_index=EVAL_SET_INDEX,
+        )
+
+        return sample(
+            df,
+            self.model_task_type,
+            self.cv,
+            self.sample_config,
+            sample_columns,
+            self.random_state,
+            force_downsampling=force_downsampling,
+            balance=False,
+            logger=self.logger,
+            bundle=self.bundle,
+            warning_callback=self.__log_warning,
+        )
+
+    def __use_force_downsampling(self, df: pd.DataFrame) -> bool:
         num_samples = _num_samples(df)
-        force_downsampling = (
+        return (
             not self.disable_force_downsampling
            and self.columns_for_online_api is not None
            and num_samples > Dataset.FORCE_SAMPLE_SIZE
        )
 
-        if force_downsampling:
-            self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
-            return balance_undersample_forced(
-                df=df,
-                target_column=TARGET,
-                id_columns=self.id_columns,
-                date_column=self._get_date_column(self.search_keys),
-                task_type=self.model_task_type,
-                cv_type=self.cv,
-                random_state=self.random_state,
-                sample_size=Dataset.FORCE_SAMPLE_SIZE,
-                logger=self.logger,
-                bundle=self.bundle,
-                warning_callback=self.__log_warning,
-            )
-        elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
-            if EVAL_SET_INDEX in df.columns:
-                threshold = Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD
-                sample_size = Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS
-            else:
-                threshold = Dataset.FIT_SAMPLE_THRESHOLD
-                sample_size = Dataset.FIT_SAMPLE_ROWS
-
-            if num_samples > threshold:
-                self.logger.info(f"Downsampling from {num_samples} to {sample_size}")
-                return df.sample(n=sample_size, random_state=self.random_state)
-
-        return df
-
     def __extract_train_data(
         self, enriched_df: pd.DataFrame, x_columns: List[str]
     ) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
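Note: every metrics sampling path (forced undersampling for online-API columns, threshold downsampling with or without an eval set) now funnels through `upgini.utils.sample_utils.sample`. A sketch of constructing its two config objects, using only the fields visible in this diff; the column values are hypothetical:

    from upgini.utils.sample_utils import SampleColumns, SampleConfig

    config = SampleConfig(force_sample_size=7000)
    config.fit_sample_threshold = 100_000  # fields assigned in _get_sample_config
    config.fit_sample_rows = 50_000

    columns = SampleColumns(
        ids=["user_id"],       # hypothetical id column
        date="purchase_date",  # hypothetical date search key
        target="target",
        eval_set_index="eval_set_index",
    )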
@@ -2107,7 +2110,7 @@ class FeaturesEnricher(TransformerMixin):
         eval_set_sampled_dict: Dict[int, Tuple],
         columns_renaming: Dict[str, str],
         search_keys: Dict[str, SearchKey],
-    ) -> _SampledDataForMetrics:
+    ) -> _EnrichedDataForMetrics:
 
         self.__cached_sampled_datasets[datasets_hash] = (
             X_sampled,
@@ -2138,7 +2141,7 @@ class FeaturesEnricher(TransformerMixin):
             for k, v in search_keys.items()
             if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
         }
-        return FeaturesEnricher._SampledDataForMetrics(
+        return FeaturesEnricher._EnrichedDataForMetrics(
             X_sampled=X_sampled,
             y_sampled=y_sampled,
             enriched_X=enriched_X,
@@ -2286,13 +2289,10 @@ if response.status_code == 200:
         with MDC(trace_id=trace_id, search_id=search_id):
             self.logger.info("Start transform")
 
-            validated_X = self._validate_X(X, is_transform=True)
-            if y is not None:
-                validated_y = self._validate_y(validated_X, y)
-                df = self.__combine_train_and_eval_sets(validated_X, validated_y, eval_set=None)
-            else:
-                validated_y = None
-                df = validated_X
+            validated_X, validated_y, validated_eval_set = self._validate_train_eval(
+                X, y, eval_set=None, is_transform=True
+            )
+            df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
 
             validated_Xy = df.copy()
 
@@ -2346,7 +2346,7 @@ if response.status_code == 200:
 
             is_demo_dataset = hash_input(df) in DEMO_DATASET_HASHES
 
-            columns_to_drop = [c for c in df.columns if c in self.feature_names_]
+            columns_to_drop = [c for c in df.columns if c in self.feature_names_ and c not in (self.id_columns or [])]
             if len(columns_to_drop) > 0:
                 msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
                 self.logger.warning(msg)
@@ -2550,6 +2550,7 @@ if response.status_code == 200:
                 id_columns=self.__get_renamed_id_columns(columns_renaming),
                 date_column=self._get_date_column(search_keys),
                 date_format=self.date_format,
+                sample_config=self.sample_config,
                 rest_client=self.rest_client,
                 logger=self.logger,
                 bundle=self.bundle,
@@ -2653,7 +2654,7 @@ if response.status_code == 200:
             selecting_columns = [
                 c
                 for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
-                if c not in self.zero_shap_client_features
+                if c not in self.zero_shap_client_features or c in (self.id_columns or [])
             ]
             selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
             if add_fit_system_record_id:
@@ -2801,13 +2802,8 @@ if response.status_code == 200:
             self.fit_dropped_features = set()
             self.fit_generated_features = []
 
-            validated_X = self._validate_X(X)
-            validated_y = self._validate_y(validated_X, y)
-            validated_eval_set = (
-                [self._validate_eval_set_pair(validated_X, eval_pair) for eval_pair in eval_set]
-                if eval_set is not None
-                else None
-            )
+            validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, eval_set)
+
             is_demo_dataset = hash_input(validated_X, validated_y, validated_eval_set) in DEMO_DATASET_HASHES
             if is_demo_dataset:
                 msg = self.bundle.get("demo_dataset_info")
@@ -2852,14 +2848,8 @@ if response.status_code == 200:
                 remove_outliers_calc_metrics=remove_outliers_calc_metrics,
             )
 
-            df = pd.concat([validated_X, validated_y], axis=1)
-
-            if validated_eval_set is not None and len(validated_eval_set) > 0:
-                df[EVAL_SET_INDEX] = 0
-                for idx, (eval_X, eval_y) in enumerate(validated_eval_set):
-                    eval_df = pd.concat([eval_X, eval_y], axis=1)
-                    eval_df[EVAL_SET_INDEX] = idx + 1
-                    df = pd.concat([df, eval_df])
+            df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
+            self.id_columns_encoder = OrdinalEncoder().fit(df[self.id_columns or []])
 
             self.fit_search_keys = self.search_keys.copy()
             df = self.__handle_index_search_keys(df, self.fit_search_keys)
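Note: unlike the encoder built in `calculate_metrics` (see the `@@ -928` hunk above), this fit-time encoder keeps sklearn's strict default `handle_unknown="error"`, presumably because the training frame defines the complete id vocabulary. The behavioral difference in isolation:

    import pandas as pd
    from sklearn.preprocessing import OrdinalEncoder

    ids = pd.DataFrame({"store_id": ["a", "b"]})
    strict = OrdinalEncoder().fit(ids)  # default handle_unknown="error"
    lenient = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1).fit(ids)

    lenient.transform(pd.DataFrame({"store_id": ["c"]}))  # [[-1.]]
    # strict.transform(...) on "c" raises ValueError instead.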
@@ -2970,47 +2960,8 @@ if response.status_code == 200:
             # TODO check maybe need to drop _time column from df_with_original_index
 
             df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys, self.fit_columns_renaming)
-
-            # Convert EMAIL to HEM after unnesting to do it only with one column
-            email_column = self._get_email_column(self.fit_search_keys)
-            hem_column = self._get_hem_column(self.fit_search_keys)
-            if email_column:
-                converter = EmailSearchKeyConverter(
-                    email_column,
-                    hem_column,
-                    self.fit_search_keys,
-                    self.fit_columns_renaming,
-                    list(unnest_search_keys.keys()),
-                    self.bundle,
-                    self.logger,
-                )
-                df = converter.convert(df)
-
-            ip_column = self._get_ip_column(self.fit_search_keys)
-            if ip_column:
-                converter = IpSearchKeyConverter(
-                    ip_column,
-                    self.fit_search_keys,
-                    self.fit_columns_renaming,
-                    list(unnest_search_keys.keys()),
-                    self.bundle,
-                    self.logger,
-                )
-                df = converter.convert(df)
-            phone_column = self._get_phone_column(self.fit_search_keys)
-            country_column = self._get_country_column(self.fit_search_keys)
-            if phone_column:
-                converter = PhoneSearchKeyConverter(phone_column, country_column)
-                df = converter.convert(df)
-
-            if country_column:
-                converter = CountrySearchKeyConverter(country_column)
-                df = converter.convert(df)
-
-            postal_code = self._get_postal_column(self.fit_search_keys)
-            if postal_code:
-                converter = PostalCodeSearchKeyConverter(postal_code)
-                df = converter.convert(df)
+            # Convert EMAIL to HEM etc after unnesting to do it only with one column
+            df = self.__convert_unnestable_keys(df, unnest_search_keys)
 
             non_feature_columns = [
                 self.TARGET_NAME,
@@ -3061,11 +3012,7 @@ if response.status_code == 200:
             runtime_parameters = self._get_copy_of_runtime_parameters()
 
             # Force downsampling to 7000 for API features generation
-            force_downsampling = (
-                not self.disable_force_downsampling
-                and self.columns_for_online_api is not None
-                and len(df) > Dataset.FORCE_SAMPLE_SIZE
-            )
+            force_downsampling = self.__use_force_downsampling(df)
             if force_downsampling:
                 runtime_parameters.properties["fast_fit"] = True
 
@@ -3085,6 +3032,7 @@ if response.status_code == 200:
                 logger=self.logger,
                 bundle=self.bundle,
                 warning_callback=self.__log_warning,
+                sample_config=self.sample_config,
             )
             dataset.columns_renaming = self.fit_columns_renaming
 
@@ -3240,6 +3188,49 @@ if response.status_code == 200:
             if not self.warning_counter.has_warnings():
                 self.__display_support_link(self.bundle.get("all_ok_community_invite"))
 
+    def __convert_unnestable_keys(self, df: pd.DataFrame, unnest_search_keys: Dict[str, str]):
+        email_column = self._get_email_column(self.fit_search_keys)
+        hem_column = self._get_hem_column(self.fit_search_keys)
+        if email_column:
+            converter = EmailSearchKeyConverter(
+                email_column,
+                hem_column,
+                self.fit_search_keys,
+                self.fit_columns_renaming,
+                list(unnest_search_keys.keys()),
+                self.bundle,
+                self.logger,
+            )
+            df = converter.convert(df)
+
+        ip_column = self._get_ip_column(self.fit_search_keys)
+        if ip_column:
+            converter = IpSearchKeyConverter(
+                ip_column,
+                self.fit_search_keys,
+                self.fit_columns_renaming,
+                list(unnest_search_keys.keys()),
+                self.bundle,
+                self.logger,
+            )
+            df = converter.convert(df)
+        phone_column = self._get_phone_column(self.fit_search_keys)
+        country_column = self._get_country_column(self.fit_search_keys)
+        if phone_column:
+            converter = PhoneSearchKeyConverter(phone_column, country_column)
+            df = converter.convert(df)
+
+        if country_column:
+            converter = CountrySearchKeyConverter(country_column)
+            df = converter.convert(df)
+
+        postal_code = self._get_postal_column(self.fit_search_keys)
+        if postal_code:
+            converter = PostalCodeSearchKeyConverter(postal_code)
+            df = converter.convert(df)
+
+        return df
+
     def __should_add_date_column(self):
         return self.add_date_if_missing or (self.cv is not None and self.cv.is_time_series())
 
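Note: `__convert_unnestable_keys` is a straight extraction of the converter chain that previously lived inline in fit (see the `@@ -2970` hunk above). Each converter exposes `convert(df) -> df`, so optional steps compose as a flat pipeline. A generic sketch of that pattern, not the actual upgini converters:

    import pandas as pd

    class UpperCaseConverter:
        """Toy converter following the same convert(df) -> df contract."""

        def __init__(self, column: str):
            self.column = column

        def convert(self, df: pd.DataFrame) -> pd.DataFrame:
            df = df.copy()
            df[self.column] = df[self.column].str.upper()
            return df

    df = pd.DataFrame({"country": ["us", "de"]})
    for converter in [UpperCaseConverter("country")]:
        df = converter.convert(df)  # each step returns the frame for the next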
@@ -3282,6 +3273,57 @@ if response.status_code == 200:
         search_keys_with_autodetection = {**self.search_keys, **self.autodetected_search_keys}
         return [c for c, v in search_keys_with_autodetection.items() if v.value.value in keys]
 
+    def _validate_train_eval(
+        self,
+        X: pd.DataFrame,
+        y: Optional[pd.Series] = None,
+        eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]] = None,
+        is_transform: bool = False,
+    ) -> Tuple[pd.DataFrame, pd.Series, Optional[List[Tuple[pd.DataFrame, pd.Series]]]]:
+        validated_X = self._validate_X(X, is_transform)
+        validated_y = self._validate_y(validated_X, y)
+        validated_eval_set = self._validate_eval_set(validated_X, eval_set)
+        return validated_X, validated_y, validated_eval_set
+
+    def _encode_id_columns(
+        self,
+        X: pd.DataFrame,
+        columns_renaming: Optional[Dict[str, str]] = None,
+    ) -> Tuple[pd.DataFrame, Dict[str, List[Any]]]:
+        columns_renaming = columns_renaming or {}
+        unknown_dict = {}
+
+        if self.id_columns and self.id_columns_encoder is not None:
+            inverse_columns_renaming = {v: k for k, v in columns_renaming.items()}
+            renamed_id_columns = [
+                inverse_columns_renaming.get(col, col) for col in self.id_columns_encoder.feature_names_in_
+            ]
+            self.logger.info(f"Convert id columns to int: {renamed_id_columns}")
+            encoded = self.id_columns_encoder.transform(X[renamed_id_columns].rename(columns=columns_renaming))
+            for i, c in enumerate(renamed_id_columns):
+                unknown_values = X[encoded[:, i] == -1][c].unique().tolist()
+                if len(unknown_values) > 0:
+                    unknown_dict[c] = unknown_values
+            X[renamed_id_columns] = encoded
+            X = X.loc[(X[renamed_id_columns] != -1).all(axis=1)]
+
+        if len(unknown_dict) > 0:
+            self.logger.warning(f"Unknown values in id columns: {unknown_dict}")
+
+        return X, unknown_dict
+
+    def _decode_id_columns(self, X: pd.DataFrame, columns_renaming: Dict[str, str]):
+        columns_renaming = columns_renaming or {}
+        if self.id_columns and self.id_columns_encoder is not None:
+            inverse_columns_renaming = {v: k for k, v in columns_renaming.items()}
+            renamed_id_columns = [
+                inverse_columns_renaming.get(col, col) for col in self.id_columns_encoder.feature_names_in_
+            ]
+            decoded = self.id_columns_encoder.inverse_transform(X[renamed_id_columns].rename(columns=columns_renaming))
+            X[renamed_id_columns] = decoded
+
+        return X
+
     def _validate_X(self, X, is_transform=False) -> pd.DataFrame:
         if isinstance(X, pd.DataFrame):
             if isinstance(X.columns, pd.MultiIndex) or isinstance(X.index, pd.MultiIndex):
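Note: `_encode_id_columns` both encodes and filters (rows whose id encodes to -1 are dropped and reported), while `_decode_id_columns` restores the original labels, e.g. before SHAP values are updated. A self-contained round trip mirroring that logic:

    import pandas as pd
    from sklearn.preprocessing import OrdinalEncoder

    train = pd.DataFrame({"store_id": ["a", "b", "c"]})
    enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1).fit(train)

    X = pd.DataFrame({"store_id": ["c", "zzz"], "feature": [1.0, 2.0]})
    X[["store_id"]] = enc.transform(X[["store_id"]])          # "zzz" -> -1
    X = X.loc[(X[["store_id"]] != -1).all(axis=1)]            # drop rows with unseen ids
    X[["store_id"]] = enc.inverse_transform(X[["store_id"]])  # back to "c"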
@@ -3323,7 +3365,9 @@ if response.status_code == 200:
 
         return validated_X
 
-    def _validate_y(self, X: pd.DataFrame, y) -> pd.Series:
+    def _validate_y(self, X: pd.DataFrame, y) -> Optional[pd.Series]:
+        if y is None:
+            return None
         if (
             not isinstance(y, pd.Series)
             and not isinstance(y, pd.DataFrame)
@@ -3370,6 +3414,11 @@ if response.status_code == 200:
 
         return validated_y
 
+    def _validate_eval_set(self, X: pd.DataFrame, eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]]):
+        if eval_set is None:
+            return None
+        return [self._validate_eval_set_pair(X, eval_pair) for eval_pair in eval_set]
+
     def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
         if len(eval_pair) != 2:
             raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
@@ -3450,7 +3499,7 @@ if response.status_code == 200:
             raise ValidationError(self.bundle.get("binary_target_eval_unique_count_not_2").format(eval_y_nunique))
 
         # Check for duplicates between train and eval sets by comparing all values
-        train_eval_intersection = pd.merge(X, validated_eval_X, how='inner')
+        train_eval_intersection = pd.merge(X, validated_eval_X, how="inner")
         if len(train_eval_intersection) > 0:
             raise ValidationError(self.bundle.get("eval_x_has_train_samples"))
 
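Note: `pd.merge` with `how="inner"` and no explicit keys joins on all shared columns, so the result is exactly the set of full rows common to both frames; any hit means eval rows leaked from the train set. In isolation:

    import pandas as pd

    train = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
    eval_x = pd.DataFrame({"a": [2, 3], "b": ["y", "z"]})

    leaked = pd.merge(train, eval_x, how="inner")  # joins on both "a" and "b"
    assert len(leaked) == 1                        # only the (2, "y") row appears in both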
@@ -3980,7 +4029,7 @@ if response.status_code == 200:
         if features_meta is None:
             raise Exception(self.bundle.get("missing_features_meta"))
 
-        return [f.name for f in features_meta if f.type == "categorical"]
+        return [f.name for f in features_meta if f.type == "categorical" and f.name not in (self.id_columns or [])]
 
     def __prepare_feature_importances(
         self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
@@ -4623,35 +4672,6 @@ if response.status_code == 200:
             self.logger.warning("Failed to dump input files", exc_info=True)
 
 
-def _num_samples(x):
-    """Return number of samples in array-like x."""
-    if x is None:
-        return 0
-    message = "Expected sequence or array-like, got %s" % type(x)
-    if hasattr(x, "fit") and callable(x.fit):
-        # Don't get num_samples from an ensembles length!
-        raise TypeError(message)
-
-    if not hasattr(x, "__len__") and not hasattr(x, "shape"):
-        if hasattr(x, "__array__"):
-            x = np.asarray(x)
-        else:
-            raise TypeError(message)
-
-    if hasattr(x, "shape") and x.shape is not None:
-        if len(x.shape) == 0:
-            raise TypeError("Singleton array %r cannot be considered a valid collection." % x)
-        # Check that shape is returning an integer or default to len
-        # Dask dataframes may not return numeric shape[0] value
-        if isinstance(x.shape[0], numbers.Integral):
-            return x.shape[0]
-
-    try:
-        return len(x)
-    except TypeError as type_error:
-        raise TypeError(message) from type_error
-
-
 def is_frames_equal(first, second, bundle: ResourceBundle) -> bool:
     if (isinstance(first, pd.DataFrame) and isinstance(second, pd.DataFrame)) or (
         isinstance(first, pd.Series) and isinstance(second, pd.Series)