upgini 1.2.89a1__py3-none-any.whl → 1.2.91__py3-none-any.whl

This diff shows the changes between publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only.
@@ -5,7 +5,6 @@ import hashlib
  import itertools
  import json
  import logging
- import numbers
  import os
  import sys
  import tempfile
@@ -30,6 +29,7 @@ from scipy.stats import ks_2samp
  from sklearn.base import TransformerMixin
  from sklearn.exceptions import NotFittedError
  from sklearn.model_selection import BaseCrossValidator, TimeSeriesSplit
+ from sklearn.preprocessing import OrdinalEncoder

  from upgini.autofe.feature import Feature
  from upgini.autofe.timeseries import TimeSeriesBase
@@ -118,9 +118,9 @@ except Exception:
        CustomFallbackProgressBar as ProgressBar,
    )

+ from upgini.utils.sample_utils import SampleColumns, SampleConfig, _num_samples, sample
  from upgini.utils.sort import sort_columns
  from upgini.utils.target_utils import (
-     balance_undersample_forced,
      calculate_psi,
      define_task,
  )
@@ -242,6 +242,7 @@ class FeaturesEnricher(TransformerMixin):
          disable_force_downsampling: bool = False,
          id_columns: Optional[List[str]] = None,
          generate_search_key_features: bool = True,
+         sample_config: Optional[SampleConfig] = None,
          **kwargs,
      ):
          self.bundle = get_custom_bundle(custom_bundle_config)
@@ -286,6 +287,7 @@ class FeaturesEnricher(TransformerMixin):

          self.search_keys = search_keys or {}
          self.id_columns = id_columns
+         self.id_columns_encoder = None
          self.country_code = country_code
          self.__validate_search_keys(search_keys, search_id)

@@ -299,6 +301,7 @@ class FeaturesEnricher(TransformerMixin):
          self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
          self.metrics: Optional[pd.DataFrame] = None
          self.feature_names_ = []
+         self.external_source_feature_names = []
          self.zero_shap_client_features = []
          self.feature_importances_ = []
          self.search_id = search_id
@@ -359,10 +362,8 @@ class FeaturesEnricher(TransformerMixin):
          self.columns_for_online_api = columns_for_online_api
          if columns_for_online_api is not None:
              self.runtime_parameters.properties["columns_for_online_api"] = ",".join(columns_for_online_api)
-         maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
-         if maybe_downsampling_limit is not None:
-             Dataset.FIT_SAMPLE_THRESHOLD = int(maybe_downsampling_limit)
-             Dataset.FIT_SAMPLE_ROWS = int(maybe_downsampling_limit)
+
+         self.sample_config = self._get_sample_config(sample_config)

          self.raise_validation_error = raise_validation_error
          self.exclude_columns = exclude_columns
@@ -375,6 +376,16 @@ class FeaturesEnricher(TransformerMixin):
          self.autofe_features_display_handle = None
          self.report_button_handle = None

+     def _get_sample_config(self, sample_config: Optional[SampleConfig] = None):
+         sample_config = sample_config or SampleConfig(force_sample_size=Dataset.FORCE_SAMPLE_SIZE)
+
+         maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
+         if maybe_downsampling_limit is not None:
+             sample_config.fit_sample_rows = int(maybe_downsampling_limit)
+             sample_config.fit_sample_threshold = int(maybe_downsampling_limit)
+
+         return sample_config
+
      def _get_api_key(self):
          return self._api_key

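Sampling limits are now carried on the instance instead of being mutated onto `Dataset` class attributes. A minimal usage sketch of the new `sample_config` parameter, assuming `SampleConfig` exposes the `force_sample_size`, `fit_sample_threshold`, and `fit_sample_rows` fields referenced in `_get_sample_config` above (column name and sizes are illustrative):

```python
from upgini import FeaturesEnricher, SearchKey
from upgini.utils.sample_utils import SampleConfig

# Illustrative values; previously this could only be tuned through the
# "downsampling_limit" runtime property, which patched Dataset globals.
config = SampleConfig(force_sample_size=7_000)
config.fit_sample_threshold = 200_000  # assumed settable, as in _get_sample_config
config.fit_sample_rows = 100_000       # assumed settable, as in _get_sample_config

enricher = FeaturesEnricher(
    search_keys={"reg_date": SearchKey.DATE},  # hypothetical column
    sample_config=config,
)
```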
@@ -928,16 +939,15 @@ class FeaturesEnricher(TransformerMixin):
          ):
              raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))

-         validated_X = self._validate_X(effective_X)
-         validated_y = self._validate_y(validated_X, effective_y)
-         validated_eval_set = (
-             [self._validate_eval_set_pair(validated_X, eval_pair) for eval_pair in effective_eval_set]
-             if effective_eval_set is not None
-             else None
+         validated_X, validated_y, validated_eval_set = self._validate_train_eval(
+             effective_X, effective_y, effective_eval_set
          )

          if self.X is None:
              self.X = X
+             self.id_columns_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1).fit(
+                 X[self.id_columns or []]
+             )
          if self.y is None:
              self.y = y
          if self.eval_set is None:
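The encoder settings matter here: with `handle_unknown="use_encoded_value"` and `unknown_value=-1`, id values never seen during fit encode to -1 instead of raising. A standalone scikit-learn illustration (requires scikit-learn >= 0.24):

```python
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

train_ids = pd.DataFrame({"store_id": ["a", "b", "c"]})
new_ids = pd.DataFrame({"store_id": ["b", "z"]})  # "z" was not seen at fit time

encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
encoder.fit(train_ids)
print(encoder.transform(new_ids))  # [[ 1.] [-1.]] -- unseen "z" maps to -1
```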
@@ -971,6 +981,19 @@ class FeaturesEnricher(TransformerMixin):
          client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
              estimator, validated_X, self.search_keys
          )
+         if self.id_columns and self.id_columns_encoder is not None:
+             if cat_features_from_backend:
+                 cat_features_from_backend = [
+                     c
+                     for c in cat_features_from_backend
+                     if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
+                 ]
+             if client_cat_features:
+                 client_cat_features = [
+                     c
+                     for c in client_cat_features
+                     if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
+                 ]
          for cat_feature in cat_features_from_backend:
              original_cat_feature = self.fit_columns_renaming.get(cat_feature)
              if original_cat_feature in self.search_keys:
@@ -1245,7 +1268,8 @@ class FeaturesEnricher(TransformerMixin):
              metrics.append(eval_metrics)

          if updating_shaps is not None:
-             self._update_shap_values(trace_id, fitting_X, updating_shaps, silent=not internal_call)
+             decoded_X = self._decode_id_columns(fitting_X, columns_renaming)
+             self._update_shap_values(trace_id, decoded_X, updating_shaps, silent=not internal_call)

          metrics_df = pd.DataFrame(metrics)
          mean_target_hdr = self.bundle.get("quality_metrics_mean_target_header")
@@ -1499,16 +1523,10 @@ class FeaturesEnricher(TransformerMixin):
          ):
              is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
              is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
-             validated_X = self._validate_X(X)
-             validated_y = self._validate_y(validated_X, y)
              checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
-             validated_eval_set = (
-                 [self._validate_eval_set_pair(validated_X, eval_set_pair) for eval_set_pair in checked_eval_set]
-                 if checked_eval_set
-                 else None
-             )
+             validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set)

-             sampled_data = self._sample_data_for_metrics(
+             sampled_data = self._get_enriched_for_metrics(
                  trace_id,
                  validated_X,
                  validated_y,
@@ -1582,7 +1600,11 @@ class FeaturesEnricher(TransformerMixin):
          fitting_enriched_X = fitting_enriched_X.drop(columns=columns_with_high_cardinality, errors="ignore")

          # Detect and drop constant columns
-         constant_columns = FeaturesValidator.find_constant_features(fitting_X)
+         constant_columns = [
+             c
+             for c in FeaturesValidator.find_constant_features(fitting_X)
+             if self.fit_columns_renaming.get(c, c) not in (self.id_columns or [])
+         ]
          if len(constant_columns) > 0:
              self.logger.warning(f"Constant columns {constant_columns} will be dropped for metrics calculation")
              fitting_X = fitting_X.drop(columns=constant_columns, errors="ignore")
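`FeaturesValidator.find_constant_features` is upgini-internal; conceptually the check reduces to a nunique scan, and the new comprehension then exempts declared id columns so they survive even when constant. A plain-pandas sketch of the idea:

```python
import pandas as pd

df = pd.DataFrame({"store_id": [7, 7, 7], "price": [1.0, 2.0, 3.0]})
id_columns = ["store_id"]

constant = [c for c in df.columns if df[c].nunique(dropna=False) <= 1]
to_drop = [c for c in constant if c not in id_columns]
print(constant, to_drop)  # ['store_id'] [] -- id columns are exempt from dropping
```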
@@ -1625,6 +1647,7 @@ class FeaturesEnricher(TransformerMixin):
              fitting_X, y_sorted, search_keys, self.model_task_type, sort_all_columns=True, logger=self.logger
          )
          fitting_X = fitting_X[fitting_x_columns]
+         fitting_X, _ = self._encode_id_columns(fitting_X, self.fit_columns_renaming)
          self.logger.info(f"Final sorted list of fitting X columns: {fitting_x_columns}")
          fitting_enriched_x_columns = fitting_enriched_X.columns.to_list()
          fitting_enriched_x_columns = sort_columns(
@@ -1636,6 +1659,7 @@ class FeaturesEnricher(TransformerMixin):
              logger=self.logger,
          )
          fitting_enriched_X = fitting_enriched_X[fitting_enriched_x_columns]
+         fitting_enriched_X, _ = self._encode_id_columns(fitting_enriched_X, self.fit_columns_renaming)
          self.logger.info(f"Final sorted list of fitting enriched X columns: {fitting_enriched_x_columns}")
          for idx, eval_tuple in eval_set_sampled_dict.items():
              eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
@@ -1663,6 +1687,12 @@ class FeaturesEnricher(TransformerMixin):
                  .astype(np.float64)
              )

+             fitting_eval_X, unknown_dict = self._encode_id_columns(fitting_eval_X, self.fit_columns_renaming)
+             fitting_enriched_eval_X, _ = self._encode_id_columns(fitting_enriched_eval_X, self.fit_columns_renaming)
+
+             if len(unknown_dict) > 0:
+                 print(self.bundle.get("unknown_id_column_value_in_eval_set").format(unknown_dict))
+
              fitting_eval_set_dict[idx] = (
                  fitting_eval_X,
                  eval_y_sorted,
@@ -1684,7 +1714,7 @@ class FeaturesEnricher(TransformerMixin):
          )

      @dataclass
-     class _SampledDataForMetrics:
+     class _EnrichedDataForMetrics:
          X_sampled: pd.DataFrame
          y_sampled: pd.Series
          enriched_X: pd.DataFrame
@@ -1692,7 +1722,7 @@ class FeaturesEnricher(TransformerMixin):
          search_keys: Dict[str, SearchKey]
          columns_renaming: Dict[str, str]

-     def _sample_data_for_metrics(
+     def _get_enriched_for_metrics(
          self,
          trace_id: str,
          validated_X: Union[pd.DataFrame, pd.Series, np.ndarray, None],
@@ -1704,7 +1734,7 @@ class FeaturesEnricher(TransformerMixin):
          remove_outliers_calc_metrics: Optional[bool],
          progress_bar: Optional[ProgressBar],
          progress_callback: Optional[Callable[[SearchProgress], Any]],
-     ) -> _SampledDataForMetrics:
+     ) -> _EnrichedDataForMetrics:
          datasets_hash = hash_input(validated_X, validated_y, eval_set)
          cached_sampled_datasets = self.__cached_sampled_datasets.get(datasets_hash)
          if cached_sampled_datasets is not None and is_input_same_as_fit and remove_outliers_calc_metrics is None:
@@ -1712,7 +1742,7 @@ class FeaturesEnricher(TransformerMixin):
              return self.__get_sampled_cached_enriched(datasets_hash, exclude_features_sources)
          elif len(self.feature_importances_) == 0:
              self.logger.info("No external features selected. So use only input datasets for metrics calculation")
-             return self.__sample_only_input(validated_X, validated_y, eval_set, is_demo_dataset)
+             return self.__get_enriched_as_input(validated_X, validated_y, eval_set, is_demo_dataset)
          # TODO save and check if dataset was deduplicated - use imbalance branch for such case
          elif (
              not self.imbalanced
@@ -1721,14 +1751,14 @@ class FeaturesEnricher(TransformerMixin):
              and self.df_with_original_index is not None
          ):
              self.logger.info("Dataset is not imbalanced, so use enriched_X from fit")
-             return self.__sample_balanced(eval_set, trace_id, remove_outliers_calc_metrics)
+             return self.__get_enriched_from_fit(eval_set, trace_id, remove_outliers_calc_metrics)
          else:
              self.logger.info(
                  "Dataset is imbalanced or exclude_features_sources or X was passed or this is saved search."
                  " Run transform"
              )
              print(self.bundle.get("prepare_data_for_metrics"))
-             return self.__sample_imbalanced(
+             return self.__get_enriched_from_transform(
                  validated_X,
                  validated_y,
                  eval_set,
@@ -1740,7 +1770,7 @@ class FeaturesEnricher(TransformerMixin):

      def __get_sampled_cached_enriched(
          self, datasets_hash: str, exclude_features_sources: Optional[List[str]]
-     ) -> _SampledDataForMetrics:
+     ) -> _EnrichedDataForMetrics:
          X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = (
              self.__cached_sampled_datasets[datasets_hash]
          )
@@ -1757,9 +1787,9 @@ class FeaturesEnricher(TransformerMixin):
              search_keys,
          )

-     def __sample_only_input(
+     def __get_enriched_as_input(
          self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
-     ) -> _SampledDataForMetrics:
+     ) -> _EnrichedDataForMetrics:
          eval_set_sampled_dict = {}

          df = validated_X.copy()
@@ -1801,24 +1831,13 @@ class FeaturesEnricher(TransformerMixin):
          normalizer = Normalizer(self.bundle, self.logger)
          df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
          columns_renaming = normalizer.columns_renaming
-         # columns_renaming = {c: c for c in df.columns}

          df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
-
-         num_samples = _num_samples(df)
-         sample_threshold, sample_rows = (
-             (Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD, Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS)
-             if eval_set is not None
-             else (Dataset.FIT_SAMPLE_THRESHOLD, Dataset.FIT_SAMPLE_ROWS)
-         )
-
          df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID, TARGET, columns_renaming, silent=True)
+
          # Sample after sorting by system_record_id for idempotency
          df.sort_values(by=SYSTEM_RECORD_ID, inplace=True)
-
-         if num_samples > sample_threshold:
-             self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
-             df = df.sample(n=sample_rows, random_state=self.random_state)
+         df = self.__downsample_for_metrics(df)

          if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
              df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
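Why sample only after sorting: with a fixed random_state, pandas sampling is deterministic for a given row order, so sorting by the system record id first makes the metrics subsample reproducible across runs. A self-contained illustration:

```python
import pandas as pd

df = pd.DataFrame({"system_record_id": [3, 1, 2, 5, 4], "x": range(5)})
df = df.sort_values(by="system_record_id")

# Same order + same seed -> the same subsample on every run
print(df.sample(n=3, random_state=42)["system_record_id"].tolist())
```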
@@ -1847,12 +1866,12 @@ class FeaturesEnricher(TransformerMixin):
              search_keys,
          )

-     def __sample_balanced(
+     def __get_enriched_from_fit(
          self,
          eval_set: Optional[List[tuple]],
          trace_id: str,
          remove_outliers_calc_metrics: Optional[bool],
-     ) -> _SampledDataForMetrics:
+     ) -> _EnrichedDataForMetrics:
          eval_set_sampled_dict = {}
          search_keys = self.fit_search_keys

@@ -1951,7 +1970,7 @@ class FeaturesEnricher(TransformerMixin):
              search_keys,
          )

-     def __sample_imbalanced(
+     def __get_enriched_from_transform(
          self,
          validated_X: pd.DataFrame,
          validated_y: pd.Series,
@@ -1960,7 +1979,7 @@ class FeaturesEnricher(TransformerMixin):
          trace_id: str,
          progress_bar: Optional[ProgressBar],
          progress_callback: Optional[Callable[[SearchProgress], Any]],
-     ) -> _SampledDataForMetrics:
+     ) -> _EnrichedDataForMetrics:
          has_eval_set = eval_set is not None

          self.logger.info(f"Transform {'with' if has_eval_set else 'without'} eval_set")
@@ -2017,17 +2036,18 @@ class FeaturesEnricher(TransformerMixin):
          )

      def __combine_train_and_eval_sets(
-         self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]]
+         self, X: pd.DataFrame, y: Optional[pd.Series] = None, eval_set: Optional[List[tuple]] = None
      ) -> pd.DataFrame:
-         df = validated_X.copy()
-         df[TARGET] = validated_y
-         if eval_set is None:
+         df = X.copy()
+         if y is not None:
+             df[TARGET] = y
+         if not eval_set:
              return df

          df[EVAL_SET_INDEX] = 0

          for idx, eval_pair in enumerate(eval_set):
-             eval_x, eval_y = self._validate_eval_set_pair(validated_X, eval_pair)
+             eval_x, eval_y = eval_pair
              eval_df_with_index = eval_x.copy()
              eval_df_with_index[TARGET] = eval_y
              eval_df_with_index[EVAL_SET_INDEX] = idx + 1
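A self-contained sketch of what the combined frame looks like, with literal "target"/"eval_set_index" column names standing in for the TARGET and EVAL_SET_INDEX constants: train rows get index 0 and each eval pair gets 1, 2, and so on.

```python
import pandas as pd

X = pd.DataFrame({"f": [1, 2]})
y = pd.Series([0, 1])
eval_set = [(pd.DataFrame({"f": [3, 4]}), pd.Series([1, 0]))]

df = X.copy()
df["target"] = y
df["eval_set_index"] = 0
for idx, (eval_x, eval_y) in enumerate(eval_set):
    eval_df = eval_x.copy()
    eval_df["target"] = eval_y.values
    eval_df["eval_set_index"] = idx + 1  # 1-based index per eval fold
    df = pd.concat([df, eval_df])
print(df)
```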
@@ -2036,42 +2056,37 @@ class FeaturesEnricher(TransformerMixin):
          return df

      def __downsample_for_metrics(self, df: pd.DataFrame) -> pd.DataFrame:
+         force_downsampling = self.__use_force_downsampling(df)
+
+         sample_columns = SampleColumns(
+             ids=self.id_columns,
+             date=self._get_date_column(self.search_keys),
+             target=TARGET,
+             eval_set_index=EVAL_SET_INDEX,
+         )
+
+         return sample(
+             df,
+             self.model_task_type,
+             self.cv,
+             self.sample_config,
+             sample_columns,
+             self.random_state,
+             force_downsampling=force_downsampling,
+             balance=False,
+             logger=self.logger,
+             bundle=self.bundle,
+             warning_callback=self.__log_warning,
+         )
+
+     def __use_force_downsampling(self, df: pd.DataFrame) -> bool:
          num_samples = _num_samples(df)
-         force_downsampling = (
+         return (
              not self.disable_force_downsampling
              and self.columns_for_online_api is not None
              and num_samples > Dataset.FORCE_SAMPLE_SIZE
          )

-         if force_downsampling:
-             self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
-             return balance_undersample_forced(
-                 df=df,
-                 target_column=TARGET,
-                 id_columns=self.id_columns,
-                 date_column=self._get_date_column(self.search_keys),
-                 task_type=self.model_task_type,
-                 cv_type=self.cv,
-                 random_state=self.random_state,
-                 sample_size=Dataset.FORCE_SAMPLE_SIZE,
-                 logger=self.logger,
-                 bundle=self.bundle,
-                 warning_callback=self.__log_warning,
-             )
-         elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
-             if EVAL_SET_INDEX in df.columns:
-                 threshold = Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD
-                 sample_size = Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS
-             else:
-                 threshold = Dataset.FIT_SAMPLE_THRESHOLD
-                 sample_size = Dataset.FIT_SAMPLE_ROWS
-
-             if num_samples > threshold:
-                 self.logger.info(f"Downsampling from {num_samples} to {sample_size}")
-                 return df.sample(n=sample_size, random_state=self.random_state)
-
-         return df
-
      def __extract_train_data(
          self, enriched_df: pd.DataFrame, x_columns: List[str]
      ) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
@@ -2107,7 +2122,7 @@ class FeaturesEnricher(TransformerMixin):
          eval_set_sampled_dict: Dict[int, Tuple],
          columns_renaming: Dict[str, str],
          search_keys: Dict[str, SearchKey],
-     ) -> _SampledDataForMetrics:
+     ) -> _EnrichedDataForMetrics:

          self.__cached_sampled_datasets[datasets_hash] = (
              X_sampled,
@@ -2138,7 +2153,7 @@ class FeaturesEnricher(TransformerMixin):
              for k, v in search_keys.items()
              if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
          }
-         return FeaturesEnricher._SampledDataForMetrics(
+         return FeaturesEnricher._EnrichedDataForMetrics(
              X_sampled=X_sampled,
              y_sampled=y_sampled,
              enriched_X=enriched_X,
@@ -2286,13 +2301,10 @@ if response.status_code == 200:
          with MDC(trace_id=trace_id, search_id=search_id):
              self.logger.info("Start transform")

-             validated_X = self._validate_X(X, is_transform=True)
-             if y is not None:
-                 validated_y = self._validate_y(validated_X, y)
-                 df = self.__combine_train_and_eval_sets(validated_X, validated_y, eval_set=None)
-             else:
-                 validated_y = None
-                 df = validated_X
+             validated_X, validated_y, validated_eval_set = self._validate_train_eval(
+                 X, y, eval_set=None, is_transform=True
+             )
+             df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)

              validated_Xy = df.copy()

@@ -2346,7 +2358,9 @@ if response.status_code == 200:

              is_demo_dataset = hash_input(df) in DEMO_DATASET_HASHES

-             columns_to_drop = [c for c in df.columns if c in self.feature_names_]
+             columns_to_drop = [
+                 c for c in df.columns if c in self.feature_names_ and c in self.external_source_feature_names
+             ]
              if len(columns_to_drop) > 0:
                  msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
                  self.logger.warning(msg)
@@ -2550,6 +2564,7 @@ if response.status_code == 200:
                  id_columns=self.__get_renamed_id_columns(columns_renaming),
                  date_column=self._get_date_column(search_keys),
                  date_format=self.date_format,
+                 sample_config=self.sample_config,
                  rest_client=self.rest_client,
                  logger=self.logger,
                  bundle=self.bundle,
@@ -2653,7 +2668,7 @@ if response.status_code == 200:
              selecting_columns = [
                  c
                  for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
-                 if c not in self.zero_shap_client_features
+                 if c not in self.zero_shap_client_features or c in (self.id_columns or [])
              ]
              selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
              if add_fit_system_record_id:
@@ -2801,13 +2816,8 @@ if response.status_code == 200:
          self.fit_dropped_features = set()
          self.fit_generated_features = []

-         validated_X = self._validate_X(X)
-         validated_y = self._validate_y(validated_X, y)
-         validated_eval_set = (
-             [self._validate_eval_set_pair(validated_X, eval_pair) for eval_pair in eval_set]
-             if eval_set is not None
-             else None
-         )
+         validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, eval_set)
+
          is_demo_dataset = hash_input(validated_X, validated_y, validated_eval_set) in DEMO_DATASET_HASHES
          if is_demo_dataset:
              msg = self.bundle.get("demo_dataset_info")
@@ -2852,14 +2862,8 @@ if response.status_code == 200:
              remove_outliers_calc_metrics=remove_outliers_calc_metrics,
          )

-         df = pd.concat([validated_X, validated_y], axis=1)
-
-         if validated_eval_set is not None and len(validated_eval_set) > 0:
-             df[EVAL_SET_INDEX] = 0
-             for idx, (eval_X, eval_y) in enumerate(validated_eval_set):
-                 eval_df = pd.concat([eval_X, eval_y], axis=1)
-                 eval_df[EVAL_SET_INDEX] = idx + 1
-                 df = pd.concat([df, eval_df])
+         df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
+         self.id_columns_encoder = OrdinalEncoder().fit(df[self.id_columns or []])

          self.fit_search_keys = self.search_keys.copy()
          df = self.__handle_index_search_keys(df, self.fit_search_keys)
@@ -2970,47 +2974,8 @@ if response.status_code == 200:
          # TODO check maybe need to drop _time column from df_with_original_index

          df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys, self.fit_columns_renaming)
-
-         # Convert EMAIL to HEM after unnesting to do it only with one column
-         email_column = self._get_email_column(self.fit_search_keys)
-         hem_column = self._get_hem_column(self.fit_search_keys)
-         if email_column:
-             converter = EmailSearchKeyConverter(
-                 email_column,
-                 hem_column,
-                 self.fit_search_keys,
-                 self.fit_columns_renaming,
-                 list(unnest_search_keys.keys()),
-                 self.bundle,
-                 self.logger,
-             )
-             df = converter.convert(df)
-
-         ip_column = self._get_ip_column(self.fit_search_keys)
-         if ip_column:
-             converter = IpSearchKeyConverter(
-                 ip_column,
-                 self.fit_search_keys,
-                 self.fit_columns_renaming,
-                 list(unnest_search_keys.keys()),
-                 self.bundle,
-                 self.logger,
-             )
-             df = converter.convert(df)
-         phone_column = self._get_phone_column(self.fit_search_keys)
-         country_column = self._get_country_column(self.fit_search_keys)
-         if phone_column:
-             converter = PhoneSearchKeyConverter(phone_column, country_column)
-             df = converter.convert(df)
-
-         if country_column:
-             converter = CountrySearchKeyConverter(country_column)
-             df = converter.convert(df)
-
-         postal_code = self._get_postal_column(self.fit_search_keys)
-         if postal_code:
-             converter = PostalCodeSearchKeyConverter(postal_code)
-             df = converter.convert(df)
+         # Convert EMAIL to HEM etc after unnesting to do it only with one column
+         df = self.__convert_unnestable_keys(df, unnest_search_keys)

          non_feature_columns = [
              self.TARGET_NAME,
@@ -3061,11 +3026,7 @@ if response.status_code == 200:
          runtime_parameters = self._get_copy_of_runtime_parameters()

          # Force downsampling to 7000 for API features generation
-         force_downsampling = (
-             not self.disable_force_downsampling
-             and self.columns_for_online_api is not None
-             and len(df) > Dataset.FORCE_SAMPLE_SIZE
-         )
+         force_downsampling = self.__use_force_downsampling(df)
          if force_downsampling:
              runtime_parameters.properties["fast_fit"] = True

@@ -3085,6 +3046,7 @@ if response.status_code == 200:
              logger=self.logger,
              bundle=self.bundle,
              warning_callback=self.__log_warning,
+             sample_config=self.sample_config,
          )
          dataset.columns_renaming = self.fit_columns_renaming

@@ -3240,6 +3202,49 @@ if response.status_code == 200:
          if not self.warning_counter.has_warnings():
              self.__display_support_link(self.bundle.get("all_ok_community_invite"))

+     def __convert_unnestable_keys(self, df: pd.DataFrame, unnest_search_keys: Dict[str, str]):
+         email_column = self._get_email_column(self.fit_search_keys)
+         hem_column = self._get_hem_column(self.fit_search_keys)
+         if email_column:
+             converter = EmailSearchKeyConverter(
+                 email_column,
+                 hem_column,
+                 self.fit_search_keys,
+                 self.fit_columns_renaming,
+                 list(unnest_search_keys.keys()),
+                 self.bundle,
+                 self.logger,
+             )
+             df = converter.convert(df)
+
+         ip_column = self._get_ip_column(self.fit_search_keys)
+         if ip_column:
+             converter = IpSearchKeyConverter(
+                 ip_column,
+                 self.fit_search_keys,
+                 self.fit_columns_renaming,
+                 list(unnest_search_keys.keys()),
+                 self.bundle,
+                 self.logger,
+             )
+             df = converter.convert(df)
+         phone_column = self._get_phone_column(self.fit_search_keys)
+         country_column = self._get_country_column(self.fit_search_keys)
+         if phone_column:
+             converter = PhoneSearchKeyConverter(phone_column, country_column)
+             df = converter.convert(df)
+
+         if country_column:
+             converter = CountrySearchKeyConverter(country_column)
+             df = converter.convert(df)
+
+         postal_code = self._get_postal_column(self.fit_search_keys)
+         if postal_code:
+             converter = PostalCodeSearchKeyConverter(postal_code)
+             df = converter.convert(df)
+
+         return df
+
      def __should_add_date_column(self):
          return self.add_date_if_missing or (self.cv is not None and self.cv.is_time_series())

@@ -3282,6 +3287,57 @@ if response.status_code == 200:
          search_keys_with_autodetection = {**self.search_keys, **self.autodetected_search_keys}
          return [c for c, v in search_keys_with_autodetection.items() if v.value.value in keys]

+     def _validate_train_eval(
+         self,
+         X: pd.DataFrame,
+         y: Optional[pd.Series] = None,
+         eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]] = None,
+         is_transform: bool = False,
+     ) -> Tuple[pd.DataFrame, pd.Series, Optional[List[Tuple[pd.DataFrame, pd.Series]]]]:
+         validated_X = self._validate_X(X, is_transform)
+         validated_y = self._validate_y(validated_X, y, enforce_y=not is_transform)
+         validated_eval_set = self._validate_eval_set(validated_X, eval_set)
+         return validated_X, validated_y, validated_eval_set
+
+     def _encode_id_columns(
+         self,
+         X: pd.DataFrame,
+         columns_renaming: Optional[Dict[str, str]] = None,
+     ) -> Tuple[pd.DataFrame, Dict[str, List[Any]]]:
+         columns_renaming = columns_renaming or {}
+         unknown_dict = {}
+
+         if self.id_columns and self.id_columns_encoder is not None:
+             inverse_columns_renaming = {v: k for k, v in columns_renaming.items()}
+             renamed_id_columns = [
+                 inverse_columns_renaming.get(col, col) for col in self.id_columns_encoder.feature_names_in_
+             ]
+             self.logger.info(f"Convert id columns to int: {renamed_id_columns}")
+             encoded = self.id_columns_encoder.transform(X[renamed_id_columns].rename(columns=columns_renaming))
+             for i, c in enumerate(renamed_id_columns):
+                 unknown_values = X[encoded[:, i] == -1][c].unique().tolist()
+                 if len(unknown_values) > 0:
+                     unknown_dict[c] = unknown_values
+             X[renamed_id_columns] = encoded
+             X = X.loc[(X[renamed_id_columns] != -1).all(axis=1)]
+
+         if len(unknown_dict) > 0:
+             self.logger.warning(f"Unknown values in id columns: {unknown_dict}")
+
+         return X, unknown_dict
+
+     def _decode_id_columns(self, X: pd.DataFrame, columns_renaming: Dict[str, str]):
+         columns_renaming = columns_renaming or {}
+         if self.id_columns and self.id_columns_encoder is not None:
+             inverse_columns_renaming = {v: k for k, v in columns_renaming.items()}
+             renamed_id_columns = [
+                 inverse_columns_renaming.get(col, col) for col in self.id_columns_encoder.feature_names_in_
+             ]
+             decoded = self.id_columns_encoder.inverse_transform(X[renamed_id_columns].rename(columns=columns_renaming))
+             X[renamed_id_columns] = decoded
+
+         return X
+
      def _validate_X(self, X, is_transform=False) -> pd.DataFrame:
          if isinstance(X, pd.DataFrame):
              if isinstance(X.columns, pd.MultiIndex) or isinstance(X.index, pd.MultiIndex):
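A standalone round trip of the encode/filter/decode pattern above, assuming the encoder was fitted with `unknown_value=-1` as in `calculate_metrics`: unseen ids are reported and their rows dropped before metrics, and `inverse_transform` restores the original labels for SHAP reporting.

```python
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
encoder.fit(pd.DataFrame({"store_id": ["a", "b"]}))

X = pd.DataFrame({"store_id": ["a", "z"], "f": [10, 20]})
encoded = encoder.transform(X[["store_id"]])

# Collect unseen values per id column, mirroring unknown_dict above
unknown = {"store_id": X.loc[encoded[:, 0] == -1, "store_id"].unique().tolist()}

X[["store_id"]] = encoded
X = X.loc[(X[["store_id"]] != -1).all(axis=1)]    # drop rows with unknown ids
print(unknown)                                    # {'store_id': ['z']}
print(encoder.inverse_transform(X[["store_id"]]))  # [['a']] -- the decode step
```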
@@ -3323,7 +3379,9 @@ if response.status_code == 200:

          return validated_X

-     def _validate_y(self, X: pd.DataFrame, y) -> pd.Series:
+     def _validate_y(self, X: pd.DataFrame, y, enforce_y: bool = True) -> Optional[pd.Series]:
+         if y is None and not enforce_y:
+             return None
          if (
              not isinstance(y, pd.Series)
              and not isinstance(y, pd.DataFrame)
@@ -3370,6 +3428,11 @@ if response.status_code == 200:

          return validated_y

+     def _validate_eval_set(self, X: pd.DataFrame, eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]]):
+         if eval_set is None:
+             return None
+         return [self._validate_eval_set_pair(X, eval_pair) for eval_pair in eval_set]
+
      def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
          if len(eval_pair) != 2:
              raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
@@ -3450,7 +3513,7 @@ if response.status_code == 200:
              raise ValidationError(self.bundle.get("binary_target_eval_unique_count_not_2").format(eval_y_nunique))

          # Check for duplicates between train and eval sets by comparing all values
-         train_eval_intersection = pd.merge(X, validated_eval_X, how='inner')
+         train_eval_intersection = pd.merge(X, validated_eval_X, how="inner")
          if len(train_eval_intersection) > 0:
              raise ValidationError(self.bundle.get("eval_x_has_train_samples"))
@@ -3980,7 +4043,7 @@ if response.status_code == 200:
          if features_meta is None:
              raise Exception(self.bundle.get("missing_features_meta"))

-         return [f.name for f in features_meta if f.type == "categorical"]
+         return [f.name for f in features_meta if f.type == "categorical" and f.name not in (self.id_columns or [])]

      def __prepare_feature_importances(
          self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
@@ -3999,6 +4062,7 @@ if response.status_code == 200:
          df = df.rename(columns=original_names_dict)

          self.feature_names_ = []
+         self.external_source_feature_names = []
          self.zero_shap_client_features = []
          self.feature_importances_ = []
          features_info = []
@@ -4030,6 +4094,9 @@ if response.status_code == 200:
              original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
              is_client_feature = original_name in df.columns

+             if not is_client_feature:
+                 self.external_source_feature_names.append(original_name)
+
              # TODO make a decision about selected features based on special flag from mlb
              if original_shaps.get(feature_meta.name, 0.0) == 0.0:
                  if is_client_feature and self.fit_select_features:
@@ -4623,35 +4690,6 @@ if response.status_code == 200:
          self.logger.warning("Failed to dump input files", exc_info=True)


- def _num_samples(x):
-     """Return number of samples in array-like x."""
-     if x is None:
-         return 0
-     message = "Expected sequence or array-like, got %s" % type(x)
-     if hasattr(x, "fit") and callable(x.fit):
-         # Don't get num_samples from an ensembles length!
-         raise TypeError(message)
-
-     if not hasattr(x, "__len__") and not hasattr(x, "shape"):
-         if hasattr(x, "__array__"):
-             x = np.asarray(x)
-         else:
-             raise TypeError(message)
-
-     if hasattr(x, "shape") and x.shape is not None:
-         if len(x.shape) == 0:
-             raise TypeError("Singleton array %r cannot be considered a valid collection." % x)
-         # Check that shape is returning an integer or default to len
-         # Dask dataframes may not return numeric shape[0] value
-         if isinstance(x.shape[0], numbers.Integral):
-             return x.shape[0]
-
-     try:
-         return len(x)
-     except TypeError as type_error:
-         raise TypeError(message) from type_error
-
-
  def is_frames_equal(first, second, bundle: ResourceBundle) -> bool:
      if (isinstance(first, pd.DataFrame) and isinstance(second, pd.DataFrame)) or (
          isinstance(first, pd.Series) and isinstance(second, pd.Series)