upgini 1.2.70a3832.dev2__py3-none-any.whl → 1.2.71__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

This version of upgini has been flagged as potentially problematic.

@@ -12,6 +12,7 @@ import tempfile
 import time
 import uuid
 from collections import Counter
+from copy import deepcopy
 from dataclasses import dataclass
 from threading import Thread
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
@@ -54,6 +55,7 @@ from upgini.metadata import (
     SORT_ID,
     SYSTEM_RECORD_ID,
     TARGET,
+    AutoFEParameters,
     CVType,
     FeaturesMetadataV2,
     FileColumnMeaningType,
@@ -407,6 +409,7 @@ class FeaturesEnricher(TransformerMixin):
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
         search_id_callback: Optional[Callable[[str], Any]] = None,
         select_features: bool = True,
+        auto_fe_parameters: Optional[AutoFEParameters] = None,
         **kwargs,
     ):
         """Fit to data.
@@ -495,6 +498,7 @@ class FeaturesEnricher(TransformerMixin):
                 importance_threshold=importance_threshold,
                 max_features=max_features,
                 remove_outliers_calc_metrics=remove_outliers_calc_metrics,
+                auto_fe_parameters=auto_fe_parameters,
                 progress_callback=progress_callback,
                 search_id_callback=search_id_callback,
             )
@@ -550,6 +554,7 @@ class FeaturesEnricher(TransformerMixin):
         remove_outliers_calc_metrics: Optional[bool] = None,
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
         select_features: bool = True,
+        auto_fe_parameters: Optional[AutoFEParameters] = None,
         **kwargs,
     ) -> pd.DataFrame:
         """Fit to data, then transform it.
@@ -649,6 +654,7 @@ class FeaturesEnricher(TransformerMixin):
                 importance_threshold=importance_threshold,
                 max_features=max_features,
                 remove_outliers_calc_metrics=remove_outliers_calc_metrics,
+                auto_fe_parameters=auto_fe_parameters,
                 progress_callback=progress_callback,
             )
             self.logger.info("Inner fit finished successfully")
@@ -703,6 +709,7 @@ class FeaturesEnricher(TransformerMixin):
         self,
         X: pd.DataFrame,
         *args,
+        y: Optional[pd.Series] = None,
         exclude_features_sources: Optional[List[str]] = None,
         keep_input: bool = True,
         importance_threshold: Optional[float] = None,
@@ -763,9 +770,10 @@ class FeaturesEnricher(TransformerMixin):

         start_time = time.time()
         try:
-            result, _, _ = self.__inner_transform(
+            result, _, _, _ = self.__inner_transform(
                 trace_id,
                 X,
+                y=y,
                 exclude_features_sources=exclude_features_sources,
                 importance_threshold=importance_threshold,
                 max_features=max_features,
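
Note: transform() now also accepts an optional y and forwards it to __inner_transform(), whose return value gains a fourth element (the resolved search keys). A later hunk raises a validation error when the target is required for transform but missing, so a call might look like this (names are illustrative, not from this diff):

    enriched_test = enricher.transform(test_df, y=test_df["target"])  # y is optional
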
@@ -834,7 +842,7 @@ class FeaturesEnricher(TransformerMixin):
         max_features: Optional[int] = None,
         remove_outliers_calc_metrics: Optional[bool] = None,
         trace_id: Optional[str] = None,
-        silent: bool = False,
+        internal_call: bool = False,
         progress_bar: Optional[ProgressBar] = None,
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
         **kwargs,
@@ -1088,7 +1096,7 @@ class FeaturesEnricher(TransformerMixin):
                 enriched_shaps = enriched_cv_result.shap_values

                 if enriched_shaps is not None:
-                    self._update_shap_values(trace_id, fitting_X, enriched_shaps)
+                    self._update_shap_values(trace_id, fitting_X, enriched_shaps, silent=not internal_call)

                 if enriched_metric is None:
                     self.logger.warning(
@@ -1249,14 +1257,14 @@ class FeaturesEnricher(TransformerMixin):
             if self.raise_validation_error:
                 raise e
             else:
-                if not silent:
+                if not internal_call:
                     self._dump_python_libs()
                     self.__display_support_link()
                 raise e
         finally:
             self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")

-    def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float]):
+    def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float], silent: bool = False):
         renaming = self.fit_columns_renaming or {}
         new_shaps = {
             renaming.get(feature, feature): _round_shap_value(shap)
@@ -1265,7 +1273,7 @@ class FeaturesEnricher(TransformerMixin):
         }
         self.__prepare_feature_importances(trace_id, df, new_shaps)

-        if self.features_info_display_handle is not None:
+        if not silent and self.features_info_display_handle is not None:
             try:
                 _ = get_ipython()  # type: ignore

@@ -1277,7 +1285,7 @@ class FeaturesEnricher(TransformerMixin):
                 )
             except (ImportError, NameError):
                 pass
-        if self.data_sources_display_handle is not None:
+        if not silent and self.data_sources_display_handle is not None:
             try:
                 _ = get_ipython()  # type: ignore

@@ -1289,7 +1297,7 @@ class FeaturesEnricher(TransformerMixin):
                 )
             except (ImportError, NameError):
                 pass
-        if self.autofe_features_display_handle is not None:
+        if not silent and self.autofe_features_display_handle is not None:
             try:
                 _ = get_ipython()  # type: ignore
                 autofe_descriptions_df = self.get_autofe_features_description()
@@ -1302,7 +1310,7 @@ class FeaturesEnricher(TransformerMixin):
                 )
             except (ImportError, NameError):
                 pass
-        if self.report_button_handle is not None:
+        if not silent and self.report_button_handle is not None:
             try:
                 _ = get_ipython()  # type: ignore

@@ -1478,12 +1486,12 @@ class FeaturesEnricher(TransformerMixin):

         excluding_search_keys = list(search_keys.keys())
         if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
-            excluded = set()
+            should_not_exclude = set()
             for sk in excluding_search_keys:
-                renamed_sk = columns_renaming.get(sk)
+                renamed_sk = columns_renaming.get(sk, sk)
                 if renamed_sk in search_keys_for_metrics or renamed_sk in self.feature_names_:
-                    excluded.add(sk)
-            excluding_search_keys = [sk for sk in excluding_search_keys if sk not in excluded]
+                    should_not_exclude.add(sk)
+            excluding_search_keys = [sk for sk in excluding_search_keys if sk not in should_not_exclude]

         self.logger.info(f"Excluding search keys: {excluding_search_keys}")

@@ -1505,8 +1513,7 @@ class FeaturesEnricher(TransformerMixin):
         self.logger.info(f"Client features column on prepare data for metrics: {client_features}")

         filtered_enriched_features = self.__filtered_enriched_features(
-            importance_threshold,
-            max_features,
+            importance_threshold, max_features, trace_id, validated_X
         )
         filtered_enriched_features = [c for c in filtered_enriched_features if c not in client_features]

@@ -1682,7 +1689,6 @@ class FeaturesEnricher(TransformerMixin):
                 validated_X,
                 validated_y,
                 eval_set,
-                is_demo_dataset,
                 exclude_features_sources,
                 trace_id,
                 progress_bar,
@@ -1698,8 +1704,14 @@ class FeaturesEnricher(TransformerMixin):
         if exclude_features_sources:
             enriched_X = enriched_X.drop(columns=exclude_features_sources, errors="ignore")

-        return self.__mk_sampled_data_tuple(
-            X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
+        return self.__cache_and_return_results(
+            datasets_hash,
+            X_sampled,
+            y_sampled,
+            enriched_X,
+            eval_set_sampled_dict,
+            columns_renaming,
+            search_keys,
         )

     def __sample_only_input(
@@ -1776,17 +1788,14 @@ class FeaturesEnricher(TransformerMixin):
             eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)

         datasets_hash = hash_input(X_sampled, y_sampled, eval_set_sampled_dict)
-        self.__cached_sampled_datasets[datasets_hash] = (
+        return self.__cache_and_return_results(
+            datasets_hash,
             X_sampled,
             y_sampled,
             enriched_X,
             eval_set_sampled_dict,
-            search_keys,
             columns_renaming,
-        )
-
-        return self.__mk_sampled_data_tuple(
-            X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
+            search_keys,
         )

     def __sample_balanced(
@@ -1825,13 +1834,34 @@ class FeaturesEnricher(TransformerMixin):
         # index in each dataset (X, eval set) may be reordered and non unique, but index in validated datasets
         # can differs from it
         fit_features = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
-        enriched_Xy, enriched_eval_sets = self.__enrich(
+
+        # Pre-process features if we need to drop outliers
+        if rows_to_drop is not None:
+            self.logger.info(f"Before dropping target outliers size: {len(fit_features)}")
+            fit_features = fit_features[
+                ~fit_features[ENTITY_SYSTEM_RECORD_ID].isin(rows_to_drop[ENTITY_SYSTEM_RECORD_ID])
+            ]
+            self.logger.info(f"After dropping target outliers size: {len(fit_features)}")
+
+        enriched_eval_sets = {}
+        enriched_Xy = self.__enrich(
             self.df_with_original_index,
             fit_features,
-            rows_to_drop=rows_to_drop,
+            how="inner",
             drop_system_record_id=False,
         )

+        # Handle eval sets extraction based on EVAL_SET_INDEX
+        if EVAL_SET_INDEX in enriched_Xy.columns:
+            eval_set_indices = list(enriched_Xy[EVAL_SET_INDEX].unique())
+            if 0 in eval_set_indices:
+                eval_set_indices.remove(0)
+            for eval_set_index in eval_set_indices:
+                enriched_eval_sets[eval_set_index] = enriched_Xy.loc[
+                    enriched_Xy[EVAL_SET_INDEX] == eval_set_index
+                ].copy()
+            enriched_Xy = enriched_Xy.loc[enriched_Xy[EVAL_SET_INDEX] == 0].copy()
+
         x_columns = [c for c in self.df_with_original_index.columns if c not in [EVAL_SET_INDEX, TARGET]]
         X_sampled = enriched_Xy[x_columns].copy()
         y_sampled = enriched_Xy[TARGET].copy()
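
Note: __sample_balanced() now receives a single enriched frame from __enrich() and splits the eval sets out of it by EVAL_SET_INDEX, as the hunk above shows. A standalone pandas sketch of that split; the literal column name and the data are invented for illustration, the diff itself uses the EVAL_SET_INDEX constant:

    import pandas as pd

    enriched = pd.DataFrame({"eval_set_index": [0, 0, 1, 2], "feature": [10, 20, 30, 40]})
    train_part = enriched.loc[enriched["eval_set_index"] == 0].copy()
    eval_parts = {
        int(idx): part.copy()
        for idx, part in enriched[enriched["eval_set_index"] != 0].groupby("eval_set_index")
    }
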
@@ -1855,17 +1885,14 @@ class FeaturesEnricher(TransformerMixin):
             eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)

         datasets_hash = hash_input(self.X, self.y, self.eval_set)
-        self.__cached_sampled_datasets[datasets_hash] = (
+        return self.__cache_and_return_results(
+            datasets_hash,
             X_sampled,
             y_sampled,
             enriched_X,
             eval_set_sampled_dict,
-            search_keys,
             self.fit_columns_renaming,
-        )
-
-        return self.__mk_sampled_data_tuple(
-            X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, self.fit_columns_renaming
+            search_keys,
         )

     def __sample_imbalanced(
@@ -1873,169 +1900,162 @@ class FeaturesEnricher(TransformerMixin):
         validated_X: pd.DataFrame,
         validated_y: pd.Series,
         eval_set: Optional[List[tuple]],
-        is_demo_dataset: bool,
         exclude_features_sources: Optional[List[str]],
         trace_id: str,
         progress_bar: Optional[ProgressBar],
         progress_callback: Optional[Callable[[SearchProgress], Any]],
     ) -> _SampledDataForMetrics:
-        eval_set_sampled_dict = {}
-        if eval_set is not None:
-            self.logger.info("Transform with eval_set")
-            # concatenate X and eval_set with eval_set_index
-            df = validated_X.copy()
-            df[TARGET] = validated_y
-            df[EVAL_SET_INDEX] = 0
-            for idx, eval_pair in enumerate(eval_set):
-                eval_x, eval_y = self._validate_eval_set_pair(validated_X, eval_pair)
-                eval_df_with_index = eval_x.copy()
-                eval_df_with_index[TARGET] = eval_y
-                eval_df_with_index[EVAL_SET_INDEX] = idx + 1
-                df = pd.concat([df, eval_df_with_index])
-
-            df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
-
-            # downsample if need to eval_set threshold
-            num_samples = _num_samples(df)
-            force_downsampling = (
-                not self.disable_force_downsampling
-                and self.columns_for_online_api is not None
-                and num_samples > Dataset.FORCE_SAMPLE_SIZE
-            )
-            # TODO: check that system_record_id was added before this step
-            if force_downsampling:
-                self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
-                df = balance_undersample_forced(
-                    df=df,
-                    target_column=TARGET,
-                    id_columns=self.id_columns,
-                    date_column=self._get_date_column(self.search_keys),
-                    task_type=self.model_task_type,
-                    cv_type=self.cv,
-                    random_state=self.random_state,
-                    sample_size=Dataset.FORCE_SAMPLE_SIZE,
-                    logger=self.logger,
-                    bundle=self.bundle,
-                    warning_callback=self.__log_warning,
-                )
-            elif num_samples > Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD:
-                self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS}")
-                df = df.sample(n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state)
+        has_eval_set = eval_set is not None

-            eval_set_sampled_dict = {}
+        self.logger.info(f"Transform {'with' if has_eval_set else 'without'} eval_set")

-            tmp_target_name = "__target"
-            df = df.rename(columns={TARGET: tmp_target_name})
+        # Prepare
+        df = self.__combine_train_and_eval_sets(validated_X, validated_y, eval_set)
+        df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
+        df = self.__downsample_for_metrics(df)

-            enriched_df, columns_renaming, generated_features = self.__inner_transform(
-                trace_id,
-                df,
-                exclude_features_sources=exclude_features_sources,
-                silent_mode=True,
-                metrics_calculation=True,
-                progress_bar=progress_bar,
-                progress_callback=progress_callback,
-                add_fit_system_record_id=True,
-                target_name=tmp_target_name,
-            )
-            if enriched_df is None:
-                return None
+        # Transform
+        enriched_df, columns_renaming, generated_features, search_keys = self.__inner_transform(
+            trace_id,
+            X=df.drop(columns=[TARGET]),
+            y=df[TARGET],
+            exclude_features_sources=exclude_features_sources,
+            silent_mode=True,
+            metrics_calculation=True,
+            progress_bar=progress_bar,
+            progress_callback=progress_callback,
+            add_fit_system_record_id=True,
+        )
+        if enriched_df is None:
+            return None

-            enriched_df = enriched_df.rename(columns={tmp_target_name: TARGET})
+        x_columns = [
+            c
+            for c in (validated_X.columns.tolist() + generated_features + [SYSTEM_RECORD_ID])
+            if c in enriched_df.columns
+        ]

-            x_columns = [
-                c
-                for c in (validated_X.columns.tolist() + generated_features + [SYSTEM_RECORD_ID])
-                if c in enriched_df.columns
-            ]
+        X_sampled, y_sampled, enriched_X = self.__extract_train_data(enriched_df, x_columns)
+        eval_set_sampled_dict = self.__extract_eval_data(
+            enriched_df, x_columns, enriched_X.columns.tolist(), len(eval_set) if has_eval_set else 0
+        )

-            enriched_Xy = enriched_df.query(f"{EVAL_SET_INDEX} == 0")
-            X_sampled = enriched_Xy[x_columns].copy()
-            y_sampled = enriched_Xy[TARGET].copy()
-            enriched_X = enriched_Xy.drop(columns=[TARGET, EVAL_SET_INDEX])
-            enriched_X_columns = enriched_X.columns.tolist()
+        # Cache and return results
+        datasets_hash = hash_input(validated_X, validated_y, eval_set)
+        return self.__cache_and_return_results(
+            datasets_hash,
+            X_sampled,
+            y_sampled,
+            enriched_X,
+            eval_set_sampled_dict,
+            columns_renaming,
+            search_keys,
+        )

-            for idx in range(len(eval_set)):
-                enriched_eval_xy = enriched_df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
-                eval_x_sampled = enriched_eval_xy[x_columns].copy()
-                eval_y_sampled = enriched_eval_xy[TARGET].copy()
-                enriched_eval_x = enriched_eval_xy[enriched_X_columns].copy()
-                eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
-        else:
-            self.logger.info("Transform without eval_set")
-            df = validated_X.copy()
+    def __combine_train_and_eval_sets(
+        self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]]
+    ) -> pd.DataFrame:
+        df = validated_X.copy()
+        df[TARGET] = validated_y
+        if eval_set is None:
+            return df
+
+        df[EVAL_SET_INDEX] = 0

-            df[TARGET] = validated_y
+        for idx, eval_pair in enumerate(eval_set):
+            eval_x, eval_y = self._validate_eval_set_pair(validated_X, eval_pair)
+            eval_df_with_index = eval_x.copy()
+            eval_df_with_index[TARGET] = eval_y
+            eval_df_with_index[EVAL_SET_INDEX] = idx + 1
+            df = pd.concat([df, eval_df_with_index])

-            df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
+        return df

-            num_samples = _num_samples(df)
-            force_downsampling = (
-                not self.disable_force_downsampling
-                and self.columns_for_online_api is not None
-                and num_samples > Dataset.FORCE_SAMPLE_SIZE
+    def __downsample_for_metrics(self, df: pd.DataFrame) -> pd.DataFrame:
+        num_samples = _num_samples(df)
+        force_downsampling = (
+            not self.disable_force_downsampling
+            and self.columns_for_online_api is not None
+            and num_samples > Dataset.FORCE_SAMPLE_SIZE
+        )
+
+        if force_downsampling:
+            self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
+            return balance_undersample_forced(
+                df=df,
+                target_column=TARGET,
+                id_columns=self.id_columns,
+                date_column=self._get_date_column(self.search_keys),
+                task_type=self.model_task_type,
+                cv_type=self.cv,
+                random_state=self.random_state,
+                sample_size=Dataset.FORCE_SAMPLE_SIZE,
+                logger=self.logger,
+                bundle=self.bundle,
+                warning_callback=self.__log_warning,
             )
+        elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
+            if EVAL_SET_INDEX in df.columns:
+                threshold = Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD
+                sample_size = Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS
+            else:
+                threshold = Dataset.FIT_SAMPLE_THRESHOLD
+                sample_size = Dataset.FIT_SAMPLE_ROWS

-            if force_downsampling:
-                self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
-                df = balance_undersample_forced(
-                    df=df,
-                    target_column=TARGET,
-                    id_columns=self.id_columns,
-                    date_column=self._get_date_column(self.search_keys),
-                    task_type=self.model_task_type,
-                    cv_type=self.cv,
-                    random_state=self.random_state,
-                    sample_size=Dataset.FORCE_SAMPLE_SIZE,
-                    logger=self.logger,
-                    bundle=self.bundle,
-                    warning_callback=self.__log_warning,
-                )
-            elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
-                self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
-                df = df.sample(n=Dataset.FIT_SAMPLE_ROWS, random_state=self.random_state)
+            if num_samples > threshold:
+                self.logger.info(f"Downsampling from {num_samples} to {sample_size}")
+                return df.sample(n=sample_size, random_state=self.random_state)

-            tmp_target_name = "__target"
-            df = df.rename(columns={TARGET: tmp_target_name})
+        return df

-            enriched_Xy, columns_renaming, generated_features = self.__inner_transform(
-                trace_id,
-                df,
-                exclude_features_sources=exclude_features_sources,
-                silent_mode=True,
-                metrics_calculation=True,
-                progress_bar=progress_bar,
-                progress_callback=progress_callback,
-                add_fit_system_record_id=True,
-                target_name=tmp_target_name,
-            )
-            if enriched_Xy is None:
-                return None
+    def __extract_train_data(
+        self, enriched_df: pd.DataFrame, x_columns: List[str]
+    ) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
+        if EVAL_SET_INDEX in enriched_df.columns:
+            enriched_Xy = enriched_df.query(f"{EVAL_SET_INDEX} == 0")
+        else:
+            enriched_Xy = enriched_df
+        X_sampled = enriched_Xy[x_columns].copy()
+        y_sampled = enriched_Xy[TARGET].copy()
+        enriched_X = enriched_Xy.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
+        return X_sampled, y_sampled, enriched_X

-            enriched_Xy = enriched_Xy.rename(columns={tmp_target_name: TARGET})
+    def __extract_eval_data(
+        self, enriched_df: pd.DataFrame, x_columns: List[str], enriched_X_columns: List[str], eval_set_len: int
+    ) -> Dict[int, Tuple]:
+        eval_set_sampled_dict = {}

-            x_columns = [
-                c
-                for c in (validated_X.columns.tolist() + generated_features + [SYSTEM_RECORD_ID])
-                if c in enriched_Xy.columns
-            ]
+        for idx in range(eval_set_len):
+            enriched_eval_xy = enriched_df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
+            eval_x_sampled = enriched_eval_xy[x_columns].copy()
+            eval_y_sampled = enriched_eval_xy[TARGET].copy()
+            enriched_eval_x = enriched_eval_xy[enriched_X_columns].copy()
+            eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)

-            X_sampled = enriched_Xy[x_columns].copy()
-            y_sampled = enriched_Xy[TARGET].copy()
-            enriched_X = enriched_Xy.drop(columns=TARGET)
+        return eval_set_sampled_dict
+
+    def __cache_and_return_results(
+        self,
+        datasets_hash: str,
+        X_sampled: pd.DataFrame,
+        y_sampled: pd.Series,
+        enriched_X: pd.DataFrame,
+        eval_set_sampled_dict: Dict[int, Tuple],
+        columns_renaming: Dict[str, str],
+        search_keys: Dict[str, SearchKey],
+    ) -> _SampledDataForMetrics:

-        datasets_hash = hash_input(validated_X, validated_y, eval_set)
         self.__cached_sampled_datasets[datasets_hash] = (
             X_sampled,
             y_sampled,
             enriched_X,
             eval_set_sampled_dict,
-            self.search_keys,
+            search_keys,
             columns_renaming,
         )

         return self.__mk_sampled_data_tuple(
-            X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, self.search_keys, columns_renaming
+            X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
         )

     def __mk_sampled_data_tuple(
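
Note: with this refactoring, the three __sample_* paths converge on __cache_and_return_results(), which memoizes the sampled datasets under a hash of the inputs and returns the same data it caches. A simplified sketch of that pattern, with types and names reduced for illustration:

    from typing import Any, Dict, Tuple

    _cache: Dict[str, Tuple[Any, ...]] = {}

    def cache_and_return(datasets_hash: str, *parts: Any) -> Tuple[Any, ...]:
        _cache[datasets_hash] = parts  # keyed by hash_input(...) in the real code
        return parts
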
@@ -2047,7 +2067,11 @@ class FeaturesEnricher(TransformerMixin):
         search_keys: Dict,
         columns_renaming: Dict[str, str],
     ):
-        search_keys = {k: v for k, v in search_keys.items() if k in X_sampled.columns.to_list()}
+        search_keys = {
+            columns_renaming.get(k, k): v
+            for k, v in search_keys.items()
+            if columns_renaming.get(k, k) in X_sampled.columns.to_list()
+        }
         return FeaturesEnricher._SampledDataForMetrics(
             X_sampled=X_sampled,
             y_sampled=y_sampled,
@@ -2161,6 +2185,7 @@ if response.status_code == 200:
         trace_id: str,
         X: pd.DataFrame,
         *,
+        y: Optional[pd.Series] = None,
         exclude_features_sources: Optional[List[str]] = None,
         importance_threshold: Optional[float] = None,
         max_features: Optional[int] = None,
@@ -2169,8 +2194,7 @@ if response.status_code == 200:
         progress_bar: Optional[ProgressBar] = None,
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
         add_fit_system_record_id: bool = False,
-        target_name: Optional[str] = None,
-    ) -> Tuple[pd.DataFrame, Dict[str, str], List[str]]:
+    ) -> Tuple[pd.DataFrame, Dict[str, str], List[str], Dict[str, SearchKey]]:
         if self._search_task is None:
             raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))

@@ -2179,20 +2203,28 @@ if response.status_code == 200:
         self.logger.info("Start transform")

         validated_X = self._validate_X(X, is_transform=True)
+        if y is not None:
+            validated_y = self._validate_y(validated_X, y)
+            df = self.__combine_train_and_eval_sets(validated_X, validated_y, eval_set=None)
+        else:
+            validated_y = None
+            df = validated_X
+
+        validated_Xy = df.copy()

-        self.__log_debug_information(validated_X, exclude_features_sources=exclude_features_sources)
+        self.__log_debug_information(validated_X, validated_y, exclude_features_sources=exclude_features_sources)

         self.__validate_search_keys(self.search_keys, self.search_id)

         if len(self.feature_names_) == 0:
             self.logger.warning(self.bundle.get("no_important_features_for_transform"))
-            return X, {c: c for c in X.columns}, []
+            return X, {c: c for c in X.columns}, [], {}

         if self._has_paid_features(exclude_features_sources):
             msg = self.bundle.get("transform_with_paid_features")
             self.logger.warning(msg)
             self.__display_support_link(msg)
-            return None, {c: c for c in X.columns}, []
+            return None, {c: c for c in X.columns}, [], {}

         features_meta = self._search_task.get_all_features_metadata_v2()
         online_api_features = [fm.name for fm in features_meta if fm.from_online_api and fm.shap_value > 0]
@@ -2215,7 +2247,7 @@ if response.status_code == 200:
                 self.logger.warning(msg)
                 print(msg)
                 show_request_quote_button()
-                return None, {c: c for c in X.columns}, []
+                return None, {c: c for c in X.columns}, [], {}
             else:
                 msg = self.bundle.get("transform_usage_info").format(
                     transform_usage.limit, transform_usage.transformed_rows
@@ -2223,29 +2255,27 @@ if response.status_code == 200:
                 )
                 self.logger.info(msg)
                 print(msg)
-        is_demo_dataset = hash_input(validated_X) in DEMO_DATASET_HASHES
+        is_demo_dataset = hash_input(df) in DEMO_DATASET_HASHES

         columns_to_drop = [
-            c for c in validated_X.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
+            c for c in df.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
         ]
         if len(columns_to_drop) > 0:
             msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
             self.logger.warning(msg)
             print(msg)
-            validated_X = validated_X.drop(columns=columns_to_drop)
+            df = df.drop(columns=columns_to_drop)

         search_keys = self.search_keys.copy()
         if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
-            self.search_keys.update(
+            search_keys.update(
                 {col: SearchKey.CUSTOM_KEY for col in self.id_columns if col not in self.search_keys}
             )

         search_keys = self.__prepare_search_keys(
-            validated_X, search_keys, is_demo_dataset, is_transform=True, silent_mode=silent_mode
+            df, search_keys, is_demo_dataset, is_transform=True, silent_mode=silent_mode
         )

-        df = validated_X.copy()
-
         df = self.__handle_index_search_keys(df, search_keys)

         if DEFAULT_INDEX in df.columns:
@@ -2253,7 +2283,7 @@ if response.status_code == 200:
             self.logger.info(msg)
             print(msg)
             df.drop(columns=DEFAULT_INDEX, inplace=True)
-            validated_X.drop(columns=DEFAULT_INDEX, inplace=True)
+            validated_Xy.drop(columns=DEFAULT_INDEX, inplace=True)

         df = self.__add_country_code(df, search_keys)

@@ -2284,8 +2314,11 @@ if response.status_code == 200:
         features_for_transform = self._search_task.get_features_for_transform() or []
         if len(features_for_transform) > 0:
             missing_features_for_transform = [
-                columns_renaming.get(f) for f in features_for_transform if f not in df.columns
+                columns_renaming.get(f) or f for f in features_for_transform if f not in df.columns
             ]
+            if TARGET in missing_features_for_transform:
+                raise ValidationError(self.bundle.get("missing_target_for_transform"))
+
             if len(missing_features_for_transform) > 0:
                 raise ValidationError(
                     self.bundle.get("missing_features_for_transform").format(missing_features_for_transform)
@@ -2341,11 +2374,10 @@ if response.status_code == 200:
             converter = PostalCodeSearchKeyConverter(postal_code)
             df = converter.convert(df)

-        # generated_features = [f for f in generated_features if f in self.fit_generated_features]
+        meaning_types = {}
+        meaning_types.update({col: FileColumnMeaningType.FEATURE for col in features_for_transform})
+        meaning_types.update({col: key.value for col, key in search_keys.items()})

-        meaning_types = {col: key.value for col, key in search_keys.items()}
-        for col in features_for_transform:
-            meaning_types[col] = FileColumnMeaningType.FEATURE
         features_not_to_pass = [
             c
             for c in df.columns
@@ -2354,13 +2386,12 @@ if response.status_code == 200:
             and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
         ]

-        if add_fit_system_record_id and target_name is not None:
-            reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
+        if add_fit_system_record_id:
             df = self.__add_fit_system_record_id(
                 df,
                 search_keys,
                 SYSTEM_RECORD_ID,
-                reversed_columns_renaming.get(target_name, target_name),
+                TARGET,
                 columns_renaming,
                 silent=True,
             )
@@ -2485,28 +2516,34 @@ if response.status_code == 200:
             if progress_callback is not None:
                 progress_callback(progress)

-        def enrich():
-            res, _ = self.__enrich(
-                df_with_original_index,
-                validation_task.get_all_validation_raw_features(trace_id, metrics_calculation),
-                validated_X,
-                is_transform=True,
-            )
-            return res
-
         if not silent_mode:
             print(self.bundle.get("transform_start"))
-            # with Spinner():
-            result = enrich()
-        else:
-            result = enrich()
+
+        # Prepare input DataFrame for __enrich by concatenating generated ids and client features
+        combined_df = pd.concat(
+            [
+                validated_Xy.reset_index(drop=True),
+                df_with_original_index.reset_index(drop=True),
+            ],
+            axis=1,
+        ).set_index(validated_Xy.index)
+
+        result_features = validation_task.get_all_validation_raw_features(trace_id, metrics_calculation)
+
+        result = self.__enrich(
+            combined_df,
+            result_features,
+            how="left",
+        )

         selecting_columns = [
             c
-            for c in itertools.chain(validated_X.columns.tolist(), generated_features)
+            for c in itertools.chain(validated_Xy.columns.tolist(), generated_features)
             if c not in self.dropped_client_feature_names_
         ]
-        filtered_columns = self.__filtered_enriched_features(importance_threshold, max_features)
+        filtered_columns = self.__filtered_enriched_features(
+            importance_threshold, max_features, trace_id, validated_X
+        )
         selecting_columns.extend(
             c for c in filtered_columns if c in result.columns and c not in validated_X.columns
         )
@@ -2515,7 +2552,7 @@ if response.status_code == 200:

         selecting_columns = list(set(selecting_columns))
         # sorting: first columns from X, then generated features, then enriched features
-        sorted_selecting_columns = [c for c in validated_X.columns if c in selecting_columns]
+        sorted_selecting_columns = [c for c in validated_Xy.columns if c in selecting_columns]
         for c in generated_features:
             if c in selecting_columns and c not in sorted_selecting_columns:
                 sorted_selecting_columns.append(c)
@@ -2533,7 +2570,7 @@ if response.status_code == 200:
         if add_fit_system_record_id:
             result = result.rename(columns={SORT_ID: SYSTEM_RECORD_ID})

-        return result, columns_renaming, generated_features
+        return result, columns_renaming, generated_features, search_keys

     def _get_excluded_features(self, max_features: Optional[int], importance_threshold: Optional[float]) -> List[str]:
         features_info = self._internal_features_info
@@ -2643,6 +2680,7 @@ if response.status_code == 200:
         importance_threshold: Optional[float],
         max_features: Optional[int],
         remove_outliers_calc_metrics: Optional[bool],
+        auto_fe_parameters: Optional[AutoFEParameters] = None,
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
         search_id_callback: Optional[Callable[[str], Any]] = None,
     ):
@@ -2948,6 +2986,7 @@ if response.status_code == 200:
             runtime_parameters=runtime_parameters,
             exclude_features_sources=exclude_features_sources,
             force_downsampling=force_downsampling,
+            auto_fe_parameters=auto_fe_parameters,
         )

         if search_id_callback is not None:
@@ -3211,8 +3250,7 @@ if response.status_code == 200:
     def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
         if len(eval_pair) != 2:
             raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
-        eval_X = eval_pair[0]
-        eval_y = eval_pair[1]
+        eval_X, eval_y = eval_pair

         if _num_samples(eval_X) == 0:
             raise ValidationError(self.bundle.get("eval_x_is_empty"))
@@ -3712,23 +3750,19 @@ if response.status_code == 200:

     def __enrich(
         self,
-        df_with_original_index: pd.DataFrame,
+        input_df: pd.DataFrame,
         result_features: Optional[pd.DataFrame],
-        X: Optional[pd.DataFrame] = None,
-        is_transform=False,
-        rows_to_drop: Optional[pd.DataFrame] = None,
+        how: str = "inner",
         drop_system_record_id=True,
-    ) -> Tuple[pd.DataFrame, Dict[int, pd.DataFrame]]:
+    ) -> pd.DataFrame:
         if result_features is None:
             self.logger.error(f"result features not found by search_task_id: {self.get_search_id()}")
             raise RuntimeError(self.bundle.get("features_wasnt_returned"))
-        result_features = (
-            result_features.drop(columns=EVAL_SET_INDEX)
-            if EVAL_SET_INDEX in result_features.columns
-            else result_features
-        )

-        comparing_columns = X.columns if is_transform else df_with_original_index.columns
+        if EVAL_SET_INDEX in result_features.columns:
+            result_features = result_features.drop(columns=EVAL_SET_INDEX)
+
+        comparing_columns = input_df.columns
         dup_features = [
             c
             for c in comparing_columns
@@ -3738,63 +3772,80 @@ if response.status_code == 200:
             self.logger.warning(f"X contain columns with same name as returned from backend: {dup_features}")
             raise ValidationError(self.bundle.get("returned_features_same_as_passed").format(dup_features))

-        # index overrites from result_features
-        original_index_name = df_with_original_index.index.name
-        df_with_original_index = df_with_original_index.reset_index()
+        # Handle index and column renaming
+        original_index_name = input_df.index.name
+        renamed_column = None
+
+        # Handle column rename if it conflicts with index name
+        if original_index_name in input_df.columns:
+            renamed_column = f"{original_index_name}_renamed"
+            input_df = input_df.rename(columns={original_index_name: renamed_column})
+
+        # Reset index for the merge operation
+        input_df = input_df.reset_index()
+
         # TODO drop system_record_id before merge
+        # Merge with result features
         result_features = pd.merge(
-            df_with_original_index,
+            input_df,
             result_features,
             on=ENTITY_SYSTEM_RECORD_ID,
-            how="left" if is_transform else "inner",
+            how=how,
         )
+
+        # Restore the index
         result_features = result_features.set_index(original_index_name or DEFAULT_INDEX)
         result_features.index.name = original_index_name

-        if rows_to_drop is not None:
-            self.logger.info(f"Before dropping target outliers size: {len(result_features)}")
-            result_features = result_features[
-                ~result_features[ENTITY_SYSTEM_RECORD_ID].isin(rows_to_drop[ENTITY_SYSTEM_RECORD_ID])
-            ]
-            self.logger.info(f"After dropping target outliers size: {len(result_features)}")
-
-        result_eval_sets = {}
-        if not is_transform and EVAL_SET_INDEX in result_features.columns:
-            result_train_features = result_features.loc[result_features[EVAL_SET_INDEX] == 0].copy()
-            eval_set_indices = list(result_features[EVAL_SET_INDEX].unique())
-            if 0 in eval_set_indices:
-                eval_set_indices.remove(0)
-            for eval_set_index in eval_set_indices:
-                result_eval_sets[eval_set_index] = result_features.loc[
-                    result_features[EVAL_SET_INDEX] == eval_set_index
-                ].copy()
-            result_train_features = result_train_features.drop(columns=EVAL_SET_INDEX)
-        else:
-            result_train_features = result_features
-
-        if is_transform:
-            index_name = X.index.name
-            renamed_column = None
-            if index_name in X.columns:
-                renamed_column = f"{index_name}_renamed"
-                X = X.rename(columns={index_name: renamed_column})
-            result_train = pd.concat([X.reset_index(), result_train_features.reset_index(drop=True)], axis=1).set_index(
-                index_name or DEFAULT_INDEX
-            )
-            result_train.index.name = index_name
-            if renamed_column is not None:
-                result_train = result_train.rename(columns={renamed_column: index_name})
-        else:
-            result_train = result_train_features
+        # Restore renamed column if needed
+        if renamed_column is not None:
+            result_features = result_features.rename(columns={renamed_column: original_index_name})

         if drop_system_record_id:
-            result_train = result_train.drop(columns=[SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID], errors="ignore")
-            for eval_set_index in result_eval_sets.keys():
-                result_eval_sets[eval_set_index] = result_eval_sets[eval_set_index].drop(
-                    columns=[SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID], errors="ignore"
-                )
+            result_features = result_features.drop(columns=[SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID], errors="ignore")

-        return result_train, result_eval_sets
+        return result_features
+
+    def __get_features_importance_from_server(self, trace_id: str, df: pd.DataFrame):
+        if self._search_task is None:
+            raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
+        features_meta = self._search_task.get_all_features_metadata_v2()
+        if features_meta is None:
+            raise Exception(self.bundle.get("missing_features_meta"))
+        features_meta = deepcopy(features_meta)
+
+        original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
+        df = df.rename(columns=original_names_dict)
+
+        features_meta.sort(key=lambda m: (-m.shap_value, m.name))
+
+        importances = {}
+
+        for feature_meta in features_meta:
+            if feature_meta.name in original_names_dict.keys():
+                feature_meta.name = original_names_dict[feature_meta.name]
+
+            is_client_feature = feature_meta.name in df.columns
+
+            if feature_meta.shap_value == 0.0:
+                continue
+
+            # Use only important features
+            if (
+                feature_meta.name == COUNTRY
+                # In select_features mode we select also from etalon features and need to show them
+                or (not self.fit_select_features and is_client_feature)
+            ):
+                continue
+
+            # Temporary workaround for duplicate features metadata
+            if feature_meta.name in importances:
+                self.logger.warning(f"WARNING: Duplicate feature metadata: {feature_meta}")
+                continue
+
+            importances[feature_meta.name] = feature_meta.shap_value
+
+        return importances

     def __prepare_feature_importances(
         self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
@@ -3804,6 +3855,7 @@ if response.status_code == 200:
         features_meta = self._search_task.get_all_features_metadata_v2()
         if features_meta is None:
             raise Exception(self.bundle.get("missing_features_meta"))
+        features_meta = deepcopy(features_meta)

         original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
         features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
@@ -3819,15 +3871,23 @@ if response.status_code == 200:

         original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}

-        if updated_shaps is not None:
-            for fm in features_meta:
-                fm.shap_value = updated_shaps.get(fm.name, 0.0)
-
-        features_meta.sort(key=lambda m: (-m.shap_value, m.name))
         for feature_meta in features_meta:
             if feature_meta.name in original_names_dict.keys():
                 feature_meta.name = original_names_dict[feature_meta.name]

+            if updated_shaps is not None:
+                updating_shap = updated_shaps.get(feature_meta.name)
+                if updating_shap is None:
+                    self.logger.warning(
+                        f"WARNING: Shap value for feature {feature_meta.name} not found and will be set to 0.0"
+                    )
+                    updating_shap = 0.0
+                feature_meta.shap_value = updating_shap
+
+        features_meta.sort(key=lambda m: (-m.shap_value, m.name))
+
+        for feature_meta in features_meta:
+
             is_client_feature = feature_meta.name in df.columns

             # TODO make a decision about selected features based on special flag from mlb
@@ -3839,7 +3899,7 @@ if response.status_code == 200:
             # Use only important features
             if (
                 # feature_meta.name in self.fit_generated_features or
-                feature_meta.name == COUNTRY
+                feature_meta.name == COUNTRY  # constant synthetic column
                 # In select_features mode we select also from etalon features and need to show them
                 or (not self.fit_select_features and is_client_feature)
             ):
@@ -3981,16 +4041,19 @@ if response.status_code == 200:
         )

     def __filtered_importance_names(
-        self, importance_threshold: Optional[float], max_features: Optional[int]
+        self, importance_threshold: Optional[float], max_features: Optional[int], trace_id: str, df: pd.DataFrame
     ) -> List[str]:
-        if len(self.feature_names_) == 0:
-            return []
+        # get features importance from server
+        filtered_importances = self.__get_features_importance_from_server(trace_id, df)

-        filtered_importances = list(zip(self.feature_names_, self.feature_importances_))
+        if len(filtered_importances) == 0:
+            return []

         if importance_threshold is not None:
             filtered_importances = [
-                (name, importance) for name, importance in filtered_importances if importance > importance_threshold
+                (name, importance)
+                for name, importance in filtered_importances.items()
+                if importance > importance_threshold
             ]
         if max_features is not None:
             filtered_importances = list(filtered_importances)[:max_features]
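
Note: __filtered_importance_names() now pulls per-feature SHAP importances from the server via the new __get_features_importance_from_server helper instead of reading self.feature_names_ and self.feature_importances_, then applies the same threshold and max_features caps. The filtering step, reduced to plain Python with invented values:

    importances = {"feature_a": 0.30, "feature_b": 0.05, "feature_c": 0.01}  # illustrative SHAP values
    importance_threshold, max_features = 0.02, 2

    filtered = [(name, imp) for name, imp in importances.items() if imp > importance_threshold]
    filtered = filtered[:max_features]
    selected = [name for name, _ in filtered]
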
@@ -4129,7 +4192,7 @@ if response.status_code == 200:
             max_features=max_features,
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
             trace_id=trace_id,
-            silent=True,
+            internal_call=True,
             progress_bar=progress_bar,
             progress_callback=progress_callback,
         )
@@ -4203,11 +4266,13 @@ if response.status_code == 200:
         self,
         importance_threshold: Optional[float],
         max_features: Optional[int],
+        trace_id: str,
+        df: pd.DataFrame,
     ) -> List[str]:
         importance_threshold = self.__validate_importance_threshold(importance_threshold)
         max_features = self.__validate_max_features(max_features)

-        return self.__filtered_importance_names(importance_threshold, max_features)
+        return self.__filtered_importance_names(importance_threshold, max_features, trace_id, df)

     def __detect_missing_search_keys(
         self,