upgini 1.2.114a4__py3-none-any.whl → 1.2.115a1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
@@ -1,8 +1,6 @@
  import dataclasses
  import datetime
  import gc
- import hashlib
- import itertools
  import json
  import logging
  import os
@@ -14,7 +12,7 @@ from collections import Counter
  from copy import deepcopy
  from dataclasses import dataclass
  from threading import Thread
- from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
+ from typing import Any, Callable

  import numpy as np
  import pandas as pd
@@ -101,7 +99,7 @@ from upgini.utils.email_utils import (
  from upgini.utils.feature_info import FeatureInfo, _round_shap_value
  from upgini.utils.features_validator import FeaturesValidator
  from upgini.utils.format import Format
- from upgini.utils.hash_utils import file_hash
+ from upgini.utils.hash_utils import file_hash, hash_input
  from upgini.utils.ip_utils import IpSearchKeyConverter
  from upgini.utils.phone_utils import PhoneSearchKeyDetector
  from upgini.utils.postal_code_utils import PostalCodeSearchKeyDetector
@@ -113,10 +111,11 @@ except Exception:
  CustomFallbackProgressBar as ProgressBar,
  )

+ from upgini.utils.config import SampleConfig
  from upgini.utils.psi import calculate_features_psi, calculate_sparsity_psi
- from upgini.utils.sample_utils import SampleColumns, SampleConfig, _num_samples, sample
+ from upgini.utils.sample_utils import SampleColumns, _num_samples, sample
  from upgini.utils.sort import sort_columns
- from upgini.utils.target_utils import calculate_psi, define_task
+ from upgini.utils.target_utils import calculate_psi, define_task, is_imbalanced
  from upgini.utils.warning_counter import WarningCounter
  from upgini.version_validator import validate_version

@@ -132,7 +131,7 @@ class FeaturesEnricher(TransformerMixin):
  Parameters
  ----------
  search_keys: dict of str->SearchKey or int->SearchKey
- Dictionary with column names or indices mapping to key types.
+ dictionary with column names or indices mapping to key types.
  Each of this columns will be used as a search key to find features.

  country_code: str, optional (default=None)
@@ -164,7 +163,7 @@ class FeaturesEnricher(TransformerMixin):
  Custom loss function to use for feature selection and metrics calculation.

  shared_datasets: list of str, optional (default=None)
- List of private shared dataset ids for custom search
+ list of private shared dataset ids for custom search
  """

  TARGET_NAME = "target"
@@ -208,34 +207,34 @@ class FeaturesEnricher(TransformerMixin):

  def __init__(
  self,
- search_keys: Optional[Dict[str, SearchKey]] = None,
- country_code: Optional[str] = None,
- model_task_type: Optional[Union[ModelTaskType, str]] = None,
- api_key: Optional[str] = None,
- endpoint: Optional[str] = None,
- search_id: Optional[str] = None,
- shared_datasets: Optional[List[str]] = None,
- runtime_parameters: Optional[RuntimeParameters] = None,
- date_format: Optional[str] = None,
+ search_keys: dict[str, SearchKey] | None = None,
+ country_code: str | None = None,
+ model_task_type: ModelTaskType | str | None = None,
+ api_key: str | None = None,
+ endpoint: str | None = None,
+ search_id: str | None = None,
+ shared_datasets: list[str] | None = None,
+ runtime_parameters: RuntimeParameters | None = None,
+ date_format: str | None = None,
  random_state: int = 42,
- cv: Optional[CVType] = None,
- loss: Optional[str] = None,
+ cv: CVType | None = None,
+ loss: str | None = None,
  autodetect_search_keys: bool = True,
- generate_features: Optional[List[str]] = None,
- columns_for_online_api: Optional[List[str]] = None,
- round_embeddings: Optional[int] = None,
+ generate_features: list[str] | None = None,
+ columns_for_online_api: list[str] | None = None,
+ round_embeddings: int | None = None,
  logs_enabled: bool = True,
  raise_validation_error: bool = True,
- exclude_columns: Optional[List[str]] = None,
- baseline_score_column: Optional[Any] = None,
- client_ip: Optional[str] = None,
- client_visitorid: Optional[str] = None,
- custom_bundle_config: Optional[str] = None,
+ exclude_columns: list[str] | None = None,
+ baseline_score_column: Any | None = None,
+ client_ip: str | None = None,
+ client_visitorid: str | None = None,
+ custom_bundle_config: str | None = None,
  add_date_if_missing: bool = True,
  disable_force_downsampling: bool = False,
- id_columns: Optional[List[str]] = None,
+ id_columns: list[str] | None = None,
  generate_search_key_features: bool = True,
- sample_config: Optional[SampleConfig] = None,
+ sample_config: SampleConfig | None = None,
  print_trace_id: bool = False,
  **kwargs,
  ):
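Throughout this release the annotations move from `typing.Optional`/`Union`/`Dict`/`List` to PEP 604 unions and PEP 585 builtin generics, which is why the `typing` import above shrinks to `Any` and `Callable`. A minimal sketch of the equivalence (not from the package); note that annotations written this way need Python 3.10+ at runtime, or `from __future__ import annotations` on older interpreters:

```python
# Sketch only: the two spellings below are equivalent.
from typing import Dict, Optional

def old_style(search_keys: Optional[Dict[str, str]] = None) -> None:
    """Pre-3.10 typing-module spelling."""

def new_style(search_keys: dict[str, str] | None = None) -> None:
    """PEP 585 builtin generic + PEP 604 union, as used in this release."""
```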
@@ -259,21 +258,21 @@ class FeaturesEnricher(TransformerMixin):
  self.logger.warning(msg)
  print(msg)

- self.passed_features: List[str] = []
- self.df_with_original_index: Optional[pd.DataFrame] = None
- self.fit_columns_renaming: Optional[Dict[str, str]] = None
+ self.passed_features: list[str] = []
+ self.df_with_original_index: pd.DataFrame | None = None
+ self.fit_columns_renaming: dict[str, str] | None = None
  self.country_added = False
- self.fit_generated_features: List[str] = []
- self.fit_dropped_features: Set[str] = set()
+ self.fit_generated_features: list[str] = []
+ self.fit_dropped_features: set[str] = set()
  self.fit_search_keys = search_keys
  self.warning_counter = WarningCounter()
- self.X: Optional[pd.DataFrame] = None
- self.y: Optional[pd.Series] = None
- self.eval_set: Optional[List[Tuple]] = None
- self.autodetected_search_keys: Dict[str, SearchKey] = {}
+ self.X: pd.DataFrame | None = None
+ self.y: pd.Series | None = None
+ self.eval_set: list[tuple] | None = None
+ self.autodetected_search_keys: dict[str, SearchKey] = {}
  self.imbalanced = False
  self.fit_select_features = True
- self.__cached_sampled_datasets: Dict[str, Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict, Dict]] = (
+ self.__cached_sampled_datasets: dict[str, tuple[pd.DataFrame, pd.DataFrame, pd.Series, dict, dict, dict]] = (
  dict()
  )

@@ -289,19 +288,17 @@ class FeaturesEnricher(TransformerMixin):
  self.model_task_type = ModelTaskType.parse(model_task_type)
  self.model_task_type = model_task_type
  self.endpoint = endpoint
- self._search_task: Optional[SearchTask] = None
+ self._search_task: SearchTask | None = None
  self.features_info: pd.DataFrame = self.EMPTY_FEATURES_INFO
  self._features_info_without_links: pd.DataFrame = self.EMPTY_FEATURES_INFO
  self._internal_features_info: pd.DataFrame = self.EMPTY_INTERNAL_FEATURES_INFO
  self.relevant_data_sources: pd.DataFrame = self.EMPTY_DATA_SOURCES
  self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
- self.metrics: Optional[pd.DataFrame] = None
+ self.metrics: pd.DataFrame | None = None
  self.feature_names_ = []
  self.external_source_feature_names = []
- self.zero_shap_client_features = []
- self.unstable_client_features = []
  self.feature_importances_ = []
- self.psi_values: Optional[Dict[str, float]] = None
+ self.psi_values: dict[str, float] | None = None
  self.search_id = search_id
  self.disable_force_downsampling = disable_force_downsampling
  self.print_trace_id = print_trace_id
@@ -321,7 +318,8 @@ class FeaturesEnricher(TransformerMixin):
  x_columns = [c.name for c in file_metadata.columns]
  self.fit_columns_renaming = {c.name: c.originalName for c in file_metadata.columns}
  df = pd.DataFrame(columns=x_columns)
- self.__prepare_feature_importances(trace_id, df, silent=True)
+ self.__prepare_feature_importances(trace_id, df, silent=True, update_selected_features=False)
+ self.__show_selected_features()
  # TODO validate search_keys with search_keys from file_metadata
  print(self.bundle.get("search_by_task_id_finish"))
  self.logger.debug(f"Successfully initialized with search_id: {search_id}")
@@ -377,7 +375,7 @@ class FeaturesEnricher(TransformerMixin):
  self.autofe_features_display_handle = None
  self.report_button_handle = None

- def _get_sample_config(self, sample_config: Optional[SampleConfig] = None):
+ def _get_sample_config(self, sample_config: SampleConfig | None = None):
  sample_config = sample_config or SampleConfig(force_sample_size=Dataset.FORCE_SAMPLE_SIZE)

  maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
@@ -432,22 +430,21 @@ class FeaturesEnricher(TransformerMixin):

  def fit(
  self,
- X: Union[pd.DataFrame, pd.Series, np.ndarray],
- y: Union[pd.Series, np.ndarray, List],
- eval_set: Optional[Union[List[tuple], tuple]] = None,
+ X: pd.DataFrame | pd.Series | np.ndarray,
+ y: pd.Series | np.ndarray | list,
+ eval_set: list[tuple] | tuple | None = None,
  *args,
- exclude_features_sources: Optional[List[str]] = None,
- calculate_metrics: Optional[bool] = None,
- estimator: Optional[Any] = None,
- scoring: Union[Callable, str, None] = None,
- importance_threshold: Optional[float] = None,
- max_features: Optional[int] = None,
- remove_outliers_calc_metrics: Optional[bool] = None,
- progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
- search_id_callback: Optional[Callable[[str], Any]] = None,
+ exclude_features_sources: list[str] | None = None,
+ calculate_metrics: bool | None = None,
+ estimator: Any | None = None,
+ scoring: Callable | str | None = None,
+ remove_outliers_calc_metrics: bool | None = None,
+ progress_callback: Callable[[SearchProgress], Any] | None = None,
+ search_id_callback: Callable[[str], Any] | None = None,
  select_features: bool = True,
- auto_fe_parameters: Optional[AutoFEParameters] = None,
- stability_threshold: float = 0.15,
+ auto_fe_parameters: AutoFEParameters | None = None,
+ stability_threshold: float = 0.2,
+ stability_agg_func: str = "max",
  **kwargs,
  ):
  """Fit to data.
@@ -462,14 +459,8 @@ class FeaturesEnricher(TransformerMixin):
  y: array-like of shape (n_samples,)
  Target values.

- eval_set: List[tuple], optional (default=None)
- List of pairs (X, y) for validation.
-
- importance_threshold: float, optional (default=None)
- Minimum SHAP value to select a feature. Default value is 0.0.
-
- max_features: int, optional (default=None)
- Maximum number of most important features to select. If None, the number is unlimited.
+ eval_set: list[tuple], optional (default=None)
+ list of pairs (X, y) for validation.

  calculate_metrics: bool, optional (default=None)
  Whether to calculate and show metrics.
@@ -487,6 +478,13 @@ class FeaturesEnricher(TransformerMixin):
  select_features: bool, optional (default=False)
  If True, return only selected features both from input and data sources.
  Otherwise, return all features from input and only selected features from data sources.
+
+ stability_threshold: float, optional (default=0.2)
+ Stability threshold for selected features PSI calculation. If PSI is less than this threshold,
+ then feature will be dropped.
+
+ stability_agg_func: str, optional (default="max")
+ Function to aggregate stability values. Can be "max", "min", "mean".
  """
  trace_id = str(uuid.uuid4())
  if self.print_trace_id:
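The net effect on `fit`: the `importance_threshold`/`max_features` selection knobs are gone, and PSI-based stability filtering is tunable through the two new parameters (note that `_check_stability` later in this file flags features whose PSI is *above* the threshold as unstable, so the docstring's "less than" appears inverted). A hypothetical call against the new signature — data, column names and key types are invented for illustration:

```python
import pandas as pd
from upgini import FeaturesEnricher, SearchKey  # top-level import path assumed

# Toy training frame with a date search key; values are illustrative only.
train = pd.DataFrame({
    "date": pd.date_range("2024-01-01", periods=100).astype(str),
    "feature_1": range(100),
})
target = pd.Series([i % 2 for i in range(100)])

enricher = FeaturesEnricher(search_keys={"date": SearchKey.DATE})
enricher.fit(
    train,
    target,
    stability_threshold=0.2,   # new default (was 0.15)
    stability_agg_func="max",  # new: aggregate PSI as "max", "min" or "mean"
)
```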
@@ -536,9 +534,8 @@ class FeaturesEnricher(TransformerMixin):
  calculate_metrics=calculate_metrics,
  estimator=estimator,
  scoring=scoring,
- importance_threshold=importance_threshold,
  stability_threshold=stability_threshold,
- max_features=max_features,
+ stability_agg_func=stability_agg_func,
  remove_outliers_calc_metrics=remove_outliers_calc_metrics,
  auto_fe_parameters=auto_fe_parameters,
  progress_callback=progress_callback,
@@ -582,28 +579,26 @@ class FeaturesEnricher(TransformerMixin):

  def fit_transform(
  self,
- X: Union[pd.DataFrame, pd.Series, np.ndarray],
- y: Union[pd.DataFrame, pd.Series, np.ndarray, List],
- eval_set: Optional[Union[List[tuple], tuple]] = None,
+ X: pd.DataFrame | pd.Series | np.ndarray,
+ y: pd.DataFrame | pd.Series | np.ndarray | list,
+ eval_set: list[tuple] | tuple | None = None,
  *args,
- exclude_features_sources: Optional[List[str]] = None,
- keep_input: bool = True,
- importance_threshold: Optional[float] = None,
- max_features: Optional[int] = None,
- calculate_metrics: Optional[bool] = None,
- scoring: Union[Callable, str, None] = None,
- estimator: Optional[Any] = None,
- remove_outliers_calc_metrics: Optional[bool] = None,
- progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
+ exclude_features_sources: list[str] | None | None = None,
+ keep_input: bool | None = None,
+ calculate_metrics: bool | None = None,
+ scoring: Callable | str | None = None,
+ estimator: Any | None = None,
+ remove_outliers_calc_metrics: bool | None = None,
+ progress_callback: Callable[[SearchProgress], Any] | None = None,
  select_features: bool = True,
- auto_fe_parameters: Optional[AutoFEParameters] = None,
- stability_threshold: float = 0.15,
+ auto_fe_parameters: AutoFEParameters | None = None,
+ stability_threshold: float = 0.2,
+ stability_agg_func: str = "max",
  **kwargs,
  ) -> pd.DataFrame:
  """Fit to data, then transform it.

  Fits transformer to `X` and `y` and returns a transformed version of `X`.
- If keep_input is True, then all input columns are copied to the output dataframe.

  Parameters
  ----------
@@ -613,20 +608,14 @@ class FeaturesEnricher(TransformerMixin):
  y: array-like of shape (n_samples,)
  Target values.

- eval_set: List[tuple], optional (default=None)
- List of pairs (X, y) for validation.
+ eval_set: list[tuple], optional (default=None)
+ list of pairs (X, y) for validation.

  keep_input: bool, optional (default=True)
+ keep_input: bool, optional (default=None)
  If True, copy original input columns to the output dataframe.
-
- importance_threshold: float, optional (default=None)
- Minimum SHAP value to select a feature. Default value is 0.0.
-
- max_features: int, optional (default=None)
- Maximum number of most important features to select. If None, the number is unlimited.
-
- calculate_metrics: bool, optional (default=None)
- Whether to calculate and show metrics.
+ If False, then only enriched columns are returned.
+ If None, then all search keys, ID columns, selected client features and enriched columns will be returned.

  estimator: sklearn-compatible estimator, optional (default=None)
  Custom estimator for metrics calculation.
@@ -642,10 +631,13 @@ class FeaturesEnricher(TransformerMixin):
  If True, return only selected features both from input and data sources.
  Otherwise, return all features from input and only selected features from data sources.

- stability_threshold: float, optional (default=0.15)
+ stability_threshold: float, optional (default=0.2)
  Stability threshold for selected features PSI calculation. If PSI is less than this threshold,
  then feature will be dropped.

+ stability_agg_func: str, optional (default="max")
+ Function to aggregate stability values. Can be "max", "min", "mean".
+
  Returns
  -------
  X_new: pandas.DataFrame of shape (n_samples, n_features_new)
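`keep_input` is now three-valued in `fit_transform` (and in `transform` below): `True` keeps every input column, `False` returns only enriched columns, and the new default `None` returns search keys, ID columns, selected client features and enriched columns. A hypothetical sketch of the three modes, reusing `enricher`, `train` and `target` from the earlier example:

```python
# keep_input=True: every original input column is copied to the output.
with_input = enricher.fit_transform(train, target, keep_input=True)

# keep_input=False: only the enriched columns come back.
enriched_only = enricher.fit_transform(train, target, keep_input=False)

# keep_input=None (new default): search keys, ID columns, selected client
# features and enriched columns are returned.
default_selection = enricher.fit_transform(train, target)
```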
@@ -701,9 +693,8 @@ class FeaturesEnricher(TransformerMixin):
  calculate_metrics=calculate_metrics,
  scoring=scoring,
  estimator=estimator,
- importance_threshold=importance_threshold,
  stability_threshold=stability_threshold,
- max_features=max_features,
+ stability_agg_func=stability_agg_func,
  remove_outliers_calc_metrics=remove_outliers_calc_metrics,
  auto_fe_parameters=auto_fe_parameters,
  progress_callback=progress_callback,
@@ -746,8 +737,6 @@ class FeaturesEnricher(TransformerMixin):
  X,
  exclude_features_sources=exclude_features_sources,
  keep_input=keep_input,
- importance_threshold=importance_threshold,
- max_features=max_features,
  trace_id=trace_id,
  silent_mode=True,
  progress_bar=progress_bar,
@@ -760,36 +749,29 @@ class FeaturesEnricher(TransformerMixin):
  self,
  X: pd.DataFrame,
  *args,
- y: Optional[pd.Series] = None,
- exclude_features_sources: Optional[List[str]] = None,
- keep_input: bool = True,
- importance_threshold: Optional[float] = None,
- max_features: Optional[int] = None,
- trace_id: Optional[str] = None,
+ y: pd.Series | None = None,
+ exclude_features_sources: list[str] | None = None,
+ keep_input: bool | None = None,
+ trace_id: str | None = None,
  metrics_calculation: bool = False,
  silent_mode=False,
- progress_bar: Optional[ProgressBar] = None,
- progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
+ progress_bar: ProgressBar | None = None,
+ progress_callback: Callable[[SearchProgress], Any] | None = None,
  **kwargs,
  ) -> pd.DataFrame:
  """Transform `X`.

  Returns a transformed version of `X`.
- If keep_input is True, then all input columns are copied to the output dataframe.

  Parameters
  ----------
  X: pandas.DataFrame of shape (n_samples, n_features)
  Input samples.

- keep_input: bool, optional (default=True)
+ keep_input: bool, optional (default=None)
  If True, copy original input columns to the output dataframe.
-
- importance_threshold: float, optional (default=None)
- Minimum SHAP value to select a feature. Default value is 0.0.
-
- max_features: int, optional (default=None)
- Maximum number of most important features to select. If None, the number is unlimited.
+ If False, then only enriched columns are returned.
+ If None, then all search keys, ID columns, selected client features and enriched columns will be returned.

  Returns
  -------
@@ -827,11 +809,10 @@ class FeaturesEnricher(TransformerMixin):
  X,
  y=y,
  exclude_features_sources=exclude_features_sources,
- importance_threshold=importance_threshold,
- max_features=max_features,
  metrics_calculation=metrics_calculation,
  silent_mode=silent_mode,
  progress_bar=progress_bar,
+ keep_input=keep_input,
  )
  self.logger.info("Transform finished successfully")
  search_progress = SearchProgress(100.0, ProgressStage.FINISHED)
@@ -873,32 +854,26 @@ class FeaturesEnricher(TransformerMixin):
  raise e
  finally:
  self.logger.info(f"Transform elapsed time: {time.time() - start_time}")
-
- if result is not None:
- if keep_input:
- return result
- else:
- return result.drop(columns=X.columns, errors="ignore")
+
+ return result

  def calculate_metrics(
  self,
- X: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
- y: Union[pd.DataFrame, pd.Series, np.ndarray, List, None] = None,
- eval_set: Optional[Union[List[tuple], tuple]] = None,
+ X: pd.DataFrame | pd.Series | np.ndarray | None = None,
+ y: pd.DataFrame | pd.Series | np.ndarray | list | None = None,
+ eval_set: list[tuple] | tuple | None = None,
  *args,
- scoring: Union[Callable, str, None] = None,
- cv: Union[BaseCrossValidator, CVType, None] = None,
+ scoring: Callable | str | None = None,
+ cv: BaseCrossValidator | CVType | str | None = None,
  estimator=None,
- exclude_features_sources: Optional[List[str]] = None,
- importance_threshold: Optional[float] = None,
- max_features: Optional[int] = None,
- remove_outliers_calc_metrics: Optional[bool] = None,
- trace_id: Optional[str] = None,
+ exclude_features_sources: list[str] | None = None,
+ remove_outliers_calc_metrics: bool | None = None,
+ trace_id: str | None = None,
  internal_call: bool = False,
- progress_bar: Optional[ProgressBar] = None,
- progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
+ progress_bar: ProgressBar | None = None,
+ progress_callback: Callable[[SearchProgress], Any] | None = None,
  **kwargs,
- ) -> Optional[pd.DataFrame]:
+ ) -> pd.DataFrame | None:
  """Calculate metrics

  Parameters
@@ -909,8 +884,8 @@ class FeaturesEnricher(TransformerMixin):
  y: array-like of shape (n_samples,), optional (default=None)
  Target values. If X not passed then y from fit will be used

- eval_set: List[tuple], optional (default=None)
- List of pairs (X, y) for validation. If X not passed then eval_set from fit will be used
+ eval_set: list[tuple], optional (default=None)
+ list of pairs (X, y) for validation. If X not passed then eval_set from fit will be used

  scoring: string or callable, optional (default=None)
  A string or a scorer callable object / function with signature scorer(estimator, X, y).
@@ -922,12 +897,6 @@ class FeaturesEnricher(TransformerMixin):
  estimator: sklearn-compatible estimator, optional (default=None)
  Custom estimator for metrics calculation. If not passed then CatBoost will be used.

- importance_threshold: float, optional (default=None)
- Minimum SHAP value to select a feature. Default value is 0.0.
-
- max_features: int, optional (default=None)
- Maximum number of most important features to select. If None, the number is unlimited.
-
  remove_outliers_calc_metrics, optional (default=True)
  If True then rows with target ouliers will be dropped on metrics calculation

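With the selection thresholds removed, `calculate_metrics` now always scores the feature set selected at fit time. A hypothetical call — `"roc_auc"` is a standard sklearn scorer name, and with no arguments the X/y/eval_set from `fit` are reused:

```python
# Sketch only: assumes `enricher.fit(...)` has already completed.
metrics_df = enricher.calculate_metrics(scoring="roc_auc")
if metrics_df is not None:  # the method returns None when metrics can't be computed
    print(metrics_df)
```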
@@ -990,8 +959,6 @@ class FeaturesEnricher(TransformerMixin):
  validated_eval_set,
  exclude_features_sources=exclude_features_sources,
  cv=cv if cv is not None else self.cv,
- importance_threshold=importance_threshold,
- max_features=max_features,
  scoring=scoring,
  estimator=estimator,
  remove_outliers_calc_metrics=remove_outliers_calc_metrics,
@@ -1032,20 +999,19 @@ class FeaturesEnricher(TransformerMixin):
  search_keys_for_metrics.extend([c for c in self.id_columns or [] if c not in search_keys_for_metrics])
  self.logger.info(f"Search keys for metrics: {search_keys_for_metrics}")

- prepared_data = self._prepare_data_for_metrics(
+ prepared_data = self._get_cached_enriched_data(
  trace_id=trace_id,
  X=X,
  y=y,
  eval_set=eval_set,
  exclude_features_sources=exclude_features_sources,
- importance_threshold=importance_threshold,
- max_features=max_features,
  remove_outliers_calc_metrics=remove_outliers_calc_metrics,
  cv_override=cv,
  search_keys_for_metrics=search_keys_for_metrics,
  progress_bar=progress_bar,
  progress_callback=progress_callback,
  client_cat_features=client_cat_features,
+ is_for_metrics=True,
  )
  if prepared_data is None:
  return None
@@ -1345,17 +1311,16 @@ class FeaturesEnricher(TransformerMixin):
  def _select_features_by_psi(
  self,
  trace_id: str,
- X: Union[pd.DataFrame, pd.Series, np.ndarray],
- y: Union[pd.DataFrame, pd.Series, np.ndarray, List],
- eval_set: Optional[Union[List[tuple], tuple]],
+ X: pd.DataFrame | pd.Series | np.ndarray,
+ y: pd.DataFrame | pd.Series | np.ndarray | list,
+ eval_set: list[tuple] | tuple | None,
  stability_threshold: float,
- cv: Union[BaseCrossValidator, CVType, str, None] = None,
+ stability_agg_func: Callable,
+ cv: BaseCrossValidator | CVType | str | None = None,
  estimator=None,
- exclude_features_sources: Optional[List[str]] = None,
- importance_threshold: Optional[float] = None,
- max_features: Optional[int] = None,
+ exclude_features_sources: list[str] | None = None,
  progress_bar: bool = True,
- progress_callback: Optional[Callable] = None,
+ progress_callback: Callable | None = None,
  ):
  search_keys = self.search_keys.copy()
  validated_X, _, validated_eval_set = self._validate_train_eval(X, y, eval_set, silent=True)
@@ -1392,14 +1357,12 @@ class FeaturesEnricher(TransformerMixin):
  c for c in client_cat_features if c not in self.id_columns_encoder.feature_names_in_
  ]

- prepared_data = self._prepare_data_for_metrics(
+ prepared_data = self._get_cached_enriched_data(
  trace_id=trace_id,
  X=X,
  y=y,
  eval_set=eval_set,
  exclude_features_sources=exclude_features_sources,
- importance_threshold=importance_threshold,
- max_features=max_features,
  remove_outliers_calc_metrics=False,
  cv_override=cv,
  search_keys_for_metrics=search_keys_for_metrics,
@@ -1412,15 +1375,15 @@ class FeaturesEnricher(TransformerMixin):

  (
  validated_X,
- fitting_X,
+ _,
  y_sorted,
- fitting_enriched_X,
+ _,
  _,
  fitting_eval_set_dict,
  _,
  _,
  _,
- columns_renaming,
+ _,
  eval_set_dates,
  ) = prepared_data

@@ -1435,43 +1398,28 @@ class FeaturesEnricher(TransformerMixin):
  eval_set_dates,
  search_keys,
  stability_threshold,
+ stability_agg_func,
  cat_features,
  model_task_type,
  )
- client_features_df = self.df_with_original_index.rename(columns=columns_renaming)
- # decoded_X = self._decode_id_columns(fitting_X, columns_renaming)
- self._update_report_psi(trace_id, client_features_df)

  if unstable_features:
- msg = f"Some features are unstable: {unstable_features} and will be dropped"
+ msg = f"{len(unstable_features)} feature(s) are unstable: {unstable_features} and will be dropped"
  self.logger.warning(msg)
  print(msg)
- fitting_X = fitting_X.drop(columns=unstable_features, errors="ignore")
- fitting_enriched_X = fitting_enriched_X.drop(columns=unstable_features, errors="ignore")
- msg = f"Threre are {len(fitting_enriched_X.columns)} stable selected features left"
- self.logger.info(msg)
- print(msg)
- for idx, (
- eval_X,
- eval_y,
- eval_enriched_X,
- eval_enriched_y,
- ) in fitting_eval_set_dict.items():
- eval_X = eval_X.drop(columns=unstable_features, errors="ignore")
- eval_enriched_X = eval_enriched_X.drop(columns=unstable_features, errors="ignore")
- fitting_eval_set_dict[idx] = (eval_X, eval_y, eval_enriched_X, eval_enriched_y)

  def _check_stability(
  self,
  X: pd.DataFrame,
- eval_set: List[Tuple[pd.DataFrame, pd.Series]],
- enriched_eval_set: Dict,
- eval_set_dates: Dict[int, pd.Series],
- search_keys: Dict[str, SearchKey],
+ eval_set: list[tuple[pd.DataFrame, pd.Series]],
+ enriched_eval_set: dict,
+ eval_set_dates: dict[int, pd.Series],
+ search_keys: dict[str, SearchKey],
  stability_threshold: float,
- cat_features: List[str],
+ stability_agg_func: str | None,
+ cat_features: list[str],
  model_task_type: ModelTaskType,
- ) -> List[str]:
+ ) -> list[str]:
  # Find latest eval set or earliest if all eval sets are before train set
  date_column = self._get_date_column(search_keys)

@@ -1521,17 +1469,17 @@ class FeaturesEnricher(TransformerMixin):

  unstable_by_sparsity = [feature for feature, psi in psi_values_sparse.items() if psi > stability_threshold]
  if unstable_by_sparsity:
- self.logger.info(f"Unstable by sparsity features: {sorted(unstable_by_sparsity)}")
+ self.logger.info(f"Unstable by sparsity features ({stability_threshold}): {sorted(unstable_by_sparsity)}")

  psi_values = calculate_features_psi(
- checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
+ checking_eval_set_df, cat_features, date_column, self.logger, model_task_type, stability_agg_func
  )

  self.logger.info(f"PSI values by value: {psi_values}")

  unstable_by_value = [feature for feature, psi in psi_values.items() if psi > stability_threshold]
  if unstable_by_value:
- self.logger.info(f"Unstable by value features: {sorted(unstable_by_value)}")
+ self.logger.info(f"Unstable by value features ({stability_threshold}): {sorted(unstable_by_value)}")

  self.psi_values = {
  feature: psi_value for feature, psi_value in psi_values.items() if psi_value <= stability_threshold
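`calculate_features_psi` and `calculate_sparsity_psi` live in `upgini.utils.psi`, outside this diff; the code above only assumes each feature receives a PSI score comparable to `stability_threshold`. For reference, a minimal NumPy sketch of PSI as commonly defined (not the package's implementation):

```python
import numpy as np

def psi(expected: np.ndarray, actual: np.ndarray, bins: int = 10) -> float:
    """Population Stability Index: sum((a_i - e_i) * ln(a_i / e_i)) over bins,
    where e_i / a_i are the bin fractions of the reference / monitored sample."""
    edges = np.histogram_bin_edges(expected, bins=bins)
    e_counts, _ = np.histogram(expected, bins=edges)
    a_counts, _ = np.histogram(actual, bins=edges)
    # Clip bin fractions away from zero so empty bins don't produce log(0).
    e_frac = np.clip(e_counts / max(e_counts.sum(), 1), 1e-6, None)
    a_frac = np.clip(a_counts / max(a_counts.sum(), 1), 1e-6, None)
    return float(np.sum((a_frac - e_frac) * np.log(a_frac / e_frac)))

rng = np.random.default_rng(0)
same = psi(rng.normal(size=5000), rng.normal(size=5000))           # ~0, stable
shifted = psi(rng.normal(size=5000), rng.normal(0.5, 1.0, 5000))   # well above 0.2
```

A feature whose aggregated PSI exceeds `stability_threshold` lands in `unstable_by_value` above and is dropped.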
@@ -1541,7 +1489,7 @@ class FeaturesEnricher(TransformerMixin):
  }

  return total_unstable_features
- def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float], silent: bool = False):
+ def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: dict[str, float], silent: bool = False):
  renaming = self.fit_columns_renaming or {}
  self.logger.info(f"Updating SHAP values: {new_shaps}")
  new_shaps = {
@@ -1558,7 +1506,7 @@ class FeaturesEnricher(TransformerMixin):
  display_html_dataframe(
  self.features_info,
  self._features_info_without_links,
- self.bundle.get("relevant_features_header"),
+ self.bundle.get("relevant_features_header").format(len(self.feature_names_)),
  display_handle=self.features_info_display_handle,
  )
  except (ImportError, NameError):
@@ -1596,56 +1544,6 @@ class FeaturesEnricher(TransformerMixin):
  except (ImportError, NameError):
  pass

- def _update_report_psi(self, trace_id: str, clients_features_df: pd.DataFrame):
- self.__prepare_feature_importances(trace_id, clients_features_df)
-
- if self.features_info_display_handle is not None:
- try:
- _ = get_ipython() # type: ignore
-
- display_html_dataframe(
- self.features_info,
- self._features_info_without_links,
- self.bundle.get("relevant_features_header"),
- display_handle=self.features_info_display_handle,
- )
- except (ImportError, NameError):
- pass
-
- if self.data_sources_display_handle is not None:
- try:
- _ = get_ipython() # type: ignore
-
- display_html_dataframe(
- self.relevant_data_sources,
- self._relevant_data_sources_wo_links,
- self.bundle.get("relevant_data_sources_header"),
- display_handle=self.data_sources_display_handle,
- )
- except (ImportError, NameError):
- pass
-
- if self.autofe_features_display_handle is not None:
- try:
- _ = get_ipython() # type: ignore
- autofe_descriptions_df = self.get_autofe_features_description()
- if autofe_descriptions_df is not None:
- display_html_dataframe(
- df=autofe_descriptions_df,
- internal_df=autofe_descriptions_df,
- header=self.bundle.get("autofe_descriptions_header"),
- display_handle=self.autofe_features_display_handle,
- )
- except (ImportError, NameError):
- pass
- if self.report_button_handle is not None:
- try:
- _ = get_ipython() # type: ignore
-
- self.__show_report_button(display_handle=self.report_button_handle)
- except (ImportError, NameError):
- pass
-
  def _check_train_and_eval_target_distribution(self, y, eval_set_dict):
  uneven_distribution = False
  for eval_set in eval_set_dict.values():
@@ -1659,13 +1557,13 @@ class FeaturesEnricher(TransformerMixin):
  self.logger.warning(msg)

  def _has_features_with_commercial_schema(
- self, commercial_schema: str, exclude_features_sources: Optional[List[str]]
+ self, commercial_schema: str, exclude_features_sources: list[str] | None
  ) -> bool:
  return len(self._get_features_with_commercial_schema(commercial_schema, exclude_features_sources)) > 0

  def _get_features_with_commercial_schema(
- self, commercial_schema: str, exclude_features_sources: Optional[List[str]]
- ) -> List[str]:
+ self, commercial_schema: str, exclude_features_sources: list[str] | None
+ ) -> list[str]:
  if exclude_features_sources:
  filtered_features_info = self._internal_features_info[
  ~self._internal_features_info[self.bundle.get("features_info_name")].isin(exclude_features_sources)
@@ -1679,15 +1577,15 @@ class FeaturesEnricher(TransformerMixin):
  ].values
  )

- def _has_paid_features(self, exclude_features_sources: Optional[List[str]]) -> bool:
+ def _has_paid_features(self, exclude_features_sources: list[str] | None) -> bool:
  return self._has_features_with_commercial_schema(CommercialSchema.PAID.value, exclude_features_sources)

  def _is_input_same_as_fit(
  self,
- X: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
- y: Union[pd.DataFrame, pd.Series, np.ndarray, List, None] = None,
- eval_set: Optional[List[tuple]] = None,
- ) -> Tuple:
+ X: pd.DataFrame | pd.Series | np.ndarray | None = None,
+ y: pd.DataFrame | pd.Series | np.ndarray | list | None = None,
+ eval_set: list[tuple] | None = None,
+ ) -> tuple:
  if X is None:
  return True, self.X, self.y, self.eval_set

@@ -1717,9 +1615,9 @@ class FeaturesEnricher(TransformerMixin):
  def _get_cv_and_groups(
  self,
  X: pd.DataFrame,
- cv_override: Union[BaseCrossValidator, CVType, str, None],
- search_keys: Dict[str, SearchKey],
- ) -> Tuple[BaseCrossValidator, Optional[np.ndarray]]:
+ cv_override: BaseCrossValidator | CVType | str | None,
+ search_keys: dict[str, SearchKey],
+ ) -> tuple[BaseCrossValidator, np.ndarray] | None:
  _cv = cv_override or self.cv
  group_columns = sorted(self._get_group_columns(X, search_keys))
  groups = None
@@ -1747,8 +1645,8 @@ class FeaturesEnricher(TransformerMixin):
  return _cv, groups

  def _get_and_validate_client_cat_features(
- self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
- ) -> Tuple[Optional[List[str]], List[str]]:
+ self, estimator: Any | None, X: pd.DataFrame, search_keys: dict[str, SearchKey]
+ ) -> tuple[list[str] | None, list[str]]:
  cat_features = []
  search_keys_for_metrics = []
  if (
@@ -1777,41 +1675,41 @@ class FeaturesEnricher(TransformerMixin):
  raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))
  return cat_features, search_keys_for_metrics

- def _prepare_data_for_metrics(
+ def _get_cached_enriched_data(
  self,
  trace_id: str,
- X: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
- y: Union[pd.DataFrame, pd.Series, np.ndarray, List, None] = None,
- eval_set: Optional[Union[List[tuple], tuple]] = None,
- exclude_features_sources: Optional[List[str]] = None,
- importance_threshold: Optional[float] = None,
- max_features: Optional[int] = None,
- remove_outliers_calc_metrics: Optional[bool] = None,
- cv_override: Union[BaseCrossValidator, CVType, str, None] = None,
- search_keys_for_metrics: Optional[List[str]] = None,
- progress_bar: Optional[ProgressBar] = None,
- progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
- client_cat_features: Optional[List[str]] = None,
+ X: pd.DataFrame | pd.Series | np.ndarray | None = None,
+ y: pd.DataFrame | pd.Series | np.ndarray | list | None = None,
+ eval_set: list[tuple] | tuple | None = None,
+ exclude_features_sources: list[str] | None = None,
+ remove_outliers_calc_metrics: bool | None = None,
+ cv_override: BaseCrossValidator | CVType | str | None = None,
+ search_keys_for_metrics: list[str] | None = None,
+ progress_bar: ProgressBar | None = None,
+ progress_callback: Callable[[SearchProgress], Any] | None = None,
+ client_cat_features: list[str] | None = None,
+ is_for_metrics: bool = False,
  ):
  is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
  is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
  checked_eval_set = self._check_eval_set(eval_set, X)
  validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set, silent=True)

- sampled_data = self._get_enriched_for_metrics(
- trace_id,
- validated_X,
- validated_y,
- validated_eval_set,
- exclude_features_sources,
- is_input_same_as_fit,
- is_demo_dataset,
- remove_outliers_calc_metrics,
- progress_bar,
- progress_callback,
+ sampled_data = self._get_enriched_datasets(
+ trace_id=trace_id,
+ validated_X=validated_X,
+ validated_y=validated_y,
+ eval_set=validated_eval_set,
+ exclude_features_sources=exclude_features_sources,
+ is_input_same_as_fit=is_input_same_as_fit,
+ is_demo_dataset=is_demo_dataset,
+ remove_outliers_calc_metrics=remove_outliers_calc_metrics,
+ progress_bar=progress_bar,
+ progress_callback=progress_callback,
+ is_for_metrics=is_for_metrics,
  )
- (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming) = dataclasses.astuple(
- sampled_data
+ (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming, generated_features) = (
+ dataclasses.astuple(sampled_data)
  )

  excluding_search_keys = list(search_keys.keys())
@@ -1827,14 +1725,9 @@ class FeaturesEnricher(TransformerMixin):

  client_features = [
  c
- for c in X_sampled.columns.to_list()
- if (
- not self.fit_select_features
- or c in set(self.feature_names_).union(self.id_columns or [])
- or (self.fit_columns_renaming or {}).get(c, c) in set(self.feature_names_).union(self.id_columns or [])
- )
- and c
- not in (
+ for c in (validated_X.columns.to_list() + generated_features)
+ if (not self.fit_select_features or c in set(self.feature_names_).union(self.id_columns or []))
+ and c not in (
  excluding_search_keys
  + list(self.fit_dropped_features)
  + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
@@ -1842,20 +1735,17 @@ class FeaturesEnricher(TransformerMixin):
  ]
  self.logger.info(f"Client features column on prepare data for metrics: {client_features}")

- filtered_enriched_features = self.__filtered_enriched_features(
- importance_threshold, max_features, trace_id, validated_X
- )
- filtered_enriched_features = [c for c in filtered_enriched_features if c not in client_features]
+ selected_enriched_features = [c for c in self.feature_names_ if c not in client_features]

  X_sorted, y_sorted = self._sort_by_system_record_id(X_sampled, y_sampled, self.cv)
  enriched_X_sorted, enriched_y_sorted = self._sort_by_system_record_id(enriched_X, y_sampled, self.cv)

  cv, groups = self._get_cv_and_groups(enriched_X_sorted, cv_override, search_keys)

- existing_filtered_enriched_features = [c for c in filtered_enriched_features if c in enriched_X_sorted.columns]
+ existing_selected_enriched_features = [c for c in selected_enriched_features if c in enriched_X_sorted.columns]

  fitting_X = X_sorted[client_features].copy()
- fitting_enriched_X = enriched_X_sorted[client_features + existing_filtered_enriched_features].copy()
+ fitting_enriched_X = enriched_X_sorted[client_features + existing_selected_enriched_features].copy()

  renamed_generate_features = [columns_renaming.get(c, c) for c in (self.generate_features or [])]
  renamed_client_cat_features = [columns_renaming.get(c, c) for c in (client_cat_features or [])]
@@ -1995,29 +1885,31 @@ class FeaturesEnricher(TransformerMixin):
  X_sampled: pd.DataFrame
  y_sampled: pd.Series
  enriched_X: pd.DataFrame
- eval_set_sampled_dict: Dict[int, Tuple[pd.DataFrame, pd.Series]]
- search_keys: Dict[str, SearchKey]
- columns_renaming: Dict[str, str]
+ eval_set_sampled_dict: dict[int, tuple[pd.DataFrame, pd.Series]]
+ search_keys: dict[str, SearchKey]
+ columns_renaming: dict[str, str]
+ generated_features: list[str]

- def _get_enriched_for_metrics(
+ def _get_enriched_datasets(
  self,
  trace_id: str,
- validated_X: Union[pd.DataFrame, pd.Series, np.ndarray, None],
- validated_y: Union[pd.DataFrame, pd.Series, np.ndarray, List, None],
- eval_set: Optional[List[tuple]],
- exclude_features_sources: Optional[List[str]],
+ validated_X: pd.DataFrame | pd.Series | np.ndarray | None,
+ validated_y: pd.DataFrame | pd.Series | np.ndarray | list | None,
+ eval_set: list[tuple] | None,
+ exclude_features_sources: list[str] | None,
  is_input_same_as_fit: bool,
  is_demo_dataset: bool,
- remove_outliers_calc_metrics: Optional[bool],
- progress_bar: Optional[ProgressBar],
- progress_callback: Optional[Callable[[SearchProgress], Any]],
+ remove_outliers_calc_metrics: bool | None,
+ progress_bar: ProgressBar | None,
+ progress_callback: Callable[[SearchProgress], Any] | None,
+ is_for_metrics: bool = False,
  ) -> _EnrichedDataForMetrics:
  datasets_hash = hash_input(validated_X, validated_y, eval_set)
  cached_sampled_datasets = self.__cached_sampled_datasets.get(datasets_hash)
  if cached_sampled_datasets is not None and is_input_same_as_fit and remove_outliers_calc_metrics is None:
  self.logger.info("Cached enriched dataset found - use it")
  return self.__get_sampled_cached_enriched(datasets_hash, exclude_features_sources)
- elif len(self.feature_importances_) == 0:
+ elif len(self.feature_names_) == 0 or all([f in validated_X.columns for f in self.feature_names_]):
  self.logger.info("No external features selected. So use only input datasets for metrics calculation")
  return self.__get_enriched_as_input(validated_X, validated_y, eval_set, is_demo_dataset)
  # TODO save and check if dataset was deduplicated - use imbalance branch for such case
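Because `_get_cached_enriched_data` unpacks this result with `dataclasses.astuple`, adding the `generated_features` field changes the tuple arity, which is why every unpack and cache site in this diff gains a seventh element. A minimal sketch of that coupling (stand-in class, not the real one):

```python
import dataclasses

@dataclasses.dataclass
class Enriched:  # stand-in for _EnrichedDataForMetrics
    X_sampled: str
    y_sampled: str
    generated_features: list[str]

# astuple() flattens fields in declaration order; every unpack site must match
# the field count exactly, or Python raises "too many/not enough values".
X_sampled, y_sampled, generated_features = dataclasses.astuple(
    Enriched("X", "y", ["f_gen"])
)
```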
@@ -2043,12 +1935,13 @@ class FeaturesEnricher(TransformerMixin):
  trace_id,
  progress_bar,
  progress_callback,
+ is_for_metrics=is_for_metrics,
  )

  def __get_sampled_cached_enriched(
- self, datasets_hash: str, exclude_features_sources: Optional[List[str]]
+ self, datasets_hash: str, exclude_features_sources: list[str] | None
  ) -> _EnrichedDataForMetrics:
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = (
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming, generated_features = (
  self.__cached_sampled_datasets[datasets_hash]
  )
  if exclude_features_sources:
@@ -2062,10 +1955,11 @@ class FeaturesEnricher(TransformerMixin):
  eval_set_sampled_dict,
  columns_renaming,
  search_keys,
+ generated_features,
  )

  def __get_enriched_as_input(
- self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
+ self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: list[tuple] | None, is_demo_dataset: bool
  ) -> _EnrichedDataForMetrics:
  eval_set_sampled_dict = {}

@@ -2130,6 +2024,10 @@ class FeaturesEnricher(TransformerMixin):
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
  df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)

+ df = df.rename(columns=columns_renaming)
+ generated_features = [columns_renaming.get(c, c) for c in generated_features]
+ search_keys = {columns_renaming.get(k, k): v for k, v in search_keys.items()}
+
  train_df = df.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df
  X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
  y_sampled = train_df[TARGET].copy()
@@ -2152,13 +2050,14 @@ class FeaturesEnricher(TransformerMixin):
  eval_set_sampled_dict,
  columns_renaming,
  search_keys,
+ generated_features,
  )

  def __get_enriched_from_fit(
  self,
- eval_set: Optional[List[tuple]],
+ eval_set: list[tuple] | None,
  trace_id: str,
- remove_outliers_calc_metrics: Optional[bool],
+ remove_outliers_calc_metrics: bool | None,
  ) -> _EnrichedDataForMetrics:
  eval_set_sampled_dict = {}
  search_keys = self.fit_search_keys.copy()
@@ -2246,6 +2145,7 @@ class FeaturesEnricher(TransformerMixin):
  eval_X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
  enriched_eval_X.rename(columns=self.fit_columns_renaming, inplace=True)
  search_keys = {self.fit_columns_renaming.get(k, k): v for k, v in search_keys.items()}
+ generated_features = [self.fit_columns_renaming.get(c, c) for c in self.fit_generated_features]

  datasets_hash = hash_input(self.X, self.y, self.eval_set)
  return self.__cache_and_return_results(
@@ -2256,17 +2156,19 @@ class FeaturesEnricher(TransformerMixin):
  eval_set_sampled_dict,
  self.fit_columns_renaming,
  search_keys,
+ generated_features,
  )

  def __get_enriched_from_transform(
  self,
  validated_X: pd.DataFrame,
  validated_y: pd.Series,
- eval_set: Optional[List[tuple]],
- exclude_features_sources: Optional[List[str]],
+ eval_set: list[tuple] | None,
+ exclude_features_sources: list[str] | None,
  trace_id: str,
- progress_bar: Optional[ProgressBar],
- progress_callback: Optional[Callable[[SearchProgress], Any]],
+ progress_bar: ProgressBar | None,
+ progress_callback: Callable[[SearchProgress], Any] | None,
+ is_for_metrics: bool = False,
  ) -> _EnrichedDataForMetrics:
  has_eval_set = eval_set is not None

@@ -2274,6 +2176,16 @@ class FeaturesEnricher(TransformerMixin):

  # Prepare
  df = self.__combine_train_and_eval_sets(validated_X, validated_y, eval_set)
+
+ # Exclude OOT eval sets from transform because they are not used for metrics calculation
+ if not is_for_metrics and EVAL_SET_INDEX in df.columns:
+ for eval_index in df[EVAL_SET_INDEX].unique():
+ if eval_index == 0:
+ continue
+ eval_df = df.query(f"{EVAL_SET_INDEX} == {eval_index}")
+ if eval_df[TARGET].isna().all():
+ df = df.query(f"{EVAL_SET_INDEX} != {eval_index}")
+
  df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
  df = self.__downsample_for_metrics(df)

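The added block drops "out-of-time" (OOT) eval sets — entries whose target is entirely missing — before a transform that is not feeding metrics calculation. A hypothetical way a caller would construct such an eval set (names reused from the earlier sketches):

```python
import numpy as np
import pandas as pd

# An eval_set entry with an all-NaN target is treated as out-of-time: it can
# be enriched for stability checks, but per the block above it is skipped by
# plain transform, since it contributes nothing to metrics.
oot_X = pd.DataFrame({"date": pd.date_range("2024-06-01", periods=20).astype(str)})
oot_y = pd.Series([np.nan] * 20)

enricher.fit(train, target, eval_set=[(oot_X, oot_y)])
```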
@@ -2315,10 +2227,11 @@ class FeaturesEnricher(TransformerMixin):
2315
2227
  eval_set_sampled_dict,
2316
2228
  columns_renaming,
2317
2229
  search_keys,
2230
+ generated_features,
2318
2231
  )
2319
2232
 
2320
2233
  def __combine_train_and_eval_sets(
2321
- self, X: pd.DataFrame, y: Optional[pd.Series] = None, eval_set: Optional[List[tuple]] = None
2234
+ self, X: pd.DataFrame, y: pd.Series | None = None, eval_set: list[tuple] | None = None
2322
2235
  ) -> pd.DataFrame:
2323
2236
  df = X.copy()
2324
2237
  if y is not None:
@@ -2370,8 +2283,8 @@ class FeaturesEnricher(TransformerMixin):
2370
2283
  )
2371
2284
 
2372
2285
  def __extract_train_data(
2373
- self, enriched_df: pd.DataFrame, x_columns: List[str]
2374
- ) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
2286
+ self, enriched_df: pd.DataFrame, x_columns: list[str]
2287
+ ) -> tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
2375
2288
  if EVAL_SET_INDEX in enriched_df.columns:
2376
2289
  enriched_Xy = enriched_df.query(f"{EVAL_SET_INDEX} == 0")
2377
2290
  else:
@@ -2382,8 +2295,8 @@ class FeaturesEnricher(TransformerMixin):
2382
2295
  return X_sampled, y_sampled, enriched_X
2383
2296
 
2384
2297
  def __extract_eval_data(
2385
- self, enriched_df: pd.DataFrame, x_columns: List[str], enriched_X_columns: List[str], eval_set_len: int
2386
- ) -> Tuple[Dict[int, Tuple], Dict[int, pd.Series]]:
2298
+ self, enriched_df: pd.DataFrame, x_columns: list[str], enriched_X_columns: list[str], eval_set_len: int
2299
+ ) -> tuple[dict[int, tuple], dict[int, pd.Series]]:
2387
2300
  eval_set_sampled_dict = {}
2388
2301
 
2389
2302
  for idx in range(eval_set_len):
@@ -2401,9 +2314,10 @@ class FeaturesEnricher(TransformerMixin):
2401
2314
  X_sampled: pd.DataFrame,
2402
2315
  y_sampled: pd.Series,
2403
2316
  enriched_X: pd.DataFrame,
2404
- eval_set_sampled_dict: Dict[int, Tuple],
2405
- columns_renaming: Dict[str, str],
2406
- search_keys: Dict[str, SearchKey],
2317
+ eval_set_sampled_dict: dict[int, tuple],
2318
+ columns_renaming: dict[str, str],
2319
+ search_keys: dict[str, SearchKey],
2320
+ generated_features: list[str],
2407
2321
  ) -> _EnrichedDataForMetrics:
2408
2322
 
2409
2323
  self.__cached_sampled_datasets[datasets_hash] = (
@@ -2413,10 +2327,11 @@ class FeaturesEnricher(TransformerMixin):
2413
2327
  eval_set_sampled_dict,
2414
2328
  search_keys,
2415
2329
  columns_renaming,
2330
+ generated_features,
2416
2331
  )
2417
2332
 
2418
2333
  return self.__mk_sampled_data_tuple(
2419
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
2334
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming, generated_features
2420
2335
  )
2421
2336
 
2422
2337
  def __mk_sampled_data_tuple(
@@ -2424,17 +2339,11 @@ class FeaturesEnricher(TransformerMixin):
2424
2339
  X_sampled: pd.DataFrame,
2425
2340
  y_sampled: pd.Series,
2426
2341
  enriched_X: pd.DataFrame,
2427
- eval_set_sampled_dict: Dict,
2428
- search_keys: Dict,
2429
- columns_renaming: Dict[str, str],
2342
+ eval_set_sampled_dict: dict,
2343
+ search_keys: dict,
2344
+ columns_renaming: dict[str, str],
2345
+ generated_features: list[str],
2430
2346
  ):
2431
- # X_sampled - with hash-suffixes
2432
- # reversed_renaming = {v: k for k, v in columns_renaming.items()}
2433
- # search_keys = {
2434
- # reversed_renaming.get(k, k): v
2435
- # for k, v in search_keys.items()
2436
- # if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
2437
- # }
2438
2347
  return FeaturesEnricher._EnrichedDataForMetrics(
2439
2348
  X_sampled=X_sampled,
2440
2349
  y_sampled=y_sampled,
@@ -2442,9 +2351,10 @@ class FeaturesEnricher(TransformerMixin):
2442
2351
  eval_set_sampled_dict=eval_set_sampled_dict,
2443
2352
  search_keys=search_keys,
2444
2353
  columns_renaming=columns_renaming,
2354
+ generated_features=generated_features,
2445
2355
  )
2446
2356
 
2447
- def get_search_id(self) -> Optional[str]:
2357
+ def get_search_id(self) -> str | None:
2448
2358
  """Returns search_id of the fitted enricher. Not available before a successful fit."""
2449
2359
  return self._search_task.search_task_id if self._search_task else None
2450
2360
 
@@ -2457,7 +2367,7 @@ class FeaturesEnricher(TransformerMixin):
2457
2367
 
2458
2368
  return self.features_info
2459
2369
 
2460
- def get_progress(self, trace_id: Optional[str] = None, search_task: Optional[SearchTask] = None) -> SearchProgress:
2370
+ def get_progress(self, trace_id: str | None = None, search_task: SearchTask | None = None) -> SearchProgress:
2461
2371
  search_task = search_task or self._search_task
2462
2372
  if search_task is not None:
2463
2373
  trace_id = trace_id or uuid.uuid4()
@@ -2565,16 +2475,15 @@ if response.status_code == 200:
2565
2475
  trace_id: str,
2566
2476
  X: pd.DataFrame,
2567
2477
  *,
2568
- y: Optional[pd.Series] = None,
2569
- exclude_features_sources: Optional[List[str]] = None,
2570
- importance_threshold: Optional[float] = None,
2571
- max_features: Optional[int] = None,
2478
+ y: pd.Series | None = None,
2479
+ exclude_features_sources: list[str] | None = None,
2572
2480
  metrics_calculation: bool = False,
2573
2481
  silent_mode: bool = False,
2574
- progress_bar: Optional[ProgressBar] = None,
2575
- progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
2482
+ progress_bar: ProgressBar | None = None,
2483
+ progress_callback: Callable[[SearchProgress], Any] | None = None,
2576
2484
  add_fit_system_record_id: bool = False,
2577
- ) -> Tuple[pd.DataFrame, Dict[str, str], List[str], Dict[str, SearchKey]]:
2485
+ keep_input: bool | None = None,
2486
+ ) -> tuple[pd.DataFrame, dict[str, str], list[str], dict[str, SearchKey]]:
2578
2487
  if self._search_task is None:
2579
2488
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
2580
2489
 
@@ -2592,11 +2501,8 @@ if response.status_code == 200:
 
         self.__log_debug_information(validated_X, validated_y, exclude_features_sources=exclude_features_sources)
 
-        filtered_columns = self.__filtered_enriched_features(
-            importance_threshold, max_features, trace_id, validated_X
-        )
         # If there are no important features, return original dataframe
-        if not filtered_columns:
+        if len(self.feature_names_) == 0:
             msg = self.bundle.get("no_important_features_for_transform")
             self.__log_warning(msg, show_support_link=True)
             return X, {c: c for c in X.columns}, [], dict()
@@ -2703,9 +2609,9 @@ if response.status_code == 200:
         if not external_features:
             self.logger.warning(
                 "No external features found, returning original dataframe"
-                f" with generated important features: {filtered_columns}"
+                f" with generated important features: {self.feature_names_}"
             )
-            filtered_columns = [c for c in filtered_columns if c in df.columns]
+            filtered_columns = [c for c in self.feature_names_ if c in df.columns]
             self.logger.warning(f"Filtered columns by existance in dataframe: {filtered_columns}")
             return df[filtered_columns], columns_renaming, generated_features, search_keys
 
@@ -2843,16 +2749,6 @@ if response.status_code == 200:
         )
         dataset.columns_renaming = columns_renaming
 
-        if max_features is not None or importance_threshold is not None:
-            exclude_features_sources = list(
-                set(
-                    (exclude_features_sources or [])
-                    + self._get_excluded_features(max_features, importance_threshold)
-                )
-            )
-            if len(exclude_features_sources) == 0:
-                exclude_features_sources = None
-
         validation_task = self._search_task.validation(
             trace_id,
             dataset,
@@ -2880,7 +2776,7 @@ if response.status_code == 200:
             progress_bar.progress = progress.to_progress_bar()
         if progress_callback is not None:
             progress_callback(progress)
-        prev_progress: Optional[SearchProgress] = None
+        prev_progress: SearchProgress | None = None
         polling_period_seconds = 1
         try:
             while progress.stage != ProgressStage.DOWNLOADING.value:
@@ -2917,6 +2813,8 @@ if response.status_code == 200:
             print(self.bundle.get("transform_start"))
 
         # Prepare input DataFrame for __enrich by concatenating generated ids and client features
+        df_before_explode = df_before_explode.rename(columns=columns_renaming)
+        generated_features = [columns_renaming.get(c, c) for c in generated_features]
         combined_df = pd.concat(
             [
                 validated_Xy.reset_index(drop=True),
@@ -2934,15 +2832,28 @@ if response.status_code == 200:
         )
 
         selected_generated_features = [
-            c for c in generated_features if not self.fit_select_features or c in filtered_columns
-        ]
-        selecting_columns = [
-            c
-            for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
-            if (c not in self.zero_shap_client_features and c not in self.unstable_client_features)
-            or c in (self.id_columns or [])
+            c for c in generated_features if not self.fit_select_features or c in self.feature_names_
         ]
-        selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
+        if keep_input is None:
+            selected_input_columns = [
+                c
+                for c in validated_Xy.columns
+                if not self.fit_select_features
+                or c in self.feature_names_
+                or c in self.search_keys
+                or c in (self.id_columns or [])
+                or c in [EVAL_SET_INDEX, TARGET]  # transform for metrics calculation
+            ]
+        elif keep_input is True:
+            selected_input_columns = validated_Xy.columns.to_list()
+        else:
+            selected_input_columns = []
+
+        selecting_columns = selected_input_columns + selected_generated_features
+        selecting_columns.extend(
+            c for c in result.columns
+            if c in self.feature_names_ and c not in selecting_columns and c not in validated_Xy.columns
+        )
         if add_fit_system_record_id:
             selecting_columns.append(SORT_ID)
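
The new `keep_input` parameter threaded through `transform` above is tri-state: `None` keeps only the input columns that remain useful downstream (selected features, search keys, id columns, and the `EVAL_SET_INDEX`/`TARGET` service columns used for metrics calculation), `True` keeps every input column, and `False` drops them all, returning only enrichment output. A self-contained sketch of that selection rule, with invented frame and column names:

```python
import pandas as pd

def select_input_columns(
    input_df: pd.DataFrame,
    keep_input: bool | None,
    useful: set[str],  # selected features + search keys + id/service columns
) -> list[str]:
    # Mirrors the branch structure above; the real code then appends
    # selected generated and external features to this list.
    if keep_input is None:
        return [c for c in input_df.columns if c in useful]
    if keep_input:
        return input_df.columns.to_list()
    return []

df = pd.DataFrame({"email": ["a@b.c"], "age": [42], "noise": [0]})
print(select_input_columns(df, None, useful={"email", "age"}))  # ['email', 'age']
```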
 
@@ -2968,29 +2879,7 @@ if response.status_code == 200:
 
         return result, columns_renaming, generated_features, search_keys
 
-    def _get_excluded_features(self, max_features: Optional[int], importance_threshold: Optional[float]) -> List[str]:
-        features_info = self._internal_features_info
-        comm_schema_header = self.bundle.get("features_info_commercial_schema")
-        shap_value_header = self.bundle.get("features_info_shap")
-        feature_name_header = self.bundle.get("features_info_name")
-        external_features = features_info[features_info[comm_schema_header].str.len() > 0]
-        filtered_features = external_features
-        if importance_threshold is not None:
-            filtered_features = filtered_features[filtered_features[shap_value_header] >= importance_threshold]
-        if max_features is not None and len(filtered_features) > max_features:
-            filtered_features = filtered_features.iloc[:max_features, :]
-        if len(filtered_features) == len(external_features):
-            return []
-        else:
-            if len(filtered_features[filtered_features[comm_schema_header].isin([CommercialSchema.PAID.value])]):
-                return []
-            excluded_features = external_features[~external_features.index.isin(filtered_features.index)].copy()
-            excluded_features = excluded_features[
-                excluded_features[comm_schema_header].isin([CommercialSchema.PAID.value])
-            ]
-            return excluded_features[feature_name_header].values.tolist()
-
-    def __validate_search_keys(self, search_keys: Dict[str, SearchKey], search_id: Optional[str] = None):
+    def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id: str | None = None):
         if (search_keys is None or len(search_keys) == 0) and self.country_code is None:
             if search_id:
                 self.logger.debug(f"search_id {search_id} provided without search_keys")
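
`_get_excluded_features` is deleted together with the `importance_threshold` and `max_features` arguments that fed it (they also disappear from `transform`, `__inner_fit`, `__show_metrics`, and the debug log below); selection is now driven entirely by the feature list persisted on the search task and exposed as `feature_names_`. If similar trimming is still wanted, a rough client-side equivalent can be applied to the enriched output — a hedged sketch, assuming `feature_names_` is importance-sorted (the diff below sorts selected features by descending SHAP):

```python
import pandas as pd

def trim_to_top_features(
    enriched_df: pd.DataFrame, feature_names: list[str], max_features: int
) -> pd.DataFrame:
    # Keep all non-feature columns plus the top-N features by importance.
    keep = set(feature_names[:max_features])
    drop = [c for c in feature_names if c in enriched_df.columns and c not in keep]
    return enriched_df.drop(columns=drop)
```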
@@ -3063,23 +2952,22 @@ if response.status_code == 200:
     def __inner_fit(
         self,
         trace_id: str,
-        X: Union[pd.DataFrame, pd.Series, np.ndarray],
-        y: Union[pd.DataFrame, pd.Series, np.ndarray, List, None],
-        eval_set: Optional[List[tuple]],
-        progress_bar: Optional[ProgressBar],
+        X: pd.DataFrame | pd.Series | np.ndarray,
+        y: pd.DataFrame | pd.Series | np.ndarray | list | None,
+        eval_set: list[tuple] | None,
+        progress_bar: ProgressBar | None,
         start_time: int,
         *,
-        exclude_features_sources: Optional[List[str]] = None,
-        calculate_metrics: Optional[bool],
-        scoring: Union[Callable, str, None],
-        estimator: Optional[Any],
-        importance_threshold: Optional[float],
+        exclude_features_sources: list[str] | None = None,
+        calculate_metrics: bool | None,
+        scoring: Callable | str | None,
+        estimator: Any | None,
         stability_threshold: float,
-        max_features: Optional[int],
-        remove_outliers_calc_metrics: Optional[bool],
+        stability_agg_func: str,
+        remove_outliers_calc_metrics: bool | None,
         auto_fe_parameters: AutoFEParameters,
-        progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
-        search_id_callback: Optional[Callable[[str], Any]] = None,
+        progress_callback: Callable[[SearchProgress], Any] | None = None,
+        search_id_callback: Callable[[str], Any] | None = None,
     ):
         self._search_task = None
         self.warning_counter.reset()
@@ -3140,7 +3028,6 @@ if response.status_code == 200:
         )
 
         df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
-        self.id_columns_encoder = OrdinalEncoder().fit(df[self.id_columns or []])
 
         self.fit_search_keys = self.search_keys.copy()
         df = self.__handle_index_search_keys(df, self.fit_search_keys)
@@ -3148,8 +3035,22 @@ if response.status_code == 200:
 
         maybe_date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
         has_date = maybe_date_column is not None
+
         self.model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
 
+        if EVAL_SET_INDEX in df.columns:
+            only_train_df = df.query(f"{EVAL_SET_INDEX} == 0")
+            only_train_df = only_train_df.drop(columns=[EVAL_SET_INDEX])
+        else:
+            only_train_df = df
+
+        self.imbalanced = is_imbalanced(only_train_df, self.model_task_type, self.sample_config, self.bundle)
+        if self.imbalanced:
+            # Exclude eval sets from fit because they will be transformed before metrics calculation
+            df = only_train_df
+
+        self.id_columns_encoder = OrdinalEncoder().fit(df[self.id_columns or []])
+
         self._validate_binary_observations(validated_y, self.model_task_type)
 
         self.runtime_parameters = get_runtime_params_custom_loss(
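
`is_imbalanced`, newly imported from `upgini.utils.target_utils`, is now decided on the train fold alone and before the id-column encoder is fitted, replacing the post-hoc `self.imbalanced = dataset.imbalanced` assignment removed further down; when the target is imbalanced, eval sets are excluded from fit and enriched later through transform. A small sketch of the train/eval split pattern used above (frame contents invented for the example):

```python
import pandas as pd

EVAL_SET_INDEX = "eval_set_index"  # assumed name of the service column

df = pd.DataFrame({
    "feature": [1, 2, 3, 4],
    "target": [0, 1, 0, 1],
    EVAL_SET_INDEX: [0, 0, 1, 1],  # 0 = train rows, >=1 = eval-set rows
})

# Same shape as the diff: keep train rows, drop the service column.
if EVAL_SET_INDEX in df.columns:
    only_train_df = df.query(f"{EVAL_SET_INDEX} == 0").drop(columns=[EVAL_SET_INDEX])
else:
    only_train_df = df

print(only_train_df)  # two train rows, service column removed
```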
@@ -3365,6 +3266,7 @@ if response.status_code == 200:
             model_task_type=self.model_task_type,
             cv_type=self.cv,
             id_columns=self.__get_renamed_id_columns(),
+            is_imbalanced=self.imbalanced,
             date_column=self._get_date_column(self.fit_search_keys),
             date_format=self.date_format,
             random_state=self.random_state,
@@ -3444,8 +3346,6 @@ if response.status_code == 200:
         if progress_callback is not None:
             progress_callback(progress)
 
-        self.imbalanced = dataset.imbalanced
-
         zero_hit_search_keys = self._search_task.get_zero_hit_rate_search_keys()
         if zero_hit_search_keys:
             self.logger.warning(
@@ -3468,33 +3368,24 @@ if response.status_code == 200:
 
         self.__prepare_feature_importances(trace_id, df)
 
-        self.__show_selected_features(self.fit_search_keys)
-
-        autofe_description = self.get_autofe_features_description()
-        if autofe_description is not None and len(autofe_description) > 0:
-            self.logger.info(f"AutoFE descriptions: {autofe_description}")
-            self.autofe_features_display_handle = display_html_dataframe(
-                df=autofe_description,
-                internal_df=autofe_description,
-                header=self.bundle.get("autofe_descriptions_header"),
-                display_id=f"autofe_descriptions_{uuid.uuid4()}",
-            )
-
         self._select_features_by_psi(
             trace_id=trace_id,
             X=X,
             y=y,
             eval_set=eval_set,
             stability_threshold=stability_threshold,
+            stability_agg_func=stability_agg_func,
             cv=self.cv,
             estimator=estimator,
             exclude_features_sources=exclude_features_sources,
-            importance_threshold=importance_threshold,
-            max_features=max_features,
             progress_bar=progress_bar,
             progress_callback=progress_callback,
         )
 
+        self.__prepare_feature_importances(trace_id, df)
+
+        self.__show_selected_features()
+
         if self._has_paid_features(exclude_features_sources):
             if calculate_metrics is not None and calculate_metrics:
                 msg = self.bundle.get("metrics_with_paid_features")
@@ -3525,8 +3416,6 @@ if response.status_code == 200:
             self.__show_metrics(
                 scoring,
                 estimator,
-                importance_threshold,
-                max_features,
                 remove_outliers_calc_metrics,
                 trace_id,
                 progress_bar,
@@ -3543,7 +3432,7 @@ if response.status_code == 200:
         if not self.warning_counter.has_warnings():
             self.__display_support_link(self.bundle.get("all_ok_community_invite"))
 
-    def __convert_unnestable_keys(self, df: pd.DataFrame, unnest_search_keys: Dict[str, str]):
+    def __convert_unnestable_keys(self, df: pd.DataFrame, unnest_search_keys: dict[str, str]):
         email_column = self._get_email_column(self.fit_search_keys)
         hem_column = self._get_hem_column(self.fit_search_keys)
         if email_column:
@@ -3575,7 +3464,7 @@ if response.status_code == 200:
     def __should_add_date_column(self):
         return self.add_date_if_missing or (self.cv is not None and self.cv.is_time_series())
 
-    def __get_renamed_id_columns(self, renaming: Optional[Dict[str, str]] = None):
+    def __get_renamed_id_columns(self, renaming: dict[str, str] | None = None):
         renaming = renaming or self.fit_columns_renaming
         reverse_renaming = {v: k for k, v in renaming.items()}
         return None if self.id_columns is None else [reverse_renaming.get(c) or c for c in self.id_columns]
@@ -3609,7 +3498,7 @@ if response.status_code == 200:
         self.cv = cv
         self.runtime_parameters.properties["cv_type"] = self.cv.name
 
-    def get_columns_by_search_keys(self, keys: List[str]):
+    def get_columns_by_search_keys(self, keys: list[str]):
         if "HEM" in keys:
             keys.append("EMAIL")
         if "DATE" in keys:
@@ -3620,11 +3509,11 @@ if response.status_code == 200:
     def _validate_train_eval(
         self,
         X: pd.DataFrame,
-        y: Optional[pd.Series] = None,
-        eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]] = None,
+        y: pd.Series | None = None,
+        eval_set: list[tuple[pd.DataFrame, pd.Series]] | None = None,
         is_transform: bool = False,
         silent: bool = False,
-    ) -> Tuple[pd.DataFrame, pd.Series, Optional[List[Tuple[pd.DataFrame, pd.Series]]]]:
+    ) -> tuple[pd.DataFrame, pd.Series, list[tuple[pd.DataFrame, pd.Series]]] | None:
         validated_X = self._validate_X(X, is_transform)
         validated_y = self._validate_y(validated_X, y, enforce_y=not is_transform)
         validated_eval_set = self._validate_eval_set(validated_X, eval_set, silent=silent)
@@ -3633,7 +3522,7 @@ if response.status_code == 200:
     def _encode_id_columns(
         self,
         X: pd.DataFrame,
-    ) -> Tuple[pd.DataFrame, Dict[str, List[Any]]]:
+    ) -> tuple[pd.DataFrame, dict[str, list[Any]]]:
         unknown_dict = {}
 
         if self.id_columns and self.id_columns_encoder is not None:
@@ -3703,7 +3592,7 @@ if response.status_code == 200:
 
         return validated_X
 
-    def _validate_y(self, X: pd.DataFrame, y, enforce_y: bool = True) -> Optional[pd.Series]:
+    def _validate_y(self, X: pd.DataFrame, y, enforce_y: bool = True) -> pd.Series | None:
         if y is None and not enforce_y:
             return None
         if (
@@ -3753,7 +3642,7 @@ if response.status_code == 200:
         return validated_y
 
     def _validate_eval_set(
-        self, X: pd.DataFrame, eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]], silent: bool = False
+        self, X: pd.DataFrame, eval_set: list[tuple[pd.DataFrame, pd.Series]] | None, silent: bool = False
     ):
         if eval_set is None:
             return None
@@ -3777,7 +3666,7 @@ if response.status_code == 200:
 
         return validated_eval_set
 
-    def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
+    def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: tuple) -> tuple[pd.DataFrame, pd.Series]:
         if len(eval_pair) != 2:
             raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
         eval_X, eval_y = eval_pair
@@ -3865,7 +3754,7 @@ if response.status_code == 200:
 
         return validated_eval_X, validated_eval_y
 
-    def _validate_baseline_score(self, X: pd.DataFrame, eval_set: Optional[List[Tuple]]):
+    def _validate_baseline_score(self, X: pd.DataFrame, eval_set: list[tuple] | None):
         if self.baseline_score_column is not None:
             if self.baseline_score_column not in X.columns:
                 raise ValidationError(
@@ -3885,15 +3774,15 @@ if response.status_code == 200:
             raise ValidationError(self.bundle.get("baseline_score_column_has_na"))
 
     @staticmethod
-    def _sample_X_and_y(X: pd.DataFrame, y: pd.Series, enriched_X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
+    def _sample_X_and_y(X: pd.DataFrame, y: pd.Series, enriched_X: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series]:
         Xy = pd.concat([X, y], axis=1)
         Xy = pd.merge(Xy, enriched_X, left_index=True, right_index=True, how="inner", suffixes=("", "enriched"))
         return Xy[X.columns].copy(), Xy[TARGET].copy()
 
     @staticmethod
     def _sort_by_system_record_id(
-        X: pd.DataFrame, y: pd.Series, cv: Optional[CVType]
-    ) -> Tuple[pd.DataFrame, pd.Series]:
+        X: pd.DataFrame, y: pd.Series, cv: CVType | None
+    ) -> tuple[pd.DataFrame, pd.Series]:
         if cv not in [CVType.time_series, CVType.blocked_time_series]:
             record_id_column = ENTITY_SYSTEM_RECORD_ID if ENTITY_SYSTEM_RECORD_ID in X else SYSTEM_RECORD_ID
             Xy = X.copy()
@@ -3910,8 +3799,8 @@ if response.status_code == 200:
     # Deprecated
     @staticmethod
     def _sort_by_keys(
-        X: pd.DataFrame, y: pd.Series, search_keys: Dict[str, SearchKey], cv: Optional[CVType]
-    ) -> Tuple[pd.DataFrame, pd.Series]:
+        X: pd.DataFrame, y: pd.Series, search_keys: dict[str, SearchKey], cv: CVType | None
+    ) -> tuple[pd.DataFrame, pd.Series]:
         if cv not in [CVType.time_series, CVType.blocked_time_series]:
             if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
                 date_column = DateTimeSearchKeyConverter.DATETIME_COL
@@ -3950,16 +3839,14 @@ if response.status_code == 200:
     def __log_debug_information(
         self,
         X: pd.DataFrame,
-        y: Union[pd.Series, np.ndarray, list, None] = None,
-        eval_set: Optional[List[tuple]] = None,
-        exclude_features_sources: Optional[List[str]] = None,
-        calculate_metrics: Optional[bool] = None,
-        cv: Optional[Any] = None,
-        importance_threshold: Optional[Any] = None,
-        max_features: Optional[Any] = None,
-        scoring: Optional[Any] = None,
-        estimator: Optional[Any] = None,
-        remove_outliers_calc_metrics: Optional[bool] = None,
+        y: pd.Series | np.ndarray | list | None = None,
+        eval_set: list[tuple] | None = None,
+        exclude_features_sources: list[str] | None = None,
+        calculate_metrics: bool | None = None,
+        cv: Any | None = None,
+        scoring: Any | None = None,
+        estimator: Any | None = None,
+        remove_outliers_calc_metrics: bool | None = None,
     ):
         try:
             resolved_api_key = self.api_key or os.environ.get(UPGINI_API_KEY)
@@ -3972,8 +3859,6 @@ if response.status_code == 200:
                 f"Runtime parameters: {self.runtime_parameters}\n"
                 f"Date format: {self.date_format}\n"
                 f"CV: {cv}\n"
-                f"importance_threshold: {importance_threshold}\n"
-                f"max_features: {max_features}\n"
                 f"Shared datasets: {self.shared_datasets}\n"
                 f"Random state: {self.random_state}\n"
                 f"Generate features: {self.generate_features}\n"
@@ -4037,7 +3922,7 @@ if response.status_code == 200:
         except Exception:
             self.logger.warning("Failed to log debug information", exc_info=True)
 
-    def __handle_index_search_keys(self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> pd.DataFrame:
+    def __handle_index_search_keys(self, df: pd.DataFrame, search_keys: dict[str, SearchKey]) -> pd.DataFrame:
         index_names = df.index.names if df.index.names != [None] else [DEFAULT_INDEX]
         index_search_keys = set(index_names).intersection(search_keys.keys())
         if len(index_search_keys) > 0:
@@ -4056,7 +3941,7 @@ if response.status_code == 200:
         return df
 
     def _add_current_date_as_key(
-        self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], bundle: ResourceBundle, silent: bool = False
+        self, df: pd.DataFrame, search_keys: dict[str, SearchKey], bundle: ResourceBundle, silent: bool = False
     ) -> pd.DataFrame:
         if (
             set(search_keys.values()) == {SearchKey.PHONE}
@@ -4073,7 +3958,7 @@ if response.status_code == 200:
         return df
 
     @staticmethod
-    def _get_group_columns(df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> List[str]:
+    def _get_group_columns(df: pd.DataFrame, search_keys: dict[str, SearchKey]) -> list[str]:
         search_key_priority = [SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM, SearchKey.IP]
         for key_type in search_key_priority:
             if key_type in search_keys.values():
@@ -4086,7 +3971,7 @@ if response.status_code == 200:
         ]
 
     @staticmethod
-    def _get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
+    def _get_email_column(search_keys: dict[str, SearchKey]) -> str | None:
         cols = [col for col, t in search_keys.items() if t == SearchKey.EMAIL]
         if len(cols) > 1:
             raise Exception("More than one email column found after unnest")
@@ -4094,7 +3979,7 @@ if response.status_code == 200:
         return cols[0]
 
     @staticmethod
-    def _get_hem_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
+    def _get_hem_column(search_keys: dict[str, SearchKey]) -> str | None:
         cols = [col for col, t in search_keys.items() if t == SearchKey.HEM]
         if len(cols) > 1:
             raise Exception("More than one hem column found after unnest")
@@ -4102,7 +3987,7 @@ if response.status_code == 200:
         return cols[0]
 
     @staticmethod
-    def _get_ip_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
+    def _get_ip_column(search_keys: dict[str, SearchKey]) -> str | None:
         cols = [col for col, t in search_keys.items() if t == SearchKey.IP]
         if len(cols) > 1:
             raise Exception("More than one ip column found after unnest")
@@ -4110,32 +3995,32 @@ if response.status_code == 200:
         return cols[0]
 
     @staticmethod
-    def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
+    def _get_phone_column(search_keys: dict[str, SearchKey]) -> str | None:
         for col, t in search_keys.items():
             if t == SearchKey.PHONE:
                 return col
 
     @staticmethod
-    def _get_country_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
+    def _get_country_column(search_keys: dict[str, SearchKey]) -> str | None:
         for col, t in search_keys.items():
             if t == SearchKey.COUNTRY:
                 return col
 
     @staticmethod
-    def _get_postal_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
+    def _get_postal_column(search_keys: dict[str, SearchKey]) -> str | None:
         for col, t in search_keys.items():
             if t == SearchKey.POSTAL_CODE:
                 return col
 
     @staticmethod
-    def _get_date_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
+    def _get_date_column(search_keys: dict[str, SearchKey]) -> str | None:
         return SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
 
     def _explode_multiple_search_keys(
-        self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], columns_renaming: Dict[str, str]
-    ) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
+        self, df: pd.DataFrame, search_keys: dict[str, SearchKey], columns_renaming: dict[str, str]
+    ) -> tuple[pd.DataFrame, dict[str, list[str]]]:
         # find groups of multiple search keys
-        search_key_names_by_type: Dict[SearchKey, List[str]] = {}
+        search_key_names_by_type: dict[SearchKey, list[str]] = {}
         for key_name, key_type in search_keys.items():
             search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
         search_key_names_by_type = {
@@ -4171,14 +4056,14 @@ if response.status_code == 200:
     @staticmethod
     def _add_fit_system_record_id(
         df: pd.DataFrame,
-        search_keys: Dict[str, SearchKey],
+        search_keys: dict[str, SearchKey],
         id_name: str,
         target_name: str,
-        columns_renaming: Dict[str, str],
-        id_columns: Optional[List[str]],
-        cv: Optional[CVType],
+        columns_renaming: dict[str, str],
+        id_columns: list[str] | None,
+        cv: CVType | None,
         model_task_type: ModelTaskType,
-        logger: Optional[logging.Logger] = None,
+        logger: logging.Logger | None = None,
         bundle: ResourceBundle = bundle,
     ) -> pd.DataFrame:
         original_index_name = df.index.name
@@ -4296,7 +4181,7 @@ if response.status_code == 200:
 
         return df
 
-    def __add_country_code(self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> pd.DataFrame:
+    def __add_country_code(self, df: pd.DataFrame, search_keys: dict[str, SearchKey]) -> pd.DataFrame:
         self.country_added = False
 
         if self.country_code is not None and SearchKey.COUNTRY not in search_keys.values():
@@ -4314,7 +4199,7 @@ if response.status_code == 200:
     def __enrich(
         self,
         input_df: pd.DataFrame,
-        result_features: Optional[pd.DataFrame],
+        result_features: pd.DataFrame | None,
         how: str = "inner",
         drop_system_record_id=True,
     ) -> pd.DataFrame:
@@ -4422,7 +4307,7 @@ if response.status_code == 200:
 
         return importances
 
-    def __get_categorical_features(self) -> List[str]:
+    def __get_categorical_features(self) -> list[str]:
         features_meta = self._search_task.get_all_features_metadata_v2()
         if features_meta is None:
             raise Exception(self.bundle.get("missing_features_meta"))
@@ -4433,11 +4318,13 @@ if response.status_code == 200:
         self,
         trace_id: str,
         clients_features_df: pd.DataFrame,
-        updated_shaps: Optional[Dict[str, float]] = None,
+        updated_shaps: dict[str, float] | None = None,
+        update_selected_features: bool = True,
         silent=False,
     ):
         if self._search_task is None:
             raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
+        selected_features = self._search_task.get_selected_features(trace_id)
         features_meta = self._search_task.get_all_features_metadata_v2()
         if features_meta is None:
             raise Exception(self.bundle.get("missing_features_meta"))
@@ -4451,8 +4338,6 @@ if response.status_code == 200:
 
         self.feature_names_ = []
         self.external_source_feature_names = []
-        self.zero_shap_client_features = []
-        self.unstable_client_features = []
         self.feature_importances_ = []
         features_info = []
         features_info_without_links = []
@@ -4460,12 +4345,19 @@ if response.status_code == 200:
 
         original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}
 
+        selected_features_meta = []
         for feature_meta in features_meta:
             original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
             feature_meta.name = original_name
 
             is_client_feature = original_name in clients_features_df.columns
 
+            if selected_features is not None and feature_meta.name not in selected_features:
+                self.logger.info(f"Feature {feature_meta.name} is not selected before and skipped")
+                continue
+
+            selected_features_meta.append(feature_meta)
+
             # Show and update shap values for client features only if select_features is True
             if updated_shaps is not None and (not is_client_feature or self.fit_select_features):
                 updating_shap = updated_shaps.get(feature_meta.name)
@@ -4477,9 +4369,9 @@ if response.status_code == 200:
                     updating_shap = 0.0
                 feature_meta.shap_value = updating_shap
 
-        features_meta.sort(key=lambda m: (-m.shap_value, m.name))
+        selected_features_meta.sort(key=lambda m: (-m.shap_value, m.name))
 
-        for feature_meta in features_meta:
+        for feature_meta in selected_features_meta:
             original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
             is_client_feature = original_name in clients_features_df.columns
 
@@ -4490,15 +4382,11 @@ if response.status_code == 200:
             if original_name in self.psi_values:
                 feature_meta.psi_value = self.psi_values[original_name]
             else:
-                if is_client_feature and self.fit_select_features:
-                    self.unstable_client_features.append(original_name)
                 continue
 
             # TODO make a decision about selected features based on special flag from mlb
 
             if original_shaps.get(feature_meta.name, 0.0) == 0.0:
-                if is_client_feature and self.fit_select_features:
-                    self.zero_shap_client_features.append(original_name)
                 continue
 
             # Use only important features
@@ -4525,6 +4413,9 @@ if response.status_code == 200:
             features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
             internal_features_info.append(feature_info.to_internal_row(self.bundle))
 
+        if update_selected_features:
+            self._search_task.update_selected_features(trace_id, self.feature_names_)
+
         if len(features_info) > 0:
             self.features_info = pd.DataFrame(features_info)
             if self.features_info[self.bundle.get("features_info_psi")].isna().all():
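
`__prepare_feature_importances` now begins by reading the previously persisted selection from the search task (`get_selected_features`), skips any feature outside it, and, unless called with `update_selected_features=False`, writes the surviving `feature_names_` back via `update_selected_features`. A schematic of that read-filter-write round trip; `task` and `Meta` are stand-ins for the `SearchTask` client and feature metadata used above:

```python
from dataclasses import dataclass

@dataclass
class Meta:
    name: str
    shap_value: float

def prepare_importances(task, trace_id: str, metas: list[Meta], persist: bool = True) -> list[str]:
    selected = task.get_selected_features(trace_id)  # None when nothing persisted yet
    kept = [m for m in metas if selected is None or m.name in selected]
    kept.sort(key=lambda m: (-m.shap_value, m.name))  # same ordering as the diff
    names = [m.name for m in kept]
    if persist:
        task.update_selected_features(trace_id, names)
    return names
```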
@@ -4652,32 +4543,10 @@ if response.status_code == 200:
                 )
             )
 
-    def __filtered_importance_names(
-        self, importance_threshold: Optional[float], max_features: Optional[int], trace_id: str, df: pd.DataFrame
-    ) -> List[str]:
-        # get features importance from server
-        filtered_importances = self.__get_features_importance_from_server(trace_id, df)
-
-        if len(filtered_importances) == 0:
-            return []
-
-        if importance_threshold is not None:
-            filtered_importances = [
-                (name, importance)
-                for name, importance in filtered_importances.items()
-                if importance > importance_threshold
-            ]
-        if max_features is not None:
-            filtered_importances = list(filtered_importances)[:max_features]
-        if len(filtered_importances) == 0:
-            return []
-        filtered_importance_names, _ = zip(*filtered_importances)
-        return list(filtered_importance_names)
-
     def __prepare_search_keys(
         self,
         x: pd.DataFrame,
-        search_keys: Dict[str, SearchKey],
+        search_keys: dict[str, SearchKey],
         is_demo_dataset: bool,
         is_transform=False,
         silent_mode=False,
@@ -4788,20 +4657,16 @@ if response.status_code == 200:
 
     def __show_metrics(
        self,
-        scoring: Union[Callable, str, None],
-        estimator: Optional[Any],
-        importance_threshold: Optional[float],
-        max_features: Optional[int],
-        remove_outliers_calc_metrics: Optional[bool],
+        scoring: Callable | str | None,
+        estimator: Any | None,
+        remove_outliers_calc_metrics: bool | None,
         trace_id: str,
-        progress_bar: Optional[ProgressBar] = None,
-        progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
+        progress_bar: ProgressBar | None = None,
+        progress_callback: Callable[[SearchProgress], Any] | None = None,
     ):
         self.metrics = self.calculate_metrics(
             scoring=scoring,
             estimator=estimator,
-            importance_threshold=importance_threshold,
-            max_features=max_features,
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
             trace_id=trace_id,
             internal_call=True,
@@ -4812,22 +4677,15 @@ if response.status_code == 200:
         msg = self.bundle.get("quality_metrics_header")
         display_html_dataframe(self.metrics, self.metrics, msg)
 
-    def __show_selected_features(self, search_keys: Dict[str, SearchKey]):
-        search_key_names = [col for col, tpe in search_keys.items() if tpe != SearchKey.CUSTOM_KEY]
-        if self.fit_columns_renaming:
-            search_key_names = sorted(set([self.fit_columns_renaming.get(col, col) for col in search_key_names]))
-        msg = self.bundle.get("features_info_header").format(len(self.feature_names_), search_key_names)
-
+    def __show_selected_features(self):
         try:
             _ = get_ipython()  # type: ignore
 
-            print(Format.GREEN + Format.BOLD + msg + Format.END)
-            self.logger.info(msg)
             if len(self.feature_names_) > 0:
                 self.features_info_display_handle = display_html_dataframe(
                     self.features_info,
                     self._features_info_without_links,
-                    self.bundle.get("relevant_features_header"),
+                    self.bundle.get("relevant_features_header").format(len(self.feature_names_)),
                     display_id=f"features_info_{uuid.uuid4()}",
                 )
 
@@ -4838,14 +4696,23 @@ if response.status_code == 200:
                     self.bundle.get("relevant_data_sources_header"),
                     display_id=f"data_sources_{uuid.uuid4()}",
                 )
+
+                autofe_description = self.get_autofe_features_description()
+                if autofe_description is not None and len(autofe_description) > 0:
+                    self.logger.info(f"AutoFE descriptions: {autofe_description}")
+                    self.autofe_features_display_handle = display_html_dataframe(
+                        df=autofe_description,
+                        internal_df=autofe_description,
+                        header=self.bundle.get("autofe_descriptions_header"),
+                        display_id=f"autofe_descriptions_{uuid.uuid4()}",
+                    )
             else:
                 msg = self.bundle.get("features_info_zero_important_features")
                 self.__log_warning(msg, show_support_link=True)
         except (ImportError, NameError):
-            print(msg)
             print(self._internal_features_info)
 
-    def __show_report_button(self, display_id: Optional[str] = None, display_handle=None):
+    def __show_report_button(self, display_id: str | None = None, display_handle=None):
         try:
             return prepare_and_show_report(
                 relevant_features_df=self._features_info_without_links,
@@ -4861,40 +4728,14 @@ if response.status_code == 200:
         except Exception:
             pass
 
-    def __validate_importance_threshold(self, importance_threshold: Optional[float]) -> float:
-        try:
-            return float(importance_threshold) if importance_threshold is not None else 0.0
-        except ValueError:
-            self.logger.exception(f"Invalid importance_threshold provided: {importance_threshold}")
-            raise ValidationError(self.bundle.get("invalid_importance_threshold"))
-
-    def __validate_max_features(self, max_features: Optional[int]) -> int:
-        try:
-            return int(max_features) if max_features is not None else 400
-        except ValueError:
-            self.logger.exception(f"Invalid max_features provided: {max_features}")
-            raise ValidationError(self.bundle.get("invalid_max_features"))
-
-    def __filtered_enriched_features(
-        self,
-        importance_threshold: Optional[float],
-        max_features: Optional[int],
-        trace_id: str,
-        df: pd.DataFrame,
-    ) -> List[str]:
-        importance_threshold = self.__validate_importance_threshold(importance_threshold)
-        max_features = self.__validate_max_features(max_features)
-
-        return self.__filtered_importance_names(importance_threshold, max_features, trace_id, df)
-
     def __detect_missing_search_keys(
         self,
         df: pd.DataFrame,
-        search_keys: Dict[str, SearchKey],
+        search_keys: dict[str, SearchKey],
         is_demo_dataset: bool,
         silent_mode=False,
         is_transform=False,
-    ) -> Dict[str, SearchKey]:
+    ) -> dict[str, SearchKey]:
         sample = df.head(100)
 
         def check_need_detect(search_key: SearchKey):
@@ -5011,7 +4852,7 @@ if response.status_code == 200:
         except Exception:
             self.logger.exception("Failed to dump python libs")
 
-    def __display_support_link(self, link_text: Optional[str] = None):
+    def __display_support_link(self, link_text: str | None = None):
         support_link = self.bundle.get("support_link")
         link_text = link_text or self.bundle.get("support_text")
         try:
@@ -5038,9 +4879,9 @@ if response.status_code == 200:
     def dump_input(
         self,
         trace_id: str,
-        X: Union[pd.DataFrame, pd.Series],
-        y: Union[pd.DataFrame, pd.Series, None] = None,
-        eval_set: Union[Tuple, None] = None,
+        X: pd.DataFrame | pd.Series,
+        y: pd.DataFrame | pd.Series | None = None,
+        eval_set: tuple | None = None,
     ):
         def dump_task(X_, y_, eval_set_):
             with MDC(trace_id=trace_id):
@@ -5131,28 +4972,10 @@ def is_frames_equal(first, second, bundle: ResourceBundle) -> bool:
         raise ValidationError(bundle.get("x_and_eval_x_diff_types").format(type(first), type(second)))
 
 
-def drop_duplicates(df: Union[pd.DataFrame, np.ndarray]) -> pd.DataFrame:
+def drop_duplicates(df: pd.DataFrame | np.ndarray | Any) -> pd.DataFrame:
     if isinstance(df, pd.DataFrame):
         return df.drop_duplicates()
     elif isinstance(df, np.ndarray):
         return pd.DataFrame(df).drop_duplicates()
     else:
         return df
-
-
-def hash_input(X: pd.DataFrame, y: Optional[pd.Series] = None, eval_set: Optional[List[Tuple]] = None) -> str:
-    hashed_objects = []
-    try:
-        hashed_objects.append(pd.util.hash_pandas_object(X, index=False).values)
-        if y is not None:
-            hashed_objects.append(pd.util.hash_pandas_object(y, index=False).values)
-        if eval_set is not None:
-            if isinstance(eval_set, tuple):
-                eval_set = [eval_set]
-            for eval_X, eval_y in eval_set:
-                hashed_objects.append(pd.util.hash_pandas_object(eval_X, index=False).values)
-                hashed_objects.append(pd.util.hash_pandas_object(eval_y, index=False).values)
-        common_hash = hashlib.sha256(np.concatenate(hashed_objects)).hexdigest()
-        return common_hash
-    except Exception:
-        return ""
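
The module-level `hash_input` helper removed above fingerprinted the train and eval frames by hashing rows with `pd.util.hash_pandas_object` and digesting the concatenation with SHA-256, returning an empty string on any failure. For readers who relied on it, a standalone equivalent reconstructed from the deleted lines (only the type hints are modernized to match the rest of this release):

```python
import hashlib

import numpy as np
import pandas as pd


def hash_input(X: pd.DataFrame, y: pd.Series | None = None,
               eval_set: list[tuple] | None = None) -> str:
    """SHA-256 fingerprint of X, y and eval_set; empty string on failure."""
    hashed_objects = []
    try:
        hashed_objects.append(pd.util.hash_pandas_object(X, index=False).values)
        if y is not None:
            hashed_objects.append(pd.util.hash_pandas_object(y, index=False).values)
        if eval_set is not None:
            if isinstance(eval_set, tuple):
                eval_set = [eval_set]
            for eval_X, eval_y in eval_set:
                hashed_objects.append(pd.util.hash_pandas_object(eval_X, index=False).values)
                hashed_objects.append(pd.util.hash_pandas_object(eval_y, index=False).values)
        return hashlib.sha256(np.concatenate(hashed_objects)).hexdigest()
    except Exception:
        return ""
```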