upgini 1.2.113a3974.dev1__py3-none-any.whl → 1.2.114__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,6 @@
  import dataclasses
  import datetime
  import gc
- import hashlib
- import itertools
  import json
  import logging
  import os
@@ -14,7 +12,7 @@ from collections import Counter
  from copy import deepcopy
  from dataclasses import dataclass
  from threading import Thread
- from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
+ from typing import Any, Callable, Optional, Set, Union

  import numpy as np
  import pandas as pd
@@ -101,6 +99,7 @@ from upgini.utils.email_utils import (
  from upgini.utils.feature_info import FeatureInfo, _round_shap_value
  from upgini.utils.features_validator import FeaturesValidator
  from upgini.utils.format import Format
+ from upgini.utils.hash_utils import file_hash, hash_input
  from upgini.utils.ip_utils import IpSearchKeyConverter
  from upgini.utils.phone_utils import PhoneSearchKeyDetector
  from upgini.utils.postal_code_utils import PostalCodeSearchKeyDetector
@@ -112,9 +111,11 @@ except Exception:
  CustomFallbackProgressBar as ProgressBar,
  )

- from upgini.utils.sample_utils import SampleColumns, SampleConfig, _num_samples, sample
+ from upgini.utils.config import SampleConfig
+ from upgini.utils.psi import calculate_features_psi, calculate_sparsity_psi
+ from upgini.utils.sample_utils import SampleColumns, _num_samples, sample
  from upgini.utils.sort import sort_columns
- from upgini.utils.target_utils import calculate_psi, define_task
+ from upgini.utils.target_utils import calculate_psi, define_task, is_imbalanced
  from upgini.utils.warning_counter import WarningCounter
  from upgini.version_validator import validate_version

@@ -130,7 +131,7 @@ class FeaturesEnricher(TransformerMixin):
  Parameters
  ----------
  search_keys: dict of str->SearchKey or int->SearchKey
- Dictionary with column names or indices mapping to key types.
+ dictionary with column names or indices mapping to key types.
  Each of this columns will be used as a search key to find features.

  country_code: str, optional (default=None)
@@ -162,7 +163,7 @@ class FeaturesEnricher(TransformerMixin):
  Custom loss function to use for feature selection and metrics calculation.

  shared_datasets: list of str, optional (default=None)
- List of private shared dataset ids for custom search
+ list of private shared dataset ids for custom search
  """

  TARGET_NAME = "target"
@@ -206,32 +207,32 @@ class FeaturesEnricher(TransformerMixin):

  def __init__(
  self,
- search_keys: Optional[Dict[str, SearchKey]] = None,
+ search_keys: Optional[dict[str, SearchKey]] = None,
  country_code: Optional[str] = None,
  model_task_type: Optional[Union[ModelTaskType, str]] = None,
  api_key: Optional[str] = None,
  endpoint: Optional[str] = None,
  search_id: Optional[str] = None,
- shared_datasets: Optional[List[str]] = None,
+ shared_datasets: Optional[list[str]] = None,
  runtime_parameters: Optional[RuntimeParameters] = None,
  date_format: Optional[str] = None,
  random_state: int = 42,
  cv: Optional[CVType] = None,
  loss: Optional[str] = None,
  autodetect_search_keys: bool = True,
- generate_features: Optional[List[str]] = None,
- columns_for_online_api: Optional[List[str]] = None,
+ generate_features: Optional[list[str]] = None,
+ columns_for_online_api: Optional[list[str]] = None,
  round_embeddings: Optional[int] = None,
  logs_enabled: bool = True,
  raise_validation_error: bool = True,
- exclude_columns: Optional[List[str]] = None,
+ exclude_columns: Optional[list[str]] = None,
  baseline_score_column: Optional[Any] = None,
  client_ip: Optional[str] = None,
  client_visitorid: Optional[str] = None,
  custom_bundle_config: Optional[str] = None,
  add_date_if_missing: bool = True,
  disable_force_downsampling: bool = False,
- id_columns: Optional[List[str]] = None,
+ id_columns: Optional[list[str]] = None,
  generate_search_key_features: bool = True,
  sample_config: Optional[SampleConfig] = None,
  print_trace_id: bool = False,
@@ -257,21 +258,21 @@ class FeaturesEnricher(TransformerMixin):
  self.logger.warning(msg)
  print(msg)

- self.passed_features: List[str] = []
+ self.passed_features: list[str] = []
  self.df_with_original_index: Optional[pd.DataFrame] = None
- self.fit_columns_renaming: Optional[Dict[str, str]] = None
+ self.fit_columns_renaming: Optional[dict[str, str]] = None
  self.country_added = False
- self.fit_generated_features: List[str] = []
+ self.fit_generated_features: list[str] = []
  self.fit_dropped_features: Set[str] = set()
  self.fit_search_keys = search_keys
  self.warning_counter = WarningCounter()
  self.X: Optional[pd.DataFrame] = None
  self.y: Optional[pd.Series] = None
- self.eval_set: Optional[List[Tuple]] = None
- self.autodetected_search_keys: Dict[str, SearchKey] = {}
+ self.eval_set: Optional[list[tuple]] = None
+ self.autodetected_search_keys: dict[str, SearchKey] = {}
  self.imbalanced = False
  self.fit_select_features = True
- self.__cached_sampled_datasets: Dict[str, Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict, Dict]] = (
+ self.__cached_sampled_datasets: dict[str, tuple[pd.DataFrame, pd.DataFrame, pd.Series, dict, dict, dict]] = (
  dict()
  )

@@ -296,8 +297,8 @@ class FeaturesEnricher(TransformerMixin):
  self.metrics: Optional[pd.DataFrame] = None
  self.feature_names_ = []
  self.external_source_feature_names = []
- self.zero_shap_client_features = []
  self.feature_importances_ = []
+ self.psi_values: Optional[dict[str, float]] = None
  self.search_id = search_id
  self.disable_force_downsampling = disable_force_downsampling
  self.print_trace_id = print_trace_id
@@ -317,7 +318,8 @@ class FeaturesEnricher(TransformerMixin):
  x_columns = [c.name for c in file_metadata.columns]
  self.fit_columns_renaming = {c.name: c.originalName for c in file_metadata.columns}
  df = pd.DataFrame(columns=x_columns)
- self.__prepare_feature_importances(trace_id, df, silent=True)
+ self.__prepare_feature_importances(trace_id, df, silent=True, update_selected_features=False)
+ self.__show_selected_features()
  # TODO validate search_keys with search_keys from file_metadata
  print(self.bundle.get("search_by_task_id_finish"))
  self.logger.debug(f"Successfully initialized with search_id: {search_id}")
@@ -395,37 +397,54 @@ class FeaturesEnricher(TransformerMixin):

  api_key = property(_get_api_key, _set_api_key)

- @staticmethod
- def _check_eval_set(eval_set, X, bundle: ResourceBundle):
+ def _check_eval_set(self, eval_set, X):
  checked_eval_set = []
- if eval_set is not None and isinstance(eval_set, tuple):
+ if eval_set is None:
+ return checked_eval_set
+ if isinstance(eval_set, tuple):
  eval_set = [eval_set]
- if eval_set is not None and not isinstance(eval_set, list):
- raise ValidationError(bundle.get("unsupported_type_eval_set").format(type(eval_set)))
- for eval_pair in eval_set or []:
+ if not isinstance(eval_set, list):
+ raise ValidationError(self.bundle.get("unsupported_type_eval_set").format(type(eval_set)))
+ for i, eval_pair in enumerate(eval_set or [], 1):
+ # Handle OOT
+ if isinstance(eval_pair, pd.DataFrame):
+ empty_target = pd.Series([np.nan] * len(eval_pair), index=eval_pair.index)
+ eval_pair = (eval_pair, empty_target)
+ elif isinstance(eval_pair, tuple) and len(eval_pair) == 1:
+ empty_target = pd.Series([np.nan] * len(eval_pair[0]), index=eval_pair[0].index)
+ eval_pair = (eval_pair[0], empty_target)
+
  if not isinstance(eval_pair, tuple) or len(eval_pair) != 2:
- raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
- if not is_frames_equal(X, eval_pair[0], bundle):
+ raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
+ if eval_pair[1] is None:
+ empty_target = pd.Series([np.nan] * len(eval_pair[0]), index=eval_pair[0].index)
+ eval_pair = (eval_pair[0], empty_target)
+
+ if not is_frames_equal(X, eval_pair[0], self.bundle):
  checked_eval_set.append(eval_pair)
+ else:
+ msg = f"Eval set {i} is equal to train set and will be ignored"
+ self.logger.warning(msg)
+ print(msg)
  return checked_eval_set

  def fit(
  self,
  X: Union[pd.DataFrame, pd.Series, np.ndarray],
- y: Union[pd.Series, np.ndarray, List],
- eval_set: Optional[Union[List[tuple], tuple]] = None,
+ y: Union[pd.Series, np.ndarray, list],
+ eval_set: Optional[Union[list[tuple], tuple]] = None,
  *args,
- exclude_features_sources: Optional[List[str]] = None,
+ exclude_features_sources: Optional[list[str]] = None,
  calculate_metrics: Optional[bool] = None,
  estimator: Optional[Any] = None,
  scoring: Union[Callable, str, None] = None,
- importance_threshold: Optional[float] = None,
- max_features: Optional[int] = None,
  remove_outliers_calc_metrics: Optional[bool] = None,
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
  search_id_callback: Optional[Callable[[str], Any]] = None,
  select_features: bool = True,
  auto_fe_parameters: Optional[AutoFEParameters] = None,
+ stability_threshold: float = 0.2,
+ stability_agg_func: str = "max",
  **kwargs,
  ):
  """Fit to data.
@@ -440,14 +459,8 @@ class FeaturesEnricher(TransformerMixin):
  y: array-like of shape (n_samples,)
  Target values.

- eval_set: List[tuple], optional (default=None)
- List of pairs (X, y) for validation.
-
- importance_threshold: float, optional (default=None)
- Minimum SHAP value to select a feature. Default value is 0.0.
-
- max_features: int, optional (default=None)
- Maximum number of most important features to select. If None, the number is unlimited.
+ eval_set: list[tuple], optional (default=None)
+ list of pairs (X, y) for validation.

  calculate_metrics: bool, optional (default=None)
  Whether to calculate and show metrics.
@@ -465,6 +478,13 @@ class FeaturesEnricher(TransformerMixin):
  select_features: bool, optional (default=False)
  If True, return only selected features both from input and data sources.
  Otherwise, return all features from input and only selected features from data sources.
+
+ stability_threshold: float, optional (default=0.2)
+ Stability threshold for selected features PSI calculation. If PSI is less than this threshold,
+ then feature will be dropped.
+
+ stability_agg_func: str, optional (default="max")
+ Function to aggregate stability values. Can be "max", "min", "mean".
  """
  trace_id = str(uuid.uuid4())
  if self.print_trace_id:
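The two parameters documented in the hunk above drive the new PSI-based stability filtering introduced in this release (see `_select_features_by_psi` and `_check_stability` later in the diff, where features whose aggregated PSI exceeds the threshold are reported as unstable and dropped). A hedged sketch, reusing the hypothetical `enricher`, `train` and `oot` objects from the previous example:

```python
# stability_threshold / stability_agg_func are new fit() keyword arguments in 1.2.114.
enricher.fit(
    train[["date"]],
    train["target"],
    eval_set=[oot],             # a dated OOT eval set is needed for the PSI check
    stability_threshold=0.1,    # stricter than the 0.2 default
    stability_agg_func="mean",  # aggregate per-interval PSI by mean instead of max
)
```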
@@ -500,7 +520,7 @@ class FeaturesEnricher(TransformerMixin):
  try:
  self.X = X
  self.y = y
- self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
+ self.eval_set = self._check_eval_set(eval_set, X)
  self.dump_input(trace_id, X, y, self.eval_set)
  self.__set_select_features(select_features)
  self.__inner_fit(
@@ -514,8 +534,8 @@ class FeaturesEnricher(TransformerMixin):
  calculate_metrics=calculate_metrics,
  estimator=estimator,
  scoring=scoring,
- importance_threshold=importance_threshold,
- max_features=max_features,
+ stability_threshold=stability_threshold,
+ stability_agg_func=stability_agg_func,
  remove_outliers_calc_metrics=remove_outliers_calc_metrics,
  auto_fe_parameters=auto_fe_parameters,
  progress_callback=progress_callback,
@@ -560,13 +580,11 @@ class FeaturesEnricher(TransformerMixin):
  def fit_transform(
  self,
  X: Union[pd.DataFrame, pd.Series, np.ndarray],
- y: Union[pd.DataFrame, pd.Series, np.ndarray, List],
- eval_set: Optional[Union[List[tuple], tuple]] = None,
+ y: Union[pd.DataFrame, pd.Series, np.ndarray, list],
+ eval_set: Optional[Union[list[tuple], tuple]] = None,
  *args,
- exclude_features_sources: Optional[List[str]] = None,
+ exclude_features_sources: Optional[list[str]] = None,
  keep_input: bool = True,
- importance_threshold: Optional[float] = None,
- max_features: Optional[int] = None,
  calculate_metrics: Optional[bool] = None,
  scoring: Union[Callable, str, None] = None,
  estimator: Optional[Any] = None,
@@ -574,6 +592,8 @@ class FeaturesEnricher(TransformerMixin):
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
  select_features: bool = True,
  auto_fe_parameters: Optional[AutoFEParameters] = None,
+ stability_threshold: float = 0.2,
+ stability_agg_func: str = "max",
  **kwargs,
  ) -> pd.DataFrame:
  """Fit to data, then transform it.
@@ -589,21 +609,12 @@ class FeaturesEnricher(TransformerMixin):
  y: array-like of shape (n_samples,)
  Target values.

- eval_set: List[tuple], optional (default=None)
- List of pairs (X, y) for validation.
+ eval_set: list[tuple], optional (default=None)
+ list of pairs (X, y) for validation.

  keep_input: bool, optional (default=True)
  If True, copy original input columns to the output dataframe.

- importance_threshold: float, optional (default=None)
- Minimum SHAP value to select a feature. Default value is 0.0.
-
- max_features: int, optional (default=None)
- Maximum number of most important features to select. If None, the number is unlimited.
-
- calculate_metrics: bool, optional (default=None)
- Whether to calculate and show metrics.
-
  estimator: sklearn-compatible estimator, optional (default=None)
  Custom estimator for metrics calculation.

@@ -618,6 +629,13 @@ class FeaturesEnricher(TransformerMixin):
  If True, return only selected features both from input and data sources.
  Otherwise, return all features from input and only selected features from data sources.

+ stability_threshold: float, optional (default=0.2)
+ Stability threshold for selected features PSI calculation. If PSI is less than this threshold,
+ then feature will be dropped.
+
+ stability_agg_func: str, optional (default="max")
+ Function to aggregate stability values. Can be "max", "min", "mean".
+
  Returns
  -------
  X_new: pandas.DataFrame of shape (n_samples, n_features_new)
@@ -655,7 +673,7 @@ class FeaturesEnricher(TransformerMixin):
  try:
  self.X = X
  self.y = y
- self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
+ self.eval_set = self._check_eval_set(eval_set, X)
  self.__set_select_features(select_features)
  self.dump_input(trace_id, X, y, self.eval_set)

@@ -673,8 +691,8 @@ class FeaturesEnricher(TransformerMixin):
  calculate_metrics=calculate_metrics,
  scoring=scoring,
  estimator=estimator,
- importance_threshold=importance_threshold,
- max_features=max_features,
+ stability_threshold=stability_threshold,
+ stability_agg_func=stability_agg_func,
  remove_outliers_calc_metrics=remove_outliers_calc_metrics,
  auto_fe_parameters=auto_fe_parameters,
  progress_callback=progress_callback,
@@ -717,8 +735,6 @@ class FeaturesEnricher(TransformerMixin):
  X,
  exclude_features_sources=exclude_features_sources,
  keep_input=keep_input,
- importance_threshold=importance_threshold,
- max_features=max_features,
  trace_id=trace_id,
  silent_mode=True,
  progress_bar=progress_bar,
@@ -732,10 +748,8 @@ class FeaturesEnricher(TransformerMixin):
  X: pd.DataFrame,
  *args,
  y: Optional[pd.Series] = None,
- exclude_features_sources: Optional[List[str]] = None,
+ exclude_features_sources: Optional[list[str]] = None,
  keep_input: bool = True,
- importance_threshold: Optional[float] = None,
- max_features: Optional[int] = None,
  trace_id: Optional[str] = None,
  metrics_calculation: bool = False,
  silent_mode=False,
@@ -756,12 +770,6 @@ class FeaturesEnricher(TransformerMixin):
  keep_input: bool, optional (default=True)
  If True, copy original input columns to the output dataframe.

- importance_threshold: float, optional (default=None)
- Minimum SHAP value to select a feature. Default value is 0.0.
-
- max_features: int, optional (default=None)
- Maximum number of most important features to select. If None, the number is unlimited.
-
  Returns
  -------
  X_new: pandas.DataFrame of shape (n_samples, n_features_new)
@@ -798,8 +806,6 @@ class FeaturesEnricher(TransformerMixin):
  X,
  y=y,
  exclude_features_sources=exclude_features_sources,
- importance_threshold=importance_threshold,
- max_features=max_features,
  metrics_calculation=metrics_calculation,
  silent_mode=silent_mode,
  progress_bar=progress_bar,
@@ -854,15 +860,13 @@ class FeaturesEnricher(TransformerMixin):
  def calculate_metrics(
  self,
  X: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
- y: Union[pd.DataFrame, pd.Series, np.ndarray, List, None] = None,
- eval_set: Optional[Union[List[tuple], tuple]] = None,
+ y: Union[pd.DataFrame, pd.Series, np.ndarray, list, None] = None,
+ eval_set: Optional[Union[list[tuple], tuple]] = None,
  *args,
  scoring: Union[Callable, str, None] = None,
  cv: Union[BaseCrossValidator, CVType, None] = None,
  estimator=None,
- exclude_features_sources: Optional[List[str]] = None,
- importance_threshold: Optional[float] = None,
- max_features: Optional[int] = None,
+ exclude_features_sources: Optional[list[str]] = None,
  remove_outliers_calc_metrics: Optional[bool] = None,
  trace_id: Optional[str] = None,
  internal_call: bool = False,
@@ -880,8 +884,8 @@ class FeaturesEnricher(TransformerMixin):
  y: array-like of shape (n_samples,), optional (default=None)
  Target values. If X not passed then y from fit will be used

- eval_set: List[tuple], optional (default=None)
- List of pairs (X, y) for validation. If X not passed then eval_set from fit will be used
+ eval_set: list[tuple], optional (default=None)
+ list of pairs (X, y) for validation. If X not passed then eval_set from fit will be used

  scoring: string or callable, optional (default=None)
  A string or a scorer callable object / function with signature scorer(estimator, X, y).
@@ -893,12 +897,6 @@ class FeaturesEnricher(TransformerMixin):
  estimator: sklearn-compatible estimator, optional (default=None)
  Custom estimator for metrics calculation. If not passed then CatBoost will be used.

- importance_threshold: float, optional (default=None)
- Minimum SHAP value to select a feature. Default value is 0.0.
-
- max_features: int, optional (default=None)
- Maximum number of most important features to select. If None, the number is unlimited.
-
  remove_outliers_calc_metrics, optional (default=True)
  If True then rows with target ouliers will be dropped on metrics calculation

@@ -929,7 +927,7 @@ class FeaturesEnricher(TransformerMixin):
  effective_X = X if X is not None else self.X
  effective_y = y if y is not None else self.y
  effective_eval_set = eval_set if eval_set is not None else self.eval_set
- effective_eval_set = self._check_eval_set(effective_eval_set, effective_X, self.bundle)
+ effective_eval_set = self._check_eval_set(effective_eval_set, effective_X)

  if (
  self._search_task is None
@@ -941,7 +939,7 @@ class FeaturesEnricher(TransformerMixin):
  raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))

  validated_X, validated_y, validated_eval_set = self._validate_train_eval(
- effective_X, effective_y, effective_eval_set
+ effective_X, effective_y, effective_eval_set, silent=internal_call
  )

  if self.X is None:
@@ -961,8 +959,6 @@ class FeaturesEnricher(TransformerMixin):
  validated_eval_set,
  exclude_features_sources=exclude_features_sources,
  cv=cv if cv is not None else self.cv,
- importance_threshold=importance_threshold,
- max_features=max_features,
  scoring=scoring,
  estimator=estimator,
  remove_outliers_calc_metrics=remove_outliers_calc_metrics,
@@ -979,46 +975,43 @@ class FeaturesEnricher(TransformerMixin):
  return None

  cat_features_from_backend = self.__get_categorical_features()
+ # Convert to original names
+ cat_features_from_backend = [self.fit_columns_renaming.get(c, c) for c in cat_features_from_backend]
  client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
  estimator, validated_X, self.search_keys
  )
+ # Exclude id columns from cat_features
  if self.id_columns and self.id_columns_encoder is not None:
  if cat_features_from_backend:
  cat_features_from_backend = [
- c
- for c in cat_features_from_backend
- if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
+ c for c in cat_features_from_backend if c not in self.id_columns_encoder.feature_names_in_
  ]
  if client_cat_features:
  client_cat_features = [
- c
- for c in client_cat_features
- if self.fit_columns_renaming.get(c, c) not in self.id_columns_encoder.feature_names_in_
+ c for c in client_cat_features if c not in self.id_columns_encoder.feature_names_in_
  ]
  for cat_feature in cat_features_from_backend:
- original_cat_feature = self.fit_columns_renaming.get(cat_feature)
- if original_cat_feature in self.search_keys:
- if self.search_keys[original_cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
- search_keys_for_metrics.append(original_cat_feature)
+ if cat_feature in self.search_keys:
+ if self.search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
+ search_keys_for_metrics.append(cat_feature)
  else:
- self.logger.warning(self.bundle.get("cat_feature_search_key").format(original_cat_feature))
+ self.logger.warning(self.bundle.get("cat_feature_search_key").format(cat_feature))
  search_keys_for_metrics.extend([c for c in self.id_columns or [] if c not in search_keys_for_metrics])
  self.logger.info(f"Search keys for metrics: {search_keys_for_metrics}")

- prepared_data = self._prepare_data_for_metrics(
+ prepared_data = self._get_cached_enriched_data(
  trace_id=trace_id,
  X=X,
  y=y,
  eval_set=eval_set,
  exclude_features_sources=exclude_features_sources,
- importance_threshold=importance_threshold,
- max_features=max_features,
  remove_outliers_calc_metrics=remove_outliers_calc_metrics,
  cv_override=cv,
  search_keys_for_metrics=search_keys_for_metrics,
  progress_bar=progress_bar,
  progress_callback=progress_callback,
  client_cat_features=client_cat_features,
+ is_for_metrics=True,
  )
  if prepared_data is None:
  return None
@@ -1033,23 +1026,9 @@ class FeaturesEnricher(TransformerMixin):
  groups,
  _cv,
  columns_renaming,
+ _,
  ) = prepared_data

- # rename cat_features
- if client_cat_features:
- for new_c, old_c in columns_renaming.items():
- if old_c in client_cat_features:
- client_cat_features.remove(old_c)
- client_cat_features.append(new_c)
- for cat_feature in client_cat_features:
- if cat_feature not in fitting_X.columns:
- self.logger.error(
- f"Client cat_feature `{cat_feature}` not found in"
- f" x columns: {fitting_X.columns.to_list()}"
- )
- else:
- client_cat_features = []
-
  # rename baseline_score_column
  reversed_renaming = {v: k for k, v in columns_renaming.items()}
  baseline_score_column = self.baseline_score_column
@@ -1074,9 +1053,9 @@ class FeaturesEnricher(TransformerMixin):
  self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)

  has_date = self._get_date_column(search_keys) is not None
- has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
  model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
  cat_features = list(set(client_cat_features + cat_features_from_backend))
+ has_time = has_date and isinstance(_cv, TimeSeriesSplit) or isinstance(_cv, BlockedTimeSeriesSplit)
  baseline_cat_features = [f for f in cat_features if f in fitting_X.columns]
  enriched_cat_features = [f for f in cat_features if f in fitting_enriched_X.columns]
  if len(enriched_cat_features) < len(cat_features):
@@ -1196,8 +1175,6 @@ class FeaturesEnricher(TransformerMixin):
  # max_initial_eval_set_hit_rate = self._search_task.get_max_initial_eval_set_hit_rate_v2()
  if len(fitting_eval_set_dict) > 0:
  for idx in fitting_eval_set_dict.keys():
- # eval_hit_rate = max_initial_eval_set_hit_rate[idx + 1]
-
  (
  eval_X_sorted,
  eval_y_sorted,
@@ -1205,6 +1182,10 @@ class FeaturesEnricher(TransformerMixin):
  enriched_eval_y_sorted,
  ) = fitting_eval_set_dict[idx]

+ if eval_y_sorted.isna().all():
+ # Skip OOT eval set
+ continue
+
  if baseline_estimator is not None:
  self.logger.info(
  f"Calculate baseline {metric} on eval set {idx + 1} "
@@ -1247,17 +1228,14 @@ class FeaturesEnricher(TransformerMixin):
  "quality_metrics_eval_segment"
  ).format(idx + 1),
  self.bundle.get("quality_metrics_rows_header"): _num_samples(
- # effective_eval_set[idx][0]
  # Use actually used for metrics dataset
  eval_X_sorted
  ),
- # self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
  }
  if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
  eval_y_sorted
  ):
  eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
- # np.mean(validated_eval_set[idx][1]), 4
  # Use actually used for metrics dataset
  np.mean(eval_y_sorted),
  4,
@@ -1279,7 +1257,7 @@ class FeaturesEnricher(TransformerMixin):
  metrics.append(eval_metrics)

  if updating_shaps is not None:
- decoded_X = self._decode_id_columns(fitting_X, columns_renaming)
+ decoded_X = self._decode_id_columns(fitting_X)
  self._update_shap_values(trace_id, decoded_X, updating_shaps, silent=not internal_call)

  metrics_df = pd.DataFrame(metrics)
@@ -1330,7 +1308,188 @@ class FeaturesEnricher(TransformerMixin):
  finally:
  self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")

- def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float], silent: bool = False):
+ def _select_features_by_psi(
+ self,
+ trace_id: str,
+ X: Union[pd.DataFrame, pd.Series, np.ndarray],
+ y: Union[pd.DataFrame, pd.Series, np.ndarray, list],
+ eval_set: Optional[Union[list[tuple], tuple]],
+ stability_threshold: float,
+ stability_agg_func: Callable,
+ cv: Union[BaseCrossValidator, CVType, str, None] = None,
+ estimator=None,
+ exclude_features_sources: Optional[list[str]] = None,
+ progress_bar: bool = True,
+ progress_callback: Optional[Callable] = None,
+ ):
+ search_keys = self.search_keys.copy()
+ validated_X, _, validated_eval_set = self._validate_train_eval(X, y, eval_set, silent=True)
+ if isinstance(X, np.ndarray):
+ search_keys = {str(k): v for k, v in search_keys.items()}
+
+ date_column = self._get_date_column(search_keys)
+ has_date = date_column is not None
+ if not has_date:
+ self.logger.info("No date column for OOT PSI calculation")
+ return
+ if not validated_eval_set:
+ self.logger.info("No eval set for OOT PSI calculation")
+ return
+ if validated_X[date_column].nunique() <= 1:
+ self.logger.warning("Constant date for OOT PSI calculation")
+ return
+ if self.cv is not None and self.cv.is_time_series():
+ self.logger.warning("Time series CV is not supported for OOT PSI calculation")
+ return
+
+ cat_features_from_backend = self.__get_categorical_features()
+ cat_features_from_backend = [self.fit_columns_renaming.get(c, c) for c in cat_features_from_backend]
+ client_cat_features, search_keys_for_metrics = self._get_and_validate_client_cat_features(
+ estimator, validated_X, search_keys
+ )
+ if self.id_columns and self.id_columns_encoder is not None:
+ if cat_features_from_backend:
+ cat_features_from_backend = [
+ c for c in cat_features_from_backend if c not in self.id_columns_encoder.feature_names_in_
+ ]
+ if client_cat_features:
+ client_cat_features = [
+ c for c in client_cat_features if c not in self.id_columns_encoder.feature_names_in_
+ ]
+
+ prepared_data = self._get_cached_enriched_data(
+ trace_id=trace_id,
+ X=X,
+ y=y,
+ eval_set=eval_set,
+ exclude_features_sources=exclude_features_sources,
+ remove_outliers_calc_metrics=False,
+ cv_override=cv,
+ search_keys_for_metrics=search_keys_for_metrics,
+ progress_bar=progress_bar,
+ progress_callback=progress_callback,
+ client_cat_features=client_cat_features,
+ )
+ if prepared_data is None:
+ return None
+
+ (
+ validated_X,
+ _,
+ y_sorted,
+ _,
+ _,
+ fitting_eval_set_dict,
+ _,
+ _,
+ _,
+ _,
+ eval_set_dates,
+ ) = prepared_data
+
+ model_task_type = self.model_task_type or define_task(y_sorted, has_date, self.logger, silent=True)
+ cat_features = list(set(client_cat_features + cat_features_from_backend))
+
+ # Drop unstable features
+ unstable_features = self._check_stability(
+ validated_X,
+ validated_eval_set,
+ fitting_eval_set_dict,
+ eval_set_dates,
+ search_keys,
+ stability_threshold,
+ stability_agg_func,
+ cat_features,
+ model_task_type,
+ )
+
+ if unstable_features:
+ msg = f"{len(unstable_features)} feature(s) are unstable: {unstable_features} and will be dropped"
+ self.logger.warning(msg)
+ print(msg)
+
+ def _check_stability(
+ self,
+ X: pd.DataFrame,
+ eval_set: list[tuple[pd.DataFrame, pd.Series]],
+ enriched_eval_set: dict,
+ eval_set_dates: dict[int, pd.Series],
+ search_keys: dict[str, SearchKey],
+ stability_threshold: float,
+ stability_agg_func: str | None,
+ cat_features: list[str],
+ model_task_type: ModelTaskType,
+ ) -> list[str]:
+ # Find latest eval set or earliest if all eval sets are before train set
+ date_column = self._get_date_column(search_keys)
+
+ # Get minimum date from main dataset X
+ main_min_date = X[date_column].dropna().min()
+
+ # Find minimum date for each eval_set and compare with main dataset
+ eval_dates = []
+ for i, (eval_x, _) in enumerate(eval_set):
+ if date_column in eval_x.columns:
+ if len(eval_x) < 1000:
+ self.logger.warning(f"Eval_set {i} has less than 1000 rows. It will be ignored for stability check")
+ continue
+ eval_min_date = eval_x[date_column].dropna().min()
+ eval_max_date = eval_x[date_column].dropna().max()
+ eval_dates.append((i, eval_min_date, eval_max_date))
+
+ if not eval_dates:
+ return []
+
+ # Check if any eval_set has minimum date >= main dataset minimum date
+ later_eval_sets = [(i, min_date, max_date) for i, min_date, max_date in eval_dates if min_date >= main_min_date]
+
+ if later_eval_sets:
+ # If there are eval_sets with date >= main date, choose the one with highest maximum date
+ selected_eval_set_idx = max(later_eval_sets, key=lambda x: x[2])[0]
+ else:
+ # If all eval_sets have dates < main date, choose the one with lowest minimux date
+ selected_eval_set_idx = max(eval_dates, key=lambda x: x[1])[0]
+
+ checking_eval_set = enriched_eval_set[selected_eval_set_idx]
+
+ checking_eval_set_df = (
+ checking_eval_set[2]
+ if checking_eval_set[1] is None or checking_eval_set[1].isna().all()
+ else pd.concat([checking_eval_set[2], checking_eval_set[1].to_frame(TARGET)], axis=1)
+ )
+ checking_eval_set_df = checking_eval_set_df.copy()
+
+ checking_eval_set_df[date_column] = eval_set_dates[selected_eval_set_idx]
+
+ psi_values_sparse = calculate_sparsity_psi(
+ checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
+ )
+
+ self.logger.info(f"PSI values by sparsity: {psi_values_sparse}")
+
+ unstable_by_sparsity = [feature for feature, psi in psi_values_sparse.items() if psi > stability_threshold]
+ if unstable_by_sparsity:
+ self.logger.info(f"Unstable by sparsity features: {sorted(unstable_by_sparsity)}")
+
+ psi_values = calculate_features_psi(
+ checking_eval_set_df, cat_features, date_column, self.logger, model_task_type, stability_agg_func
+ )
+
+ self.logger.info(f"PSI values by value: {psi_values}")
+
+ unstable_by_value = [feature for feature, psi in psi_values.items() if psi > stability_threshold]
+ if unstable_by_value:
+ self.logger.info(f"Unstable by value features: {sorted(unstable_by_value)}")
+
+ self.psi_values = {
+ feature: psi_value for feature, psi_value in psi_values.items() if psi_value <= stability_threshold
+ }
+
+ total_unstable_features = sorted(set(unstable_by_sparsity + unstable_by_value))
+
+ return total_unstable_features
+
+ def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: dict[str, float], silent: bool = False):
  renaming = self.fit_columns_renaming or {}
  self.logger.info(f"Updating SHAP values: {new_shaps}")
  new_shaps = {
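The stability check added in the hunk above relies on the Population Stability Index: `calculate_features_psi` (imported from the new `upgini.utils.psi` module) computes a PSI per feature over the date column of the chosen OOT eval set, and features whose aggregated PSI exceeds `stability_threshold` are flagged as unstable. A minimal, self-contained illustration of the PSI formula itself — a sketch of the general idea, not the `upgini.utils.psi` implementation:

```python
import numpy as np

def psi(expected: np.ndarray, actual: np.ndarray, bins: int = 10) -> float:
    """Population Stability Index of one numeric feature between two samples."""
    # Bin edges come from the earlier ("expected") sample; both samples are
    # clipped into that range so outliers fall into the edge bins.
    edges = np.quantile(expected, np.linspace(0, 1, bins + 1))
    expected_pct = np.histogram(np.clip(expected, edges[0], edges[-1]), edges)[0] / len(expected)
    actual_pct = np.histogram(np.clip(actual, edges[0], edges[-1]), edges)[0] / len(actual)
    # A small floor avoids log(0) for empty bins.
    expected_pct = np.clip(expected_pct, 1e-6, None)
    actual_pct = np.clip(actual_pct, 1e-6, None)
    return float(np.sum((actual_pct - expected_pct) * np.log(actual_pct / expected_pct)))

rng = np.random.default_rng(0)
train_period = rng.normal(0.0, 1.0, 5_000)
stable_oot = rng.normal(0.0, 1.0, 5_000)    # same distribution -> PSI near 0
drifted_oot = rng.normal(1.0, 1.0, 5_000)   # shifted distribution -> PSI well above 0.2

print(psi(train_period, stable_oot), psi(train_period, drifted_oot))
```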
@@ -1347,7 +1506,7 @@ class FeaturesEnricher(TransformerMixin):
1347
1506
  display_html_dataframe(
1348
1507
  self.features_info,
1349
1508
  self._features_info_without_links,
1350
- self.bundle.get("relevant_features_header"),
1509
+ self.bundle.get("relevant_features_header").format(len(self.feature_names_)),
1351
1510
  display_handle=self.features_info_display_handle,
1352
1511
  )
1353
1512
  except (ImportError, NameError):
@@ -1398,13 +1557,13 @@ class FeaturesEnricher(TransformerMixin):
1398
1557
  self.logger.warning(msg)
1399
1558
 
1400
1559
  def _has_features_with_commercial_schema(
1401
- self, commercial_schema: str, exclude_features_sources: Optional[List[str]]
1560
+ self, commercial_schema: str, exclude_features_sources: Optional[list[str]]
1402
1561
  ) -> bool:
1403
1562
  return len(self._get_features_with_commercial_schema(commercial_schema, exclude_features_sources)) > 0
1404
1563
 
1405
1564
  def _get_features_with_commercial_schema(
1406
- self, commercial_schema: str, exclude_features_sources: Optional[List[str]]
1407
- ) -> List[str]:
1565
+ self, commercial_schema: str, exclude_features_sources: Optional[list[str]]
1566
+ ) -> list[str]:
1408
1567
  if exclude_features_sources:
1409
1568
  filtered_features_info = self._internal_features_info[
1410
1569
  ~self._internal_features_info[self.bundle.get("features_info_name")].isin(exclude_features_sources)
@@ -1418,19 +1577,19 @@ class FeaturesEnricher(TransformerMixin):
1418
1577
  ].values
1419
1578
  )
1420
1579
 
1421
- def _has_paid_features(self, exclude_features_sources: Optional[List[str]]) -> bool:
1580
+ def _has_paid_features(self, exclude_features_sources: Optional[list[str]]) -> bool:
1422
1581
  return self._has_features_with_commercial_schema(CommercialSchema.PAID.value, exclude_features_sources)
1423
1582
 
1424
1583
  def _is_input_same_as_fit(
1425
1584
  self,
1426
1585
  X: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
1427
- y: Union[pd.DataFrame, pd.Series, np.ndarray, List, None] = None,
1428
- eval_set: Optional[List[tuple]] = None,
1429
- ) -> Tuple:
1586
+ y: Union[pd.DataFrame, pd.Series, np.ndarray, list, None] = None,
1587
+ eval_set: Optional[list[tuple]] = None,
1588
+ ) -> tuple:
1430
1589
  if X is None:
1431
1590
  return True, self.X, self.y, self.eval_set
1432
1591
 
1433
- checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
1592
+ checked_eval_set = self._check_eval_set(eval_set, X)
1434
1593
 
1435
1594
  if (
1436
1595
  X is self.X
@@ -1457,8 +1616,8 @@ class FeaturesEnricher(TransformerMixin):
1457
1616
  self,
1458
1617
  X: pd.DataFrame,
1459
1618
  cv_override: Union[BaseCrossValidator, CVType, str, None],
1460
- search_keys: Dict[str, SearchKey],
1461
- ) -> Tuple[BaseCrossValidator, Optional[np.ndarray]]:
1619
+ search_keys: dict[str, SearchKey],
1620
+ ) -> tuple[BaseCrossValidator, Optional[np.ndarray]]:
1462
1621
  _cv = cv_override or self.cv
1463
1622
  group_columns = sorted(self._get_group_columns(X, search_keys))
1464
1623
  groups = None
@@ -1486,9 +1645,9 @@ class FeaturesEnricher(TransformerMixin):
1486
1645
  return _cv, groups
1487
1646
 
1488
1647
  def _get_and_validate_client_cat_features(
1489
- self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
1490
- ) -> Tuple[Optional[List[str]], List[str]]:
1491
- cat_features = None
1648
+ self, estimator: Optional[Any], X: pd.DataFrame, search_keys: dict[str, SearchKey]
1649
+ ) -> tuple[Optional[list[str]], list[str]]:
1650
+ cat_features = []
1492
1651
  search_keys_for_metrics = []
1493
1652
  if (
1494
1653
  estimator is not None
@@ -1516,41 +1675,41 @@ class FeaturesEnricher(TransformerMixin):
1516
1675
  raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))
1517
1676
  return cat_features, search_keys_for_metrics
1518
1677
 
1519
- def _prepare_data_for_metrics(
1678
+ def _get_cached_enriched_data(
1520
1679
  self,
1521
1680
  trace_id: str,
1522
1681
  X: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
1523
- y: Union[pd.DataFrame, pd.Series, np.ndarray, List, None] = None,
1524
- eval_set: Optional[Union[List[tuple], tuple]] = None,
1525
- exclude_features_sources: Optional[List[str]] = None,
1526
- importance_threshold: Optional[float] = None,
1527
- max_features: Optional[int] = None,
1682
+ y: Union[pd.DataFrame, pd.Series, np.ndarray, list, None] = None,
1683
+ eval_set: Optional[Union[list[tuple], tuple]] = None,
1684
+ exclude_features_sources: Optional[list[str]] = None,
1528
1685
  remove_outliers_calc_metrics: Optional[bool] = None,
1529
1686
  cv_override: Union[BaseCrossValidator, CVType, str, None] = None,
1530
- search_keys_for_metrics: Optional[List[str]] = None,
1687
+ search_keys_for_metrics: Optional[list[str]] = None,
1531
1688
  progress_bar: Optional[ProgressBar] = None,
1532
1689
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
1533
- client_cat_features: Optional[List[str]] = None,
1690
+ client_cat_features: Optional[list[str]] = None,
1691
+ is_for_metrics: bool = False,
1534
1692
  ):
1535
1693
  is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
1536
1694
  is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
1537
- checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
1538
- validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set)
1695
+ checked_eval_set = self._check_eval_set(eval_set, X)
1696
+ validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set, silent=True)
1539
1697
 
1540
- sampled_data = self._get_enriched_for_metrics(
1541
- trace_id,
1542
- validated_X,
1543
- validated_y,
1544
- validated_eval_set,
1545
- exclude_features_sources,
1546
- is_input_same_as_fit,
1547
- is_demo_dataset,
1548
- remove_outliers_calc_metrics,
1549
- progress_bar,
1550
- progress_callback,
1698
+ sampled_data = self._get_enriched_datasets(
1699
+ trace_id=trace_id,
1700
+ validated_X=validated_X,
1701
+ validated_y=validated_y,
1702
+ eval_set=validated_eval_set,
1703
+ exclude_features_sources=exclude_features_sources,
1704
+ is_input_same_as_fit=is_input_same_as_fit,
1705
+ is_demo_dataset=is_demo_dataset,
1706
+ remove_outliers_calc_metrics=remove_outliers_calc_metrics,
1707
+ progress_bar=progress_bar,
1708
+ progress_callback=progress_callback,
1709
+ is_for_metrics=is_for_metrics,
1551
1710
  )
1552
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = dataclasses.astuple(
1553
- sampled_data
1711
+ (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming, generated_features) = (
1712
+ dataclasses.astuple(sampled_data)
1554
1713
  )
1555
1714
 
1556
1715
  excluding_search_keys = list(search_keys.keys())
@@ -1566,14 +1725,9 @@ class FeaturesEnricher(TransformerMixin):
1566
1725
 
1567
1726
  client_features = [
1568
1727
  c
1569
- for c in X_sampled.columns.to_list()
1570
- if (
1571
- not self.fit_select_features
1572
- or c in set(self.feature_names_).union(self.id_columns or [])
1573
- or (self.fit_columns_renaming or {}).get(c, c) in set(self.feature_names_).union(self.id_columns or [])
1574
- )
1575
- and c
1576
- not in (
1728
+ for c in (validated_X.columns.to_list() + generated_features)
1729
+ if (not self.fit_select_features or c in set(self.feature_names_).union(self.id_columns or []))
1730
+ and c not in (
1577
1731
  excluding_search_keys
1578
1732
  + list(self.fit_dropped_features)
1579
1733
  + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
@@ -1581,20 +1735,17 @@ class FeaturesEnricher(TransformerMixin):
1581
1735
  ]
1582
1736
  self.logger.info(f"Client features column on prepare data for metrics: {client_features}")
1583
1737
 
1584
- filtered_enriched_features = self.__filtered_enriched_features(
1585
- importance_threshold, max_features, trace_id, validated_X
1586
- )
1587
- filtered_enriched_features = [c for c in filtered_enriched_features if c not in client_features]
1738
+ selected_enriched_features = [c for c in self.feature_names_ if c not in client_features]
1588
1739
 
1589
1740
  X_sorted, y_sorted = self._sort_by_system_record_id(X_sampled, y_sampled, self.cv)
1590
1741
  enriched_X_sorted, enriched_y_sorted = self._sort_by_system_record_id(enriched_X, y_sampled, self.cv)
1591
1742
 
1592
1743
  cv, groups = self._get_cv_and_groups(enriched_X_sorted, cv_override, search_keys)
1593
1744
 
1594
- existing_filtered_enriched_features = [c for c in filtered_enriched_features if c in enriched_X_sorted.columns]
1745
+ existing_selected_enriched_features = [c for c in selected_enriched_features if c in enriched_X_sorted.columns]
1595
1746
 
1596
1747
  fitting_X = X_sorted[client_features].copy()
1597
- fitting_enriched_X = enriched_X_sorted[client_features + existing_filtered_enriched_features].copy()
1748
+ fitting_enriched_X = enriched_X_sorted[client_features + existing_selected_enriched_features].copy()
1598
1749
 
1599
1750
  renamed_generate_features = [columns_renaming.get(c, c) for c in (self.generate_features or [])]
1600
1751
  renamed_client_cat_features = [columns_renaming.get(c, c) for c in (client_cat_features or [])]
@@ -1658,7 +1809,7 @@ class FeaturesEnricher(TransformerMixin):
1658
1809
  fitting_X, y_sorted, search_keys, self.model_task_type, sort_all_columns=True, logger=self.logger
1659
1810
  )
1660
1811
  fitting_X = fitting_X[fitting_x_columns]
1661
- fitting_X, _ = self._encode_id_columns(fitting_X, self.fit_columns_renaming)
1812
+ fitting_X, _ = self._encode_id_columns(fitting_X)
1662
1813
  self.logger.info(f"Final sorted list of fitting X columns: {fitting_x_columns}")
1663
1814
  fitting_enriched_x_columns = fitting_enriched_X.columns.to_list()
1664
1815
  fitting_enriched_x_columns = sort_columns(
@@ -1670,14 +1821,18 @@ class FeaturesEnricher(TransformerMixin):
1670
1821
  logger=self.logger,
1671
1822
  )
1672
1823
  fitting_enriched_X = fitting_enriched_X[fitting_enriched_x_columns]
1673
- fitting_enriched_X, _ = self._encode_id_columns(fitting_enriched_X, self.fit_columns_renaming)
1824
+ fitting_enriched_X, _ = self._encode_id_columns(fitting_enriched_X)
1674
1825
  self.logger.info(f"Final sorted list of fitting enriched X columns: {fitting_enriched_x_columns}")
1826
+ date_column = self._get_date_column(search_keys)
1827
+ eval_set_dates = {}
1675
1828
  for idx, eval_tuple in eval_set_sampled_dict.items():
1676
1829
  eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
1677
1830
  eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
1678
1831
  enriched_eval_X_sorted, enriched_eval_y_sorted = self._sort_by_system_record_id(
1679
1832
  enriched_eval_X, eval_y_sampled, self.cv
1680
1833
  )
1834
+ if date_column is not None:
1835
+ eval_set_dates[idx] = eval_X_sorted[date_column]
1681
1836
  fitting_eval_X = eval_X_sorted[fitting_x_columns].copy()
1682
1837
  fitting_enriched_eval_X = enriched_eval_X_sorted[fitting_enriched_x_columns].copy()
1683
1838
 
@@ -1698,8 +1853,8 @@ class FeaturesEnricher(TransformerMixin):
1698
1853
  .astype(np.float64)
1699
1854
  )
1700
1855
 
1701
- fitting_eval_X, unknown_dict = self._encode_id_columns(fitting_eval_X, self.fit_columns_renaming)
1702
- fitting_enriched_eval_X, _ = self._encode_id_columns(fitting_enriched_eval_X, self.fit_columns_renaming)
1856
+ fitting_eval_X, unknown_dict = self._encode_id_columns(fitting_eval_X)
1857
+ fitting_enriched_eval_X, _ = self._encode_id_columns(fitting_enriched_eval_X)
1703
1858
 
1704
1859
  if len(unknown_dict) > 0:
1705
1860
  print(self.bundle.get("unknown_id_column_value_in_eval_set").format(unknown_dict))
@@ -1722,6 +1877,7 @@ class FeaturesEnricher(TransformerMixin):
1722
1877
  groups,
1723
1878
  cv,
1724
1879
  columns_renaming,
1880
+ eval_set_dates,
1725
1881
  )
1726
1882
 
1727
1883
  @dataclass
@@ -1729,29 +1885,31 @@ class FeaturesEnricher(TransformerMixin):
1729
1885
  X_sampled: pd.DataFrame
1730
1886
  y_sampled: pd.Series
1731
1887
  enriched_X: pd.DataFrame
1732
- eval_set_sampled_dict: Dict[int, Tuple[pd.DataFrame, pd.Series]]
1733
- search_keys: Dict[str, SearchKey]
1734
- columns_renaming: Dict[str, str]
1888
+ eval_set_sampled_dict: dict[int, tuple[pd.DataFrame, pd.Series]]
1889
+ search_keys: dict[str, SearchKey]
1890
+ columns_renaming: dict[str, str]
1891
+ generated_features: list[str]
1735
1892
 
1736
- def _get_enriched_for_metrics(
1893
+ def _get_enriched_datasets(
1737
1894
  self,
1738
1895
  trace_id: str,
1739
1896
  validated_X: Union[pd.DataFrame, pd.Series, np.ndarray, None],
1740
- validated_y: Union[pd.DataFrame, pd.Series, np.ndarray, List, None],
1741
- eval_set: Optional[List[tuple]],
1742
- exclude_features_sources: Optional[List[str]],
1897
+ validated_y: Union[pd.DataFrame, pd.Series, np.ndarray, list, None],
1898
+ eval_set: Optional[list[tuple]],
1899
+ exclude_features_sources: Optional[list[str]],
1743
1900
  is_input_same_as_fit: bool,
1744
1901
  is_demo_dataset: bool,
1745
1902
  remove_outliers_calc_metrics: Optional[bool],
1746
1903
  progress_bar: Optional[ProgressBar],
1747
1904
  progress_callback: Optional[Callable[[SearchProgress], Any]],
1905
+ is_for_metrics: bool = False,
1748
1906
  ) -> _EnrichedDataForMetrics:
1749
1907
  datasets_hash = hash_input(validated_X, validated_y, eval_set)
1750
1908
  cached_sampled_datasets = self.__cached_sampled_datasets.get(datasets_hash)
1751
1909
  if cached_sampled_datasets is not None and is_input_same_as_fit and remove_outliers_calc_metrics is None:
1752
1910
  self.logger.info("Cached enriched dataset found - use it")
1753
1911
  return self.__get_sampled_cached_enriched(datasets_hash, exclude_features_sources)
1754
- elif len(self.feature_importances_) == 0:
1912
+ elif len(self.feature_names_) == 0 or all([f in validated_X.columns for f in self.feature_names_]):
1755
1913
  self.logger.info("No external features selected. So use only input datasets for metrics calculation")
1756
1914
  return self.__get_enriched_as_input(validated_X, validated_y, eval_set, is_demo_dataset)
1757
1915
  # TODO save and check if dataset was deduplicated - use imbalance branch for such case
@@ -1777,12 +1935,13 @@ class FeaturesEnricher(TransformerMixin):
1777
1935
  trace_id,
1778
1936
  progress_bar,
1779
1937
  progress_callback,
1938
+ is_for_metrics=is_for_metrics,
1780
1939
  )
1781
1940
 
1782
1941
  def __get_sampled_cached_enriched(
1783
- self, datasets_hash: str, exclude_features_sources: Optional[List[str]]
1942
+ self, datasets_hash: str, exclude_features_sources: Optional[list[str]]
1784
1943
  ) -> _EnrichedDataForMetrics:
1785
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming = (
1944
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming, generated_features = (
1786
1945
  self.__cached_sampled_datasets[datasets_hash]
1787
1946
  )
1788
1947
  if exclude_features_sources:
@@ -1796,10 +1955,11 @@ class FeaturesEnricher(TransformerMixin):
1796
1955
  eval_set_sampled_dict,
1797
1956
  columns_renaming,
1798
1957
  search_keys,
1958
+ generated_features,
1799
1959
  )
1800
1960
 
1801
1961
  def __get_enriched_as_input(
1802
- self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
1962
+ self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[list[tuple]], is_demo_dataset: bool
1803
1963
  ) -> _EnrichedDataForMetrics:
1804
1964
  eval_set_sampled_dict = {}
1805
1965
 
@@ -1844,7 +2004,18 @@ class FeaturesEnricher(TransformerMixin):
1844
2004
  columns_renaming = normalizer.columns_renaming
1845
2005
 
1846
2006
  df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
1847
- df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID, TARGET, columns_renaming, silent=True)
2007
+ df = self._add_fit_system_record_id(
2008
+ df,
2009
+ search_keys,
2010
+ SYSTEM_RECORD_ID,
2011
+ TARGET,
2012
+ columns_renaming,
2013
+ self.id_columns,
2014
+ self.cv,
2015
+ self.model_task_type,
2016
+ self.logger,
2017
+ self.bundle,
2018
+ )
1848
2019
 
1849
2020
  # Sample after sorting by system_record_id for idempotency
1850
2021
  df.sort_values(by=SYSTEM_RECORD_ID, inplace=True)
@@ -1853,6 +2024,10 @@ class FeaturesEnricher(TransformerMixin):
1853
2024
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
1854
2025
  df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
1855
2026
 
2027
+ df = df.rename(columns=columns_renaming)
2028
+ generated_features = [columns_renaming.get(c, c) for c in generated_features]
2029
+ search_keys = {columns_renaming.get(k, k): v for k, v in search_keys.items()}
2030
+
1856
2031
  train_df = df.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df
1857
2032
  X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
1858
2033
  y_sampled = train_df[TARGET].copy()
@@ -1875,23 +2050,26 @@ class FeaturesEnricher(TransformerMixin):
1875
2050
  eval_set_sampled_dict,
1876
2051
  columns_renaming,
1877
2052
  search_keys,
2053
+ generated_features,
1878
2054
  )
1879
2055
 
1880
2056
  def __get_enriched_from_fit(
1881
2057
  self,
1882
- eval_set: Optional[List[tuple]],
2058
+ eval_set: Optional[list[tuple]],
1883
2059
  trace_id: str,
1884
2060
  remove_outliers_calc_metrics: Optional[bool],
1885
2061
  ) -> _EnrichedDataForMetrics:
1886
2062
  eval_set_sampled_dict = {}
1887
- search_keys = self.fit_search_keys
2063
+ search_keys = self.fit_search_keys.copy()
1888
2064
 
1889
2065
  rows_to_drop = None
1890
2066
  has_date = self._get_date_column(search_keys) is not None
1891
2067
  self.model_task_type = self.model_task_type or define_task(
1892
2068
  self.df_with_original_index[TARGET], has_date, self.logger, silent=True
1893
2069
  )
1894
- if self.model_task_type == ModelTaskType.REGRESSION:
2070
+ if remove_outliers_calc_metrics is None:
2071
+ remove_outliers_calc_metrics = True
2072
+ if self.model_task_type == ModelTaskType.REGRESSION and remove_outliers_calc_metrics:
1895
2073
  target_outliers_df = self._search_task.get_target_outliers(trace_id)
1896
2074
  if target_outliers_df is not None and len(target_outliers_df) > 0:
1897
2075
  outliers = pd.merge(
@@ -1901,11 +2079,8 @@ class FeaturesEnricher(TransformerMixin):
1901
2079
  how="inner",
1902
2080
  )
1903
2081
  top_outliers = outliers.sort_values(by=TARGET, ascending=False)[TARGET].head(3)
1904
- if remove_outliers_calc_metrics is None or remove_outliers_calc_metrics is True:
1905
- rows_to_drop = outliers
1906
- not_msg = ""
1907
- else:
1908
- not_msg = "not "
2082
+ rows_to_drop = outliers
2083
+ not_msg = ""
1909
2084
  msg = self.bundle.get("target_outliers_warning").format(len(target_outliers_df), top_outliers, not_msg)
1910
2085
  print(msg)
1911
2086
  self.logger.warning(msg)
@@ -1963,12 +2138,14 @@ class FeaturesEnricher(TransformerMixin):
1963
2138
  enriched_eval_X = enriched_eval_sets[idx + 1][enriched_X_columns].copy()
1964
2139
  eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
1965
2140
 
1966
- reversed_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
1967
- X_sampled.rename(columns=reversed_renaming, inplace=True)
1968
- enriched_X.rename(columns=reversed_renaming, inplace=True)
2141
+ # reversed_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
2142
+ X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
2143
+ enriched_X.rename(columns=self.fit_columns_renaming, inplace=True)
1969
2144
  for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
1970
- eval_X_sampled.rename(columns=reversed_renaming, inplace=True)
1971
- enriched_eval_X.rename(columns=reversed_renaming, inplace=True)
2145
+ eval_X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
2146
+ enriched_eval_X.rename(columns=self.fit_columns_renaming, inplace=True)
2147
+ search_keys = {self.fit_columns_renaming.get(k, k): v for k, v in search_keys.items()}
2148
+ generated_features = [self.fit_columns_renaming.get(c, c) for c in self.fit_generated_features]
1972
2149
 
1973
2150
  datasets_hash = hash_input(self.X, self.y, self.eval_set)
1974
2151
  return self.__cache_and_return_results(
@@ -1979,17 +2156,19 @@ class FeaturesEnricher(TransformerMixin):
1979
2156
  eval_set_sampled_dict,
1980
2157
  self.fit_columns_renaming,
1981
2158
  search_keys,
2159
+ generated_features,
1982
2160
  )
1983
2161
 
1984
2162
  def __get_enriched_from_transform(
1985
2163
  self,
1986
2164
  validated_X: pd.DataFrame,
1987
2165
  validated_y: pd.Series,
1988
- eval_set: Optional[List[tuple]],
1989
- exclude_features_sources: Optional[List[str]],
2166
+ eval_set: Optional[list[tuple]],
2167
+ exclude_features_sources: Optional[list[str]],
1990
2168
  trace_id: str,
1991
2169
  progress_bar: Optional[ProgressBar],
1992
2170
  progress_callback: Optional[Callable[[SearchProgress], Any]],
2171
+ is_for_metrics: bool = False,
1993
2172
  ) -> _EnrichedDataForMetrics:
1994
2173
  has_eval_set = eval_set is not None
1995
2174
 
@@ -1997,6 +2176,16 @@ class FeaturesEnricher(TransformerMixin):
1997
2176
 
1998
2177
  # Prepare
1999
2178
  df = self.__combine_train_and_eval_sets(validated_X, validated_y, eval_set)
2179
+
2180
+ # Exclude OOT eval sets from transform because they are not used for metrics calculation
2181
+ if not is_for_metrics and EVAL_SET_INDEX in df.columns:
2182
+ for eval_index in df[EVAL_SET_INDEX].unique():
2183
+ if eval_index == 0:
2184
+ continue
2185
+ eval_df = df.query(f"{EVAL_SET_INDEX} == {eval_index}")
2186
+ if eval_df[TARGET].isna().all():
2187
+ df = df.query(f"{EVAL_SET_INDEX} != {eval_index}")
2188
+
2000
2189
  df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
2001
2190
  df = self.__downsample_for_metrics(df)
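
The OOT-exclusion block added above drops eval folds whose target is entirely missing (out-of-time sets), because they are not used for metrics calculation; the remaining frame is then deduplicated and downsampled. A minimal sketch of that filtering idea, with stand-in column names rather than the package constants EVAL_SET_INDEX and TARGET:

    import pandas as pd

    def drop_oot_eval_sets(df: pd.DataFrame, eval_col: str = "eval_set_index", target_col: str = "target") -> pd.DataFrame:
        # Keep the train fold (index 0) and every eval fold with at least one labelled row;
        # folds whose target is entirely NaN are treated as out-of-time sets and dropped.
        keep = [0]
        for idx in df[eval_col].unique():
            if idx != 0 and not df.loc[df[eval_col] == idx, target_col].isna().all():
                keep.append(idx)
        return df[df[eval_col].isin(keep)]
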
2002
2191
 
@@ -2026,13 +2215,7 @@ class FeaturesEnricher(TransformerMixin):
2026
2215
  enriched_df, x_columns, enriched_X.columns.tolist(), len(eval_set) if has_eval_set else 0
2027
2216
  )
2028
2217
 
2029
- # Add hash-suffixes because output of transform has original names
2030
- reversed_renaming = {v: k for k, v in columns_renaming.items()}
2031
- X_sampled.rename(columns=reversed_renaming, inplace=True)
2032
- enriched_X.rename(columns=reversed_renaming, inplace=True)
2033
- for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
2034
- eval_X_sampled.rename(columns=reversed_renaming, inplace=True)
2035
- enriched_eval_X.rename(columns=reversed_renaming, inplace=True)
2218
+ search_keys = {columns_renaming.get(k, k): v for k, v in search_keys.items()}
2036
2219
 
2037
2220
  # Cache and return results
2038
2221
  datasets_hash = hash_input(validated_X, validated_y, eval_set)
@@ -2044,10 +2227,11 @@ class FeaturesEnricher(TransformerMixin):
2044
2227
  eval_set_sampled_dict,
2045
2228
  columns_renaming,
2046
2229
  search_keys,
2230
+ generated_features,
2047
2231
  )
2048
2232
 
2049
2233
  def __combine_train_and_eval_sets(
2050
- self, X: pd.DataFrame, y: Optional[pd.Series] = None, eval_set: Optional[List[tuple]] = None
2234
+ self, X: pd.DataFrame, y: Optional[pd.Series] = None, eval_set: Optional[list[tuple]] = None
2051
2235
  ) -> pd.DataFrame:
2052
2236
  df = X.copy()
2053
2237
  if y is not None:
@@ -2099,8 +2283,8 @@ class FeaturesEnricher(TransformerMixin):
2099
2283
  )
2100
2284
 
2101
2285
  def __extract_train_data(
2102
- self, enriched_df: pd.DataFrame, x_columns: List[str]
2103
- ) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
2286
+ self, enriched_df: pd.DataFrame, x_columns: list[str]
2287
+ ) -> tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
2104
2288
  if EVAL_SET_INDEX in enriched_df.columns:
2105
2289
  enriched_Xy = enriched_df.query(f"{EVAL_SET_INDEX} == 0")
2106
2290
  else:
@@ -2111,8 +2295,8 @@ class FeaturesEnricher(TransformerMixin):
2111
2295
  return X_sampled, y_sampled, enriched_X
2112
2296
 
2113
2297
  def __extract_eval_data(
2114
- self, enriched_df: pd.DataFrame, x_columns: List[str], enriched_X_columns: List[str], eval_set_len: int
2115
- ) -> Dict[int, Tuple]:
2298
+ self, enriched_df: pd.DataFrame, x_columns: list[str], enriched_X_columns: list[str], eval_set_len: int
2299
+ ) -> tuple[dict[int, tuple], dict[int, pd.Series]]:
2116
2300
  eval_set_sampled_dict = {}
2117
2301
 
2118
2302
  for idx in range(eval_set_len):
@@ -2130,9 +2314,10 @@ class FeaturesEnricher(TransformerMixin):
2130
2314
  X_sampled: pd.DataFrame,
2131
2315
  y_sampled: pd.Series,
2132
2316
  enriched_X: pd.DataFrame,
2133
- eval_set_sampled_dict: Dict[int, Tuple],
2134
- columns_renaming: Dict[str, str],
2135
- search_keys: Dict[str, SearchKey],
2317
+ eval_set_sampled_dict: dict[int, tuple],
2318
+ columns_renaming: dict[str, str],
2319
+ search_keys: dict[str, SearchKey],
2320
+ generated_features: list[str],
2136
2321
  ) -> _EnrichedDataForMetrics:
2137
2322
 
2138
2323
  self.__cached_sampled_datasets[datasets_hash] = (
@@ -2142,10 +2327,11 @@ class FeaturesEnricher(TransformerMixin):
2142
2327
  eval_set_sampled_dict,
2143
2328
  search_keys,
2144
2329
  columns_renaming,
2330
+ generated_features,
2145
2331
  )
2146
2332
 
2147
2333
  return self.__mk_sampled_data_tuple(
2148
- X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming
2334
+ X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming, generated_features
2149
2335
  )
2150
2336
 
2151
2337
  def __mk_sampled_data_tuple(
@@ -2153,17 +2339,11 @@ class FeaturesEnricher(TransformerMixin):
2153
2339
  X_sampled: pd.DataFrame,
2154
2340
  y_sampled: pd.Series,
2155
2341
  enriched_X: pd.DataFrame,
2156
- eval_set_sampled_dict: Dict,
2157
- search_keys: Dict,
2158
- columns_renaming: Dict[str, str],
2342
+ eval_set_sampled_dict: dict,
2343
+ search_keys: dict,
2344
+ columns_renaming: dict[str, str],
2345
+ generated_features: list[str],
2159
2346
  ):
2160
- # X_sampled - with hash-suffixes
2161
- reversed_renaming = {v: k for k, v in columns_renaming.items()}
2162
- search_keys = {
2163
- reversed_renaming.get(k, k): v
2164
- for k, v in search_keys.items()
2165
- if reversed_renaming.get(k, k) in X_sampled.columns.to_list()
2166
- }
2167
2347
  return FeaturesEnricher._EnrichedDataForMetrics(
2168
2348
  X_sampled=X_sampled,
2169
2349
  y_sampled=y_sampled,
@@ -2171,6 +2351,7 @@ class FeaturesEnricher(TransformerMixin):
2171
2351
  eval_set_sampled_dict=eval_set_sampled_dict,
2172
2352
  search_keys=search_keys,
2173
2353
  columns_renaming=columns_renaming,
2354
+ generated_features=generated_features,
2174
2355
  )
2175
2356
 
2176
2357
  def get_search_id(self) -> Optional[str]:
@@ -2295,15 +2476,13 @@ if response.status_code == 200:
2295
2476
  X: pd.DataFrame,
2296
2477
  *,
2297
2478
  y: Optional[pd.Series] = None,
2298
- exclude_features_sources: Optional[List[str]] = None,
2299
- importance_threshold: Optional[float] = None,
2300
- max_features: Optional[int] = None,
2479
+ exclude_features_sources: Optional[list[str]] = None,
2301
2480
  metrics_calculation: bool = False,
2302
2481
  silent_mode: bool = False,
2303
2482
  progress_bar: Optional[ProgressBar] = None,
2304
2483
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
2305
2484
  add_fit_system_record_id: bool = False,
2306
- ) -> Tuple[pd.DataFrame, Dict[str, str], List[str], Dict[str, SearchKey]]:
2485
+ ) -> tuple[pd.DataFrame, dict[str, str], list[str], dict[str, SearchKey]]:
2307
2486
  if self._search_task is None:
2308
2487
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
2309
2488
 
@@ -2313,7 +2492,7 @@ if response.status_code == 200:
2313
2492
  self.logger.info("Start transform")
2314
2493
 
2315
2494
  validated_X, validated_y, validated_eval_set = self._validate_train_eval(
2316
- X, y, eval_set=None, is_transform=True
2495
+ X, y, eval_set=None, is_transform=True, silent=True
2317
2496
  )
2318
2497
  df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
2319
2498
 
@@ -2321,11 +2500,8 @@ if response.status_code == 200:
2321
2500
 
2322
2501
  self.__log_debug_information(validated_X, validated_y, exclude_features_sources=exclude_features_sources)
2323
2502
 
2324
- filtered_columns = self.__filtered_enriched_features(
2325
- importance_threshold, max_features, trace_id, validated_X
2326
- )
2327
2503
  # If there are no important features, return original dataframe
2328
- if not filtered_columns:
2504
+ if len(self.feature_names_) == 0:
2329
2505
  msg = self.bundle.get("no_important_features_for_transform")
2330
2506
  self.__log_warning(msg, show_support_link=True)
2331
2507
  return X, {c: c for c in X.columns}, [], dict()
@@ -2415,7 +2591,7 @@ if response.status_code == 200:
2415
2591
  else:
2416
2592
  self.logger.info("Input dataset hasn't date column")
2417
2593
  if self.__should_add_date_column():
2418
- df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
2594
+ df = self._add_current_date_as_key(df, search_keys, self.bundle, silent=True)
2419
2595
 
2420
2596
  email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
2421
2597
  if email_columns and self.generate_search_key_features:
@@ -2432,9 +2608,9 @@ if response.status_code == 200:
2432
2608
  if not external_features:
2433
2609
  self.logger.warning(
2434
2610
  "No external features found, returning original dataframe"
2435
- f" with generated important features: {filtered_columns}"
2611
+ f" with generated important features: {self.feature_names_}"
2436
2612
  )
2437
- filtered_columns = [c for c in filtered_columns if c in df.columns]
2613
+ filtered_columns = [c for c in self.feature_names_ if c in df.columns]
2438
2614
  self.logger.warning(f"Filtered columns by existance in dataframe: {filtered_columns}")
2439
2615
  return df[filtered_columns], columns_renaming, generated_features, search_keys
2440
2616
 
@@ -2462,13 +2638,17 @@ if response.status_code == 200:
2462
2638
 
2463
2639
  features_not_to_pass = []
2464
2640
  if add_fit_system_record_id:
2465
- df = self.__add_fit_system_record_id(
2641
+ df = self._add_fit_system_record_id(
2466
2642
  df,
2467
2643
  search_keys,
2468
2644
  SYSTEM_RECORD_ID,
2469
2645
  TARGET,
2470
2646
  columns_renaming,
2471
- silent=True,
2647
+ self.id_columns,
2648
+ self.cv,
2649
+ self.model_task_type,
2650
+ self.logger,
2651
+ self.bundle,
2472
2652
  )
2473
2653
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
2474
2654
  features_not_to_pass.append(SORT_ID)
@@ -2568,16 +2748,6 @@ if response.status_code == 200:
2568
2748
  )
2569
2749
  dataset.columns_renaming = columns_renaming
2570
2750
 
2571
- if max_features is not None or importance_threshold is not None:
2572
- exclude_features_sources = list(
2573
- set(
2574
- (exclude_features_sources or [])
2575
- + self._get_excluded_features(max_features, importance_threshold)
2576
- )
2577
- )
2578
- if len(exclude_features_sources) == 0:
2579
- exclude_features_sources = None
2580
-
2581
2751
  validation_task = self._search_task.validation(
2582
2752
  trace_id,
2583
2753
  dataset,
@@ -2642,6 +2812,8 @@ if response.status_code == 200:
2642
2812
  print(self.bundle.get("transform_start"))
2643
2813
 
2644
2814
  # Prepare input DataFrame for __enrich by concatenating generated ids and client features
2815
+ df_before_explode = df_before_explode.rename(columns=columns_renaming)
2816
+ generated_features = [columns_renaming.get(c, c) for c in generated_features]
2645
2817
  combined_df = pd.concat(
2646
2818
  [
2647
2819
  validated_Xy.reset_index(drop=True),
@@ -2659,14 +2831,21 @@ if response.status_code == 200:
2659
2831
  )
2660
2832
 
2661
2833
  selected_generated_features = [
2662
- c for c in generated_features if not self.fit_select_features or c in filtered_columns
2834
+ c for c in generated_features if not self.fit_select_features or c in self.feature_names_
2663
2835
  ]
2664
- selecting_columns = [
2836
+ selected_input_columns = [
2665
2837
  c
2666
- for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
2667
- if c not in self.zero_shap_client_features or c in (self.id_columns or [])
2838
+ for c in validated_Xy.columns
2839
+ if not self.fit_select_features
2840
+ or c in self.feature_names_
2841
+ or c in self.search_keys
2842
+ or c in (self.id_columns or [])
2843
+ or c in [EVAL_SET_INDEX, TARGET] # transform for metrics calculation
2668
2844
  ]
2669
- selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
2845
+ selecting_columns = selected_input_columns + selected_generated_features
2846
+ selecting_columns.extend(
2847
+ c for c in result.columns if c in self.feature_names_ and c not in selecting_columns
2848
+ )
2670
2849
  if add_fit_system_record_id:
2671
2850
  selecting_columns.append(SORT_ID)
2672
2851
 
@@ -2692,29 +2871,7 @@ if response.status_code == 200:
2692
2871
 
2693
2872
  return result, columns_renaming, generated_features, search_keys
2694
2873
 
2695
- def _get_excluded_features(self, max_features: Optional[int], importance_threshold: Optional[float]) -> List[str]:
2696
- features_info = self._internal_features_info
2697
- comm_schema_header = self.bundle.get("features_info_commercial_schema")
2698
- shap_value_header = self.bundle.get("features_info_shap")
2699
- feature_name_header = self.bundle.get("features_info_name")
2700
- external_features = features_info[features_info[comm_schema_header].str.len() > 0]
2701
- filtered_features = external_features
2702
- if importance_threshold is not None:
2703
- filtered_features = filtered_features[filtered_features[shap_value_header] >= importance_threshold]
2704
- if max_features is not None and len(filtered_features) > max_features:
2705
- filtered_features = filtered_features.iloc[:max_features, :]
2706
- if len(filtered_features) == len(external_features):
2707
- return []
2708
- else:
2709
- if len(filtered_features[filtered_features[comm_schema_header].isin([CommercialSchema.PAID.value])]):
2710
- return []
2711
- excluded_features = external_features[~external_features.index.isin(filtered_features.index)].copy()
2712
- excluded_features = excluded_features[
2713
- excluded_features[comm_schema_header].isin([CommercialSchema.PAID.value])
2714
- ]
2715
- return excluded_features[feature_name_header].values.tolist()
2716
-
2717
- def __validate_search_keys(self, search_keys: Dict[str, SearchKey], search_id: Optional[str] = None):
2874
+ def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id: Optional[str] = None):
2718
2875
  if (search_keys is None or len(search_keys) == 0) and self.country_code is None:
2719
2876
  if search_id:
2720
2877
  self.logger.debug(f"search_id {search_id} provided without search_keys")
@@ -2788,17 +2945,17 @@ if response.status_code == 200:
2788
2945
  self,
2789
2946
  trace_id: str,
2790
2947
  X: Union[pd.DataFrame, pd.Series, np.ndarray],
2791
- y: Union[pd.DataFrame, pd.Series, np.ndarray, List, None],
2792
- eval_set: Optional[List[tuple]],
2948
+ y: Union[pd.DataFrame, pd.Series, np.ndarray, list, None],
2949
+ eval_set: Optional[list[tuple]],
2793
2950
  progress_bar: Optional[ProgressBar],
2794
2951
  start_time: int,
2795
2952
  *,
2796
- exclude_features_sources: Optional[List[str]] = None,
2953
+ exclude_features_sources: Optional[list[str]] = None,
2797
2954
  calculate_metrics: Optional[bool],
2798
2955
  scoring: Union[Callable, str, None],
2799
2956
  estimator: Optional[Any],
2800
- importance_threshold: Optional[float],
2801
- max_features: Optional[int],
2957
+ stability_threshold: float,
2958
+ stability_agg_func: str,
2802
2959
  remove_outliers_calc_metrics: Optional[bool],
2803
2960
  auto_fe_parameters: AutoFEParameters,
2804
2961
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
@@ -2812,6 +2969,7 @@ if response.status_code == 200:
2812
2969
  self.fit_columns_renaming = None
2813
2970
  self.fit_dropped_features = set()
2814
2971
  self.fit_generated_features = []
2972
+ self.psi_values = None
2815
2973
 
2816
2974
  validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, eval_set)
2817
2975
 
@@ -2862,7 +3020,6 @@ if response.status_code == 200:
2862
3020
  )
2863
3021
 
2864
3022
  df = self.__combine_train_and_eval_sets(validated_X, validated_y, validated_eval_set)
2865
- self.id_columns_encoder = OrdinalEncoder().fit(df[self.id_columns or []])
2866
3023
 
2867
3024
  self.fit_search_keys = self.search_keys.copy()
2868
3025
  df = self.__handle_index_search_keys(df, self.fit_search_keys)
@@ -2870,8 +3027,22 @@ if response.status_code == 200:
2870
3027
 
2871
3028
  maybe_date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2872
3029
  has_date = maybe_date_column is not None
3030
+
2873
3031
  self.model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
2874
3032
 
3033
+ if EVAL_SET_INDEX in df.columns:
3034
+ only_train_df = df.query(f"{EVAL_SET_INDEX} == 0")
3035
+ only_train_df = only_train_df.drop(columns=[EVAL_SET_INDEX])
3036
+ else:
3037
+ only_train_df = df
3038
+
3039
+ self.imbalanced = is_imbalanced(only_train_df, self.model_task_type, self.sample_config, self.bundle)
3040
+ if self.imbalanced:
3041
+ # Exclude eval sets from fit because they will be transformed before metrics calculation
3042
+ df = only_train_df
3043
+
3044
+ self.id_columns_encoder = OrdinalEncoder().fit(df[self.id_columns or []])
3045
+
2875
3046
  self._validate_binary_observations(validated_y, self.model_task_type)
2876
3047
 
2877
3048
  self.runtime_parameters = get_runtime_params_custom_loss(
@@ -2908,7 +3079,7 @@ if response.status_code == 200:
2908
3079
  self.logger.info("Input dataset hasn't date column")
2909
3080
  # TODO remove once this logic is implemented on the backend
2910
3081
  if self.__should_add_date_column():
2911
- df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
3082
+ df = self._add_current_date_as_key(df, self.fit_search_keys, self.bundle)
2912
3083
 
2913
3084
  email_columns = SearchKey.find_all_keys(self.fit_search_keys, SearchKey.EMAIL)
2914
3085
  if email_columns and self.generate_search_key_features:
@@ -2923,10 +3094,13 @@ if response.status_code == 200:
2923
3094
  except Exception:
2924
3095
  self.logger.exception("Failed to check dates distribution validity")
2925
3096
 
3097
+ self.__adjust_cv(df)
3098
+
2926
3099
  if (
2927
3100
  is_numeric_dtype(df[self.TARGET_NAME])
2928
3101
  and self.model_task_type in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]
2929
3102
  and has_date
3103
+ and (self.cv is None or not self.cv.is_time_series())
2930
3104
  ):
2931
3105
  self._validate_PSI(df.sort_values(by=maybe_date_column))
2932
3106
 
@@ -2958,7 +3132,15 @@ if response.status_code == 200:
2958
3132
 
2959
3133
  self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
2960
3134
 
2961
- self.__adjust_cv(df)
3135
+ # Group columns should have normalized names
3136
+ if self.runtime_parameters.properties.get("cv_params.group_columns") is not None:
3137
+ original_to_hash = {v: k for k, v in self.fit_columns_renaming.items()}
3138
+ self.runtime_parameters.properties["cv_params.group_columns"] = ",".join(
3139
+ [
3140
+ original_to_hash.get(c, c)
3141
+ for c in self.runtime_parameters.properties["cv_params.group_columns"].split(",")
3142
+ ]
3143
+ )
2962
3144
 
2963
3145
  if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
2964
3146
  id_columns = self.__get_renamed_id_columns()
@@ -2984,9 +3166,27 @@ if response.status_code == 200:
2984
3166
  else:
2985
3167
  self.__log_warning(full_duplicates_warning)
2986
3168
 
3169
+ # Warn if an OOT eval set has fewer than 1000 rows left after deduplication
3170
+ if EVAL_SET_INDEX in df.columns:
3171
+ for eval_set_index in df[EVAL_SET_INDEX].unique():
3172
+ if eval_set_index == 0:
3173
+ continue
3174
+ eval_set_df = df[df[EVAL_SET_INDEX] == eval_set_index]
3175
+ if np.all(pd.isna(eval_set_df[TARGET])) and len(eval_set_df) < 1000:
3176
+ self.__log_warning(self.bundle.get("oot_eval_set_too_small_after_dedup").format(eval_set_index + 1))
3177
+
2987
3178
  # Explode multiple search keys
2988
- df = self.__add_fit_system_record_id(
2989
- df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID, TARGET, self.fit_columns_renaming
3179
+ df = self._add_fit_system_record_id(
3180
+ df,
3181
+ self.fit_search_keys,
3182
+ ENTITY_SYSTEM_RECORD_ID,
3183
+ TARGET,
3184
+ self.fit_columns_renaming,
3185
+ self.id_columns,
3186
+ self.cv,
3187
+ self.model_task_type,
3188
+ self.logger,
3189
+ self.bundle,
2990
3190
  )
2991
3191
 
2992
3192
  # TODO check that this is correct for enrichment
@@ -3020,8 +3220,17 @@ if response.status_code == 200:
3020
3220
  if eval_set is not None and len(eval_set) > 0:
3021
3221
  meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
3022
3222
 
3023
- df = self.__add_fit_system_record_id(
3024
- df, self.fit_search_keys, SYSTEM_RECORD_ID, TARGET, self.fit_columns_renaming, silent=True
3223
+ df = self._add_fit_system_record_id(
3224
+ df,
3225
+ self.fit_search_keys,
3226
+ SYSTEM_RECORD_ID,
3227
+ TARGET,
3228
+ self.fit_columns_renaming,
3229
+ self.id_columns,
3230
+ self.cv,
3231
+ self.model_task_type,
3232
+ self.logger,
3233
+ self.bundle,
3025
3234
  )
3026
3235
 
3027
3236
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
@@ -3049,6 +3258,7 @@ if response.status_code == 200:
3049
3258
  model_task_type=self.model_task_type,
3050
3259
  cv_type=self.cv,
3051
3260
  id_columns=self.__get_renamed_id_columns(),
3261
+ is_imbalanced=self.imbalanced,
3052
3262
  date_column=self._get_date_column(self.fit_search_keys),
3053
3263
  date_format=self.date_format,
3054
3264
  random_state=self.random_state,
@@ -3128,8 +3338,6 @@ if response.status_code == 200:
3128
3338
  if progress_callback is not None:
3129
3339
  progress_callback(progress)
3130
3340
 
3131
- self.imbalanced = dataset.imbalanced
3132
-
3133
3341
  zero_hit_search_keys = self._search_task.get_zero_hit_rate_search_keys()
3134
3342
  if zero_hit_search_keys:
3135
3343
  self.logger.warning(
@@ -3152,7 +3360,23 @@ if response.status_code == 200:
3152
3360
 
3153
3361
  self.__prepare_feature_importances(trace_id, df)
3154
3362
 
3155
- self.__show_selected_features(self.fit_search_keys)
3363
+ self._select_features_by_psi(
3364
+ trace_id=trace_id,
3365
+ X=X,
3366
+ y=y,
3367
+ eval_set=eval_set,
3368
+ stability_threshold=stability_threshold,
3369
+ stability_agg_func=stability_agg_func,
3370
+ cv=self.cv,
3371
+ estimator=estimator,
3372
+ exclude_features_sources=exclude_features_sources,
3373
+ progress_bar=progress_bar,
3374
+ progress_callback=progress_callback,
3375
+ )
3376
+
3377
+ self.__prepare_feature_importances(trace_id, df)
3378
+
3379
+ self.__show_selected_features()
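
The _select_features_by_psi step added above replaces the old importance_threshold/max_features filtering: after the initial fit, feature stability is measured with the Population Stability Index (PSI) between the train period and later eval periods, features that drift too much (aggregated with stability_agg_func and compared against stability_threshold) are discarded, and importances are recomputed. A minimal sketch of a binned PSI computation, assuming numeric features and quantile bins; this illustrates the metric only, not the package's calculate_features_psi:

    import numpy as np

    def psi(expected: np.ndarray, actual: np.ndarray, bins: int = 10, eps: float = 1e-6) -> float:
        # Quantile bin edges come from the reference ("expected") sample and are reused for "actual".
        edges = np.unique(np.quantile(expected, np.linspace(0, 1, bins + 1)))
        if len(edges) < 2:  # constant feature: nothing to compare
            return 0.0
        expected_pct = np.histogram(expected, bins=edges)[0] / max(len(expected), 1) + eps
        actual_pct = np.histogram(actual, bins=edges)[0] / max(len(actual), 1) + eps
        return float(np.sum((expected_pct - actual_pct) * np.log(expected_pct / actual_pct)))

PSI near 0 means the two samples are distributed alike; values above roughly 0.2 are commonly read as substantial drift.
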
3156
3380
 
3157
3381
  autofe_description = self.get_autofe_features_description()
3158
3382
  if autofe_description is not None and len(autofe_description) > 0:
@@ -3194,8 +3418,6 @@ if response.status_code == 200:
3194
3418
  self.__show_metrics(
3195
3419
  scoring,
3196
3420
  estimator,
3197
- importance_threshold,
3198
- max_features,
3199
3421
  remove_outliers_calc_metrics,
3200
3422
  trace_id,
3201
3423
  progress_bar,
@@ -3212,7 +3434,7 @@ if response.status_code == 200:
3212
3434
  if not self.warning_counter.has_warnings():
3213
3435
  self.__display_support_link(self.bundle.get("all_ok_community_invite"))
3214
3436
 
3215
- def __convert_unnestable_keys(self, df: pd.DataFrame, unnest_search_keys: Dict[str, str]):
3437
+ def __convert_unnestable_keys(self, df: pd.DataFrame, unnest_search_keys: dict[str, str]):
3216
3438
  email_column = self._get_email_column(self.fit_search_keys)
3217
3439
  hem_column = self._get_hem_column(self.fit_search_keys)
3218
3440
  if email_column:
@@ -3244,26 +3466,29 @@ if response.status_code == 200:
3244
3466
  def __should_add_date_column(self):
3245
3467
  return self.add_date_if_missing or (self.cv is not None and self.cv.is_time_series())
3246
3468
 
3247
- def __get_renamed_id_columns(self, renaming: Optional[Dict[str, str]] = None):
3469
+ def __get_renamed_id_columns(self, renaming: Optional[dict[str, str]] = None):
3248
3470
  renaming = renaming or self.fit_columns_renaming
3249
3471
  reverse_renaming = {v: k for k, v in renaming.items()}
3250
3472
  return None if self.id_columns is None else [reverse_renaming.get(c) or c for c in self.id_columns]
3251
3473
 
3252
3474
  def __adjust_cv(self, df: pd.DataFrame):
3253
- date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
3254
- # Check Multivariate time series
3255
- if (
3256
- self.cv is None
3257
- and date_column
3258
- and self.model_task_type == ModelTaskType.REGRESSION
3259
- and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys())) == 0
3260
- and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
3261
- ):
3262
- msg = self.bundle.get("multivariate_timeseries_detected")
3263
- self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
3264
- elif self.cv is None and self.model_task_type != ModelTaskType.REGRESSION:
3265
- msg = self.bundle.get("group_k_fold_in_classification")
3266
- self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
3475
+ if self.cv is None:
3476
+ date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
3477
+ # Check Multivariate time series
3478
+ if (
3479
+ date_column
3480
+ and self.model_task_type == ModelTaskType.REGRESSION
3481
+ and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys()))
3482
+ == 0
3483
+ and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
3484
+ ):
3485
+ msg = self.bundle.get("multivariate_timeseries_detected")
3486
+ self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
3487
+ elif self.model_task_type != ModelTaskType.REGRESSION:
3488
+ msg = self.bundle.get("group_k_fold_in_classification")
3489
+ self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
3490
+
3491
+ if self.cv == CVType.group_k_fold:
3267
3492
  group_columns = self._get_group_columns(df, self.fit_search_keys)
3268
3493
  self.runtime_parameters.properties["cv_params.group_columns"] = ",".join(group_columns)
3269
3494
  self.runtime_parameters.properties["cv_params.shuffle_kfold"] = "True"
@@ -3275,7 +3500,7 @@ if response.status_code == 200:
3275
3500
  self.cv = cv
3276
3501
  self.runtime_parameters.properties["cv_type"] = self.cv.name
3277
3502
 
3278
- def get_columns_by_search_keys(self, keys: List[str]):
3503
+ def get_columns_by_search_keys(self, keys: list[str]):
3279
3504
  if "HEM" in keys:
3280
3505
  keys.append("EMAIL")
3281
3506
  if "DATE" in keys:
@@ -3287,50 +3512,44 @@ if response.status_code == 200:
3287
3512
  self,
3288
3513
  X: pd.DataFrame,
3289
3514
  y: Optional[pd.Series] = None,
3290
- eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]] = None,
3515
+ eval_set: Optional[list[tuple[pd.DataFrame, pd.Series]]] = None,
3291
3516
  is_transform: bool = False,
3292
- ) -> Tuple[pd.DataFrame, pd.Series, Optional[List[Tuple[pd.DataFrame, pd.Series]]]]:
3517
+ silent: bool = False,
3518
+ ) -> tuple[pd.DataFrame, pd.Series, Optional[list[tuple[pd.DataFrame, pd.Series]]]]:
3293
3519
  validated_X = self._validate_X(X, is_transform)
3294
3520
  validated_y = self._validate_y(validated_X, y, enforce_y=not is_transform)
3295
- validated_eval_set = self._validate_eval_set(validated_X, eval_set)
3521
+ validated_eval_set = self._validate_eval_set(validated_X, eval_set, silent=silent)
3296
3522
  return validated_X, validated_y, validated_eval_set
3297
3523
 
3298
3524
  def _encode_id_columns(
3299
3525
  self,
3300
3526
  X: pd.DataFrame,
3301
- columns_renaming: Optional[Dict[str, str]] = None,
3302
- ) -> Tuple[pd.DataFrame, Dict[str, List[Any]]]:
3303
- columns_renaming = columns_renaming or {}
3527
+ ) -> tuple[pd.DataFrame, dict[str, list[Any]]]:
3304
3528
  unknown_dict = {}
3305
3529
 
3306
3530
  if self.id_columns and self.id_columns_encoder is not None:
3307
- inverse_columns_renaming = {v: k for k, v in columns_renaming.items()}
3308
- renamed_id_columns = [
3309
- inverse_columns_renaming.get(col, col) for col in self.id_columns_encoder.feature_names_in_
3310
- ]
3311
- self.logger.info(f"Convert id columns to int: {renamed_id_columns}")
3312
- encoded = self.id_columns_encoder.transform(X[renamed_id_columns].rename(columns=columns_renaming))
3313
- for i, c in enumerate(renamed_id_columns):
3314
- unknown_values = X[encoded[:, i] == -1][c].unique().tolist()
3315
- if len(unknown_values) > 0:
3316
- unknown_dict[c] = unknown_values
3317
- X[renamed_id_columns] = encoded
3318
- X = X.loc[(X[renamed_id_columns] != -1).all(axis=1)]
3319
-
3320
- if len(unknown_dict) > 0:
3321
- self.logger.warning(f"Unknown values in id columns: {unknown_dict}")
3531
+ encoding_id_columns = [c for c in self.id_columns if c in X.columns]
3532
+ if len(encoding_id_columns) > 0:
3533
+ self.logger.info(f"Convert id columns to int: {encoding_id_columns}")
3534
+ encoded = self.id_columns_encoder.transform(X[encoding_id_columns])
3535
+ for i, c in enumerate(encoding_id_columns):
3536
+ unknown_values = X[encoded[:, i] == -1][c].unique().tolist()
3537
+ if len(unknown_values) > 0:
3538
+ unknown_dict[c] = unknown_values
3539
+ X[encoding_id_columns] = encoded
3540
+ X = X.loc[(X[encoding_id_columns] != -1).all(axis=1)]
3541
+
3542
+ if len(unknown_dict) > 0:
3543
+ self.logger.warning(f"Unknown values in id columns: {unknown_dict}")
3322
3544
 
3323
3545
  return X, unknown_dict
3324
3546
 
3325
- def _decode_id_columns(self, X: pd.DataFrame, columns_renaming: Dict[str, str]):
3326
- columns_renaming = columns_renaming or {}
3547
+ def _decode_id_columns(self, X: pd.DataFrame):
3327
3548
  if self.id_columns and self.id_columns_encoder is not None:
3328
- inverse_columns_renaming = {v: k for k, v in columns_renaming.items()}
3329
- renamed_id_columns = [
3330
- inverse_columns_renaming.get(col, col) for col in self.id_columns_encoder.feature_names_in_
3331
- ]
3332
- decoded = self.id_columns_encoder.inverse_transform(X[renamed_id_columns].rename(columns=columns_renaming))
3333
- X[renamed_id_columns] = decoded
3549
+ decoding_id_columns = [c for c in self.id_columns if c in X.columns]
3550
+ if len(decoding_id_columns) > 0:
3551
+ decoded = self.id_columns_encoder.inverse_transform(X[self.id_columns])
3552
+ X[self.id_columns] = decoded
3334
3553
 
3335
3554
  return X
3336
3555
 
@@ -3424,12 +3643,32 @@ if response.status_code == 200:
3424
3643
 
3425
3644
  return validated_y
3426
3645
 
3427
- def _validate_eval_set(self, X: pd.DataFrame, eval_set: Optional[List[Tuple[pd.DataFrame, pd.Series]]]):
3646
+ def _validate_eval_set(
3647
+ self, X: pd.DataFrame, eval_set: Optional[list[tuple[pd.DataFrame, pd.Series]]], silent: bool = False
3648
+ ):
3428
3649
  if eval_set is None:
3429
3650
  return None
3430
- return [self._validate_eval_set_pair(X, eval_pair) for eval_pair in eval_set]
3651
+ validated_eval_set = []
3652
+ has_date = self._get_date_column(self.search_keys) is not None
3653
+ for idx, eval_pair in enumerate(eval_set):
3654
+ validated_pair = self._validate_eval_set_pair(X, eval_pair)
3655
+ if validated_pair[1].isna().all():
3656
+ if not has_date:
3657
+ msg = self.bundle.get("oot_without_date_not_supported").format(idx + 1)
3658
+ elif self.columns_for_online_api:
3659
+ msg = self.bundle.get("oot_with_online_sources_not_supported").format(idx + 1)
3660
+ else:
3661
+ msg = None
3662
+ if msg:
3663
+ if not silent:
3664
+ print(msg)
3665
+ self.logger.warning(msg)
3666
+ continue
3667
+ validated_eval_set.append(validated_pair)
3431
3668
 
3432
- def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
3669
+ return validated_eval_set
3670
+
3671
+ def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: tuple) -> tuple[pd.DataFrame, pd.Series]:
3433
3672
  if len(eval_pair) != 2:
3434
3673
  raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
3435
3674
  eval_X, eval_y = eval_pair
@@ -3502,20 +3741,22 @@ if response.status_code == 200:
3502
3741
  raise ValidationError(self.bundle.get("unsupported_y_type_eval_set").format(type(eval_y)))
3503
3742
 
3504
3743
  eval_y_nunique = validated_eval_y.nunique()
3505
- if eval_y_nunique < 2:
3744
+ is_oot = validated_eval_y.isna().all()
3745
+ if not is_oot and eval_y_nunique < 2:
3506
3746
  raise ValidationError(self.bundle.get("y_is_constant_eval_set"))
3507
3747
 
3508
- if self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
3748
+ if not is_oot and self.model_task_type == ModelTaskType.BINARY and eval_y_nunique != 2:
3509
3749
  raise ValidationError(self.bundle.get("binary_target_eval_unique_count_not_2").format(eval_y_nunique))
3510
3750
 
3511
- # Check for duplicates between train and eval sets by comparing all values
3512
- train_eval_intersection = pd.merge(X, validated_eval_X, how="inner")
3513
- if len(train_eval_intersection) > 0:
3514
- raise ValidationError(self.bundle.get("eval_x_has_train_samples"))
3751
+ if not is_oot:
3752
+ # Check for duplicates between train and eval sets by comparing all values
3753
+ train_eval_intersection = pd.merge(X, validated_eval_X, how="inner")
3754
+ if len(train_eval_intersection) > 0:
3755
+ raise ValidationError(self.bundle.get("eval_x_has_train_samples"))
3515
3756
 
3516
3757
  return validated_eval_X, validated_eval_y
3517
3758
 
3518
- def _validate_baseline_score(self, X: pd.DataFrame, eval_set: Optional[List[Tuple]]):
3759
+ def _validate_baseline_score(self, X: pd.DataFrame, eval_set: Optional[list[tuple]]):
3519
3760
  if self.baseline_score_column is not None:
3520
3761
  if self.baseline_score_column not in X.columns:
3521
3762
  raise ValidationError(
@@ -3527,13 +3768,15 @@ if response.status_code == 200:
3527
3768
  if isinstance(eval_set, tuple):
3528
3769
  eval_set = [eval_set]
3529
3770
  for eval in eval_set:
3530
- if self.baseline_score_column not in eval[0].columns:
3531
- raise ValidationError(self.bundle.get("baseline_score_column_not_exists"))
3532
- if eval[0][self.baseline_score_column].isna().any():
3533
- raise ValidationError(self.bundle.get("baseline_score_column_has_na"))
3771
+ is_oot = np.all(pd.isna(eval[1]))
3772
+ if not is_oot:
3773
+ if self.baseline_score_column not in eval[0].columns:
3774
+ raise ValidationError(self.bundle.get("baseline_score_column_not_exists"))
3775
+ if eval[0][self.baseline_score_column].isna().any():
3776
+ raise ValidationError(self.bundle.get("baseline_score_column_has_na"))
3534
3777
 
3535
3778
  @staticmethod
3536
- def _sample_X_and_y(X: pd.DataFrame, y: pd.Series, enriched_X: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
3779
+ def _sample_X_and_y(X: pd.DataFrame, y: pd.Series, enriched_X: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series]:
3537
3780
  Xy = pd.concat([X, y], axis=1)
3538
3781
  Xy = pd.merge(Xy, enriched_X, left_index=True, right_index=True, how="inner", suffixes=("", "enriched"))
3539
3782
  return Xy[X.columns].copy(), Xy[TARGET].copy()
@@ -3541,7 +3784,7 @@ if response.status_code == 200:
3541
3784
  @staticmethod
3542
3785
  def _sort_by_system_record_id(
3543
3786
  X: pd.DataFrame, y: pd.Series, cv: Optional[CVType]
3544
- ) -> Tuple[pd.DataFrame, pd.Series]:
3787
+ ) -> tuple[pd.DataFrame, pd.Series]:
3545
3788
  if cv not in [CVType.time_series, CVType.blocked_time_series]:
3546
3789
  record_id_column = ENTITY_SYSTEM_RECORD_ID if ENTITY_SYSTEM_RECORD_ID in X else SYSTEM_RECORD_ID
3547
3790
  Xy = X.copy()
@@ -3558,8 +3801,8 @@ if response.status_code == 200:
3558
3801
  # Deprecated
3559
3802
  @staticmethod
3560
3803
  def _sort_by_keys(
3561
- X: pd.DataFrame, y: pd.Series, search_keys: Dict[str, SearchKey], cv: Optional[CVType]
3562
- ) -> Tuple[pd.DataFrame, pd.Series]:
3804
+ X: pd.DataFrame, y: pd.Series, search_keys: dict[str, SearchKey], cv: Optional[CVType]
3805
+ ) -> tuple[pd.DataFrame, pd.Series]:
3563
3806
  if cv not in [CVType.time_series, CVType.blocked_time_series]:
3564
3807
  if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
3565
3808
  date_column = DateTimeSearchKeyConverter.DATETIME_COL
@@ -3599,12 +3842,10 @@ if response.status_code == 200:
3599
3842
  self,
3600
3843
  X: pd.DataFrame,
3601
3844
  y: Union[pd.Series, np.ndarray, list, None] = None,
3602
- eval_set: Optional[List[tuple]] = None,
3603
- exclude_features_sources: Optional[List[str]] = None,
3845
+ eval_set: Optional[list[tuple]] = None,
3846
+ exclude_features_sources: Optional[list[str]] = None,
3604
3847
  calculate_metrics: Optional[bool] = None,
3605
3848
  cv: Optional[Any] = None,
3606
- importance_threshold: Optional[Any] = None,
3607
- max_features: Optional[Any] = None,
3608
3849
  scoring: Optional[Any] = None,
3609
3850
  estimator: Optional[Any] = None,
3610
3851
  remove_outliers_calc_metrics: Optional[bool] = None,
@@ -3620,8 +3861,6 @@ if response.status_code == 200:
3620
3861
  f"Runtime parameters: {self.runtime_parameters}\n"
3621
3862
  f"Date format: {self.date_format}\n"
3622
3863
  f"CV: {cv}\n"
3623
- f"importance_threshold: {importance_threshold}\n"
3624
- f"max_features: {max_features}\n"
3625
3864
  f"Shared datasets: {self.shared_datasets}\n"
3626
3865
  f"Random state: {self.random_state}\n"
3627
3866
  f"Generate features: {self.generate_features}\n"
@@ -3685,7 +3924,7 @@ if response.status_code == 200:
3685
3924
  except Exception:
3686
3925
  self.logger.warning("Failed to log debug information", exc_info=True)
3687
3926
 
3688
- def __handle_index_search_keys(self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> pd.DataFrame:
3927
+ def __handle_index_search_keys(self, df: pd.DataFrame, search_keys: dict[str, SearchKey]) -> pd.DataFrame:
3689
3928
  index_names = df.index.names if df.index.names != [None] else [DEFAULT_INDEX]
3690
3929
  index_search_keys = set(index_names).intersection(search_keys.keys())
3691
3930
  if len(index_search_keys) > 0:
@@ -3704,7 +3943,7 @@ if response.status_code == 200:
3704
3943
  return df
3705
3944
 
3706
3945
  def _add_current_date_as_key(
3707
- self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
3946
+ self, df: pd.DataFrame, search_keys: dict[str, SearchKey], bundle: ResourceBundle, silent: bool = False
3708
3947
  ) -> pd.DataFrame:
3709
3948
  if (
3710
3949
  set(search_keys.values()) == {SearchKey.PHONE}
@@ -3712,7 +3951,8 @@ if response.status_code == 200:
3712
3951
  or set(search_keys.values()) == {SearchKey.HEM}
3713
3952
  or set(search_keys.values()) == {SearchKey.COUNTRY, SearchKey.POSTAL_CODE}
3714
3953
  ):
3715
- self.__log_warning(bundle.get("current_date_added"))
3954
+ if not silent:
3955
+ self.__log_warning(bundle.get("current_date_added"))
3716
3956
  df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
3717
3957
  search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
3718
3958
  converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, generate_cyclical_features=False)
@@ -3720,7 +3960,7 @@ if response.status_code == 200:
3720
3960
  return df
3721
3961
 
3722
3962
  @staticmethod
3723
- def _get_group_columns(df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> List[str]:
3963
+ def _get_group_columns(df: pd.DataFrame, search_keys: dict[str, SearchKey]) -> list[str]:
3724
3964
  search_key_priority = [SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM, SearchKey.IP]
3725
3965
  for key_type in search_key_priority:
3726
3966
  if key_type in search_keys.values():
@@ -3733,7 +3973,7 @@ if response.status_code == 200:
3733
3973
  ]
3734
3974
 
3735
3975
  @staticmethod
3736
- def _get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
3976
+ def _get_email_column(search_keys: dict[str, SearchKey]) -> Optional[str]:
3737
3977
  cols = [col for col, t in search_keys.items() if t == SearchKey.EMAIL]
3738
3978
  if len(cols) > 1:
3739
3979
  raise Exception("More than one email column found after unnest")
@@ -3741,7 +3981,7 @@ if response.status_code == 200:
3741
3981
  return cols[0]
3742
3982
 
3743
3983
  @staticmethod
3744
- def _get_hem_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
3984
+ def _get_hem_column(search_keys: dict[str, SearchKey]) -> Optional[str]:
3745
3985
  cols = [col for col, t in search_keys.items() if t == SearchKey.HEM]
3746
3986
  if len(cols) > 1:
3747
3987
  raise Exception("More than one hem column found after unnest")
@@ -3749,7 +3989,7 @@ if response.status_code == 200:
3749
3989
  return cols[0]
3750
3990
 
3751
3991
  @staticmethod
3752
- def _get_ip_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
3992
+ def _get_ip_column(search_keys: dict[str, SearchKey]) -> Optional[str]:
3753
3993
  cols = [col for col, t in search_keys.items() if t == SearchKey.IP]
3754
3994
  if len(cols) > 1:
3755
3995
  raise Exception("More than one ip column found after unnest")
@@ -3757,32 +3997,32 @@ if response.status_code == 200:
3757
3997
  return cols[0]
3758
3998
 
3759
3999
  @staticmethod
3760
- def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
4000
+ def _get_phone_column(search_keys: dict[str, SearchKey]) -> Optional[str]:
3761
4001
  for col, t in search_keys.items():
3762
4002
  if t == SearchKey.PHONE:
3763
4003
  return col
3764
4004
 
3765
4005
  @staticmethod
3766
- def _get_country_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
4006
+ def _get_country_column(search_keys: dict[str, SearchKey]) -> Optional[str]:
3767
4007
  for col, t in search_keys.items():
3768
4008
  if t == SearchKey.COUNTRY:
3769
4009
  return col
3770
4010
 
3771
4011
  @staticmethod
3772
- def _get_postal_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
4012
+ def _get_postal_column(search_keys: dict[str, SearchKey]) -> Optional[str]:
3773
4013
  for col, t in search_keys.items():
3774
4014
  if t == SearchKey.POSTAL_CODE:
3775
4015
  return col
3776
4016
 
3777
4017
  @staticmethod
3778
- def _get_date_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
4018
+ def _get_date_column(search_keys: dict[str, SearchKey]) -> Optional[str]:
3779
4019
  return SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
3780
4020
 
3781
4021
  def _explode_multiple_search_keys(
3782
- self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], columns_renaming: Dict[str, str]
3783
- ) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
4022
+ self, df: pd.DataFrame, search_keys: dict[str, SearchKey], columns_renaming: dict[str, str]
4023
+ ) -> tuple[pd.DataFrame, dict[str, list[str]]]:
3784
4024
  # find groups of multiple search keys
3785
- search_key_names_by_type: Dict[SearchKey, List[str]] = {}
4025
+ search_key_names_by_type: dict[SearchKey, list[str]] = {}
3786
4026
  for key_name, key_type in search_keys.items():
3787
4027
  search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
3788
4028
  search_key_names_by_type = {
@@ -3815,14 +4055,18 @@ if response.status_code == 200:
3815
4055
  self.logger.info(f"Finished explosion. Size after: {len(df)}")
3816
4056
  return df, unnest_search_keys
3817
4057
 
3818
- def __add_fit_system_record_id(
3819
- self,
4058
+ @staticmethod
4059
+ def _add_fit_system_record_id(
3820
4060
  df: pd.DataFrame,
3821
- search_keys: Dict[str, SearchKey],
4061
+ search_keys: dict[str, SearchKey],
3822
4062
  id_name: str,
3823
4063
  target_name: str,
3824
- columns_renaming: Dict[str, str],
3825
- silent: bool = False,
4064
+ columns_renaming: dict[str, str],
4065
+ id_columns: Optional[list[str]],
4066
+ cv: Optional[CVType],
4067
+ model_task_type: ModelTaskType,
4068
+ logger: Optional[logging.Logger] = None,
4069
+ bundle: ResourceBundle = bundle,
3826
4070
  ) -> pd.DataFrame:
3827
4071
  original_index_name = df.index.name
3828
4072
  index_name = df.index.name or DEFAULT_INDEX
@@ -3851,32 +4095,33 @@ if response.status_code == 200:
3851
4095
  columns_to_sort = [date_column] if date_column is not None else []
3852
4096
 
3853
4097
  do_sorting = True
3854
- if self.id_columns and self.cv.is_time_series():
4098
+ if id_columns and cv is not None and cv.is_time_series():
3855
4099
  # Check duplicates by date and id_columns
3856
4100
  reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
3857
- renamed_id_columns = [reversed_columns_renaming.get(c, c) for c in self.id_columns]
4101
+ renamed_id_columns = [reversed_columns_renaming.get(c, c) for c in id_columns]
3858
4102
  duplicate_check_columns = [c for c in renamed_id_columns if c in df.columns]
3859
4103
  if date_column is not None:
3860
4104
  duplicate_check_columns.append(date_column)
3861
4105
 
3862
4106
  duplicates = df.duplicated(subset=duplicate_check_columns, keep=False)
3863
4107
  if duplicates.any():
3864
- raise ValueError(self.bundle.get("date_and_id_columns_duplicates").format(duplicates.sum()))
4108
+ raise ValueError(bundle.get("date_and_id_columns_duplicates").format(duplicates.sum()))
3865
4109
  else:
3866
4110
  columns_to_hash = list(set(list(search_keys.keys()) + renamed_id_columns + [target_name]))
3867
4111
  columns_to_hash = sort_columns(
3868
4112
  df[columns_to_hash],
3869
4113
  target_name,
3870
4114
  search_keys,
3871
- self.model_task_type,
4115
+ model_task_type,
3872
4116
  sort_exclude_columns,
3873
- logger=self.logger,
4117
+ logger=logger,
3874
4118
  )
3875
4119
  else:
3876
4120
  columns_to_hash = sort_columns(
3877
- df, target_name, search_keys, self.model_task_type, sort_exclude_columns, logger=self.logger
4121
+ df, target_name, search_keys, model_task_type, sort_exclude_columns, logger=logger
3878
4122
  )
3879
- if do_sorting:
4123
+
4124
+ def sort_df(df: pd.DataFrame) -> pd.DataFrame:
3880
4125
  search_keys_hash = "search_keys_hash"
3881
4126
  if len(columns_to_hash) > 0:
3882
4127
  factorized_df = df.copy()
@@ -3890,6 +4135,24 @@ if response.status_code == 200:
3890
4135
 
3891
4136
  if search_keys_hash in df.columns:
3892
4137
  df.drop(columns=search_keys_hash, inplace=True)
4138
+ return df
4139
+
4140
+ if do_sorting:
4141
+ sorted_dfs = []
4142
+ if EVAL_SET_INDEX in df.columns:
4143
+ # Sort train and eval sets separately
4144
+ train = df[df[EVAL_SET_INDEX] == 0].copy()
4145
+ sorted_dfs.append(sort_df(train))
4146
+
4147
+ for eval_set_index in df[EVAL_SET_INDEX].unique():
4148
+ if eval_set_index == 0:
4149
+ continue
4150
+ eval_set_df = df[df[EVAL_SET_INDEX] == eval_set_index].copy()
4151
+ sorted_dfs.append(sort_df(eval_set_df))
4152
+
4153
+ df = pd.concat(sorted_dfs)
4154
+ else:
4155
+ df = sort_df(df)
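
With the sort_df helper above, _add_fit_system_record_id now sorts the train fold and each eval fold separately before assigning sequential record ids, so the ordering inside one fold cannot be affected by rows from another. A minimal sketch of that per-fold treatment, with hypothetical column names:

    from typing import Optional

    import pandas as pd

    def assign_record_ids_per_fold(
        df: pd.DataFrame,
        fold_col: str = "eval_set_index",       # hypothetical name for the train/eval fold marker
        sort_cols: Optional[list[str]] = None,  # columns that define the deterministic order
        id_col: str = "system_record_id",       # hypothetical name for the generated id
    ) -> pd.DataFrame:
        sort_cols = sort_cols or []
        parts = []
        # Sort each fold independently so rows from different folds never interleave.
        for fold in sorted(df[fold_col].unique()):
            part = df[df[fold_col] == fold]
            if sort_cols:
                part = part.sort_values(by=sort_cols)
            parts.append(part)
        out = pd.concat(parts).reset_index(drop=True)
        out[id_col] = out.index  # sequential id that preserves the per-fold order
        return out
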
3893
4156
 
3894
4157
  df = df.reset_index(drop=True).reset_index()
3895
4158
  # system_record_id saves correct order for fit
@@ -3900,11 +4163,6 @@ if response.status_code == 200:
3900
4163
  df.index.name = original_index_name
3901
4164
  df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
3902
4165
 
3903
- # meaning_types[id_name] = (
3904
- # FileColumnMeaningType.SYSTEM_RECORD_ID
3905
- # if id_name == SYSTEM_RECORD_ID
3906
- # else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
3907
- # )
3908
4166
  return df
3909
4167
 
3910
4168
  def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -3925,7 +4183,7 @@ if response.status_code == 200:
3925
4183
 
3926
4184
  return df
3927
4185
 
3928
- def __add_country_code(self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> pd.DataFrame:
4186
+ def __add_country_code(self, df: pd.DataFrame, search_keys: dict[str, SearchKey]) -> pd.DataFrame:
3929
4187
  self.country_added = False
3930
4188
 
3931
4189
  if self.country_code is not None and SearchKey.COUNTRY not in search_keys.values():
@@ -3951,6 +4209,7 @@ if response.status_code == 200:
3951
4209
  self.logger.error(f"result features not found by search_task_id: {self.get_search_id()}")
3952
4210
  raise RuntimeError(self.bundle.get("features_wasnt_returned"))
3953
4211
 
4212
+ result_features = result_features.copy()
3954
4213
  if EVAL_SET_INDEX in result_features.columns:
3955
4214
  result_features = result_features.drop(columns=EVAL_SET_INDEX)
3956
4215
 
@@ -3978,6 +4237,17 @@ if response.status_code == 200:
3978
4237
 
3979
4238
  # TODO drop system_record_id before merge
3980
4239
  # Merge with result features
4240
+ # Align dtypes for join key to avoid int/float merge warnings
4241
+ if ENTITY_SYSTEM_RECORD_ID in input_df.columns and ENTITY_SYSTEM_RECORD_ID in result_features.columns:
4242
+ input_is_float = pd.api.types.is_float_dtype(input_df[ENTITY_SYSTEM_RECORD_ID])
4243
+ result_is_float = pd.api.types.is_float_dtype(result_features[ENTITY_SYSTEM_RECORD_ID])
4244
+ if input_is_float or result_is_float:
4245
+ input_df[ENTITY_SYSTEM_RECORD_ID] = pd.to_numeric(
4246
+ input_df[ENTITY_SYSTEM_RECORD_ID], errors="coerce"
4247
+ ).astype("float64")
4248
+ result_features[ENTITY_SYSTEM_RECORD_ID] = pd.to_numeric(
4249
+ result_features[ENTITY_SYSTEM_RECORD_ID], errors="coerce"
4250
+ ).astype("float64")
3981
4251
  result_features = pd.merge(
3982
4252
  input_df,
3983
4253
  result_features,
@@ -4039,7 +4309,7 @@ if response.status_code == 200:
4039
4309
 
4040
4310
  return importances
4041
4311
 
4042
- def __get_categorical_features(self) -> List[str]:
4312
+ def __get_categorical_features(self) -> list[str]:
4043
4313
  features_meta = self._search_task.get_all_features_metadata_v2()
4044
4314
  if features_meta is None:
4045
4315
  raise Exception(self.bundle.get("missing_features_meta"))
@@ -4047,10 +4317,16 @@ if response.status_code == 200:
4047
4317
  return [f.name for f in features_meta if f.type == "categorical" and f.name not in (self.id_columns or [])]
4048
4318
 
4049
4319
  def __prepare_feature_importances(
4050
- self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
4320
+ self,
4321
+ trace_id: str,
4322
+ clients_features_df: pd.DataFrame,
4323
+ updated_shaps: Optional[dict[str, float]] = None,
4324
+ update_selected_features: bool = True,
4325
+ silent=False,
4051
4326
  ):
4052
4327
  if self._search_task is None:
4053
4328
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
4329
+ selected_features = self._search_task.get_selected_features(trace_id)
4054
4330
  features_meta = self._search_task.get_all_features_metadata_v2()
4055
4331
  if features_meta is None:
4056
4332
  raise Exception(self.bundle.get("missing_features_meta"))
@@ -4060,11 +4336,10 @@ if response.status_code == 200:
4060
4336
  features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
4061
4337
 
4062
4338
  # Ensure the column names carry their hash suffixes
4063
- df = df.rename(columns=original_names_dict)
4339
+ clients_features_df = clients_features_df.rename(columns=original_names_dict)
4064
4340
 
4065
4341
  self.feature_names_ = []
4066
4342
  self.external_source_feature_names = []
4067
- self.zero_shap_client_features = []
4068
4343
  self.feature_importances_ = []
4069
4344
  features_info = []
4070
4345
  features_info_without_links = []
@@ -4072,11 +4347,18 @@ if response.status_code == 200:
4072
4347
 
4073
4348
  original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}
4074
4349
 
4350
+ selected_features_meta = []
4075
4351
  for feature_meta in features_meta:
4076
- if feature_meta.name in original_names_dict.keys():
4077
- feature_meta.name = original_names_dict[feature_meta.name]
4352
+ original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
4353
+ feature_meta.name = original_name
4354
+
4355
+ is_client_feature = original_name in clients_features_df.columns
4356
+
4357
+ if selected_features is not None and feature_meta.name not in selected_features:
4358
+ self.logger.info(f"Feature {feature_meta.name} is not selected before and skipped")
4359
+ continue
4078
4360
 
4079
- is_client_feature = original_names_dict.get(feature_meta.name, feature_meta.name) in df.columns
4361
+ selected_features_meta.append(feature_meta)
4080
4362
 
4081
4363
  # Show and update shap values for client features only if select_features is True
4082
4364
  if updated_shaps is not None and (not is_client_feature or self.fit_select_features):
@@ -4089,19 +4371,24 @@ if response.status_code == 200:
4089
4371
  updating_shap = 0.0
4090
4372
  feature_meta.shap_value = updating_shap
4091
4373
 
4092
- features_meta.sort(key=lambda m: (-m.shap_value, m.name))
4374
+ selected_features_meta.sort(key=lambda m: (-m.shap_value, m.name))
4093
4375
 
4094
- for feature_meta in features_meta:
4376
+ for feature_meta in selected_features_meta:
4095
4377
  original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
4096
- is_client_feature = original_name in df.columns
4378
+ is_client_feature = original_name in clients_features_df.columns
4097
4379
 
4098
4380
  if not is_client_feature:
4099
4381
  self.external_source_feature_names.append(original_name)
4100
4382
 
4383
+ if self.psi_values is not None:
4384
+ if original_name in self.psi_values:
4385
+ feature_meta.psi_value = self.psi_values[original_name]
4386
+ else:
4387
+ continue
4388
+
4101
4389
  # TODO make a decision about selected features based on special flag from mlb
4390
+
4102
4391
  if original_shaps.get(feature_meta.name, 0.0) == 0.0:
4103
- if is_client_feature and self.fit_select_features:
4104
- self.zero_shap_client_features.append(original_name)
4105
4392
  continue
4106
4393
 
4107
4394
  # Use only important features
@@ -4122,14 +4409,19 @@ if response.status_code == 200:
4122
4409
  self.feature_names_.append(feature_meta.name)
4123
4410
  self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
4124
4411
 
4125
- df_for_sample = features_df if feature_meta.name in features_df.columns else df
4412
+ df_for_sample = features_df if feature_meta.name in features_df.columns else clients_features_df
4126
4413
  feature_info = FeatureInfo.from_metadata(feature_meta, df_for_sample, is_client_feature)
4127
4414
  features_info.append(feature_info.to_row(self.bundle))
4128
4415
  features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
4129
4416
  internal_features_info.append(feature_info.to_internal_row(self.bundle))
4130
4417
 
4418
+ if update_selected_features:
4419
+ self._search_task.update_selected_features(trace_id, self.feature_names_)
4420
+
4131
4421
  if len(features_info) > 0:
4132
4422
  self.features_info = pd.DataFrame(features_info)
4423
+ if self.features_info[self.bundle.get("features_info_psi")].isna().all():
4424
+ self.features_info.drop(columns=[self.bundle.get("features_info_psi")], inplace=True)
4133
4425
  self._features_info_without_links = pd.DataFrame(features_info_without_links)
4134
4426
  self._internal_features_info = pd.DataFrame(internal_features_info)
4135
4427
  if not silent:
@@ -4253,32 +4545,10 @@ if response.status_code == 200:
4253
4545
  )
4254
4546
  )
4255
4547
 
4256
- def __filtered_importance_names(
4257
- self, importance_threshold: Optional[float], max_features: Optional[int], trace_id: str, df: pd.DataFrame
4258
- ) -> List[str]:
4259
- # get features importance from server
4260
- filtered_importances = self.__get_features_importance_from_server(trace_id, df)
4261
-
4262
- if len(filtered_importances) == 0:
4263
- return []
4264
-
4265
- if importance_threshold is not None:
4266
- filtered_importances = [
4267
- (name, importance)
4268
- for name, importance in filtered_importances.items()
4269
- if importance > importance_threshold
4270
- ]
4271
- if max_features is not None:
4272
- filtered_importances = list(filtered_importances)[:max_features]
4273
- if len(filtered_importances) == 0:
4274
- return []
4275
- filtered_importance_names, _ = zip(*filtered_importances)
4276
- return list(filtered_importance_names)
4277
-
4278
4548
  def __prepare_search_keys(
4279
4549
  self,
4280
4550
  x: pd.DataFrame,
4281
- search_keys: Dict[str, SearchKey],
4551
+ search_keys: dict[str, SearchKey],
4282
4552
  is_demo_dataset: bool,
4283
4553
  is_transform=False,
4284
4554
  silent_mode=False,
@@ -4391,8 +4661,6 @@ if response.status_code == 200:
4391
4661
  self,
4392
4662
  scoring: Union[Callable, str, None],
4393
4663
  estimator: Optional[Any],
4394
- importance_threshold: Optional[float],
4395
- max_features: Optional[int],
4396
4664
  remove_outliers_calc_metrics: Optional[bool],
4397
4665
  trace_id: str,
4398
4666
  progress_bar: Optional[ProgressBar] = None,
@@ -4401,8 +4669,6 @@ if response.status_code == 200:
4401
4669
  self.metrics = self.calculate_metrics(
4402
4670
  scoring=scoring,
4403
4671
  estimator=estimator,
4404
- importance_threshold=importance_threshold,
4405
- max_features=max_features,
4406
4672
  remove_outliers_calc_metrics=remove_outliers_calc_metrics,
4407
4673
  trace_id=trace_id,
4408
4674
  internal_call=True,
@@ -4413,22 +4679,15 @@ if response.status_code == 200:
4413
4679
  msg = self.bundle.get("quality_metrics_header")
4414
4680
  display_html_dataframe(self.metrics, self.metrics, msg)
4415
4681
 
4416
- def __show_selected_features(self, search_keys: Dict[str, SearchKey]):
4417
- search_key_names = [col for col, tpe in search_keys.items() if tpe != SearchKey.CUSTOM_KEY]
4418
- if self.fit_columns_renaming:
4419
- search_key_names = sorted(set([self.fit_columns_renaming.get(col, col) for col in search_key_names]))
4420
- msg = self.bundle.get("features_info_header").format(len(self.feature_names_), search_key_names)
4421
-
4682
+ def __show_selected_features(self):
4422
4683
  try:
4423
4684
  _ = get_ipython() # type: ignore
4424
4685
 
4425
- print(Format.GREEN + Format.BOLD + msg + Format.END)
4426
- self.logger.info(msg)
4427
4686
  if len(self.feature_names_) > 0:
4428
4687
  self.features_info_display_handle = display_html_dataframe(
4429
4688
  self.features_info,
4430
4689
  self._features_info_without_links,
4431
- self.bundle.get("relevant_features_header"),
4690
+ self.bundle.get("relevant_features_header").format(len(self.feature_names_)),
4432
4691
  display_id=f"features_info_{uuid.uuid4()}",
4433
4692
  )
4434
4693
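The header text previously printed here is folded into the `relevant_features_header` message, which now receives the feature count directly. The surrounding `try`/`except` continues to use the presence of `get_ipython()` to decide between rich HTML output and a plain-text fallback; a self-contained version of that detection pattern looks roughly like the sketch below (the helper name is hypothetical).

```python
# Detect an IPython/Jupyter environment: IPython injects get_ipython() into the
# interactive namespace, so a NameError means we are in a plain Python process.
def in_ipython() -> bool:
    try:
        get_ipython()  # noqa: F821  (defined only inside IPython)
        return True
    except NameError:
        return False


print("rich display" if in_ipython() else "plain print fallback")
```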
 
@@ -4443,7 +4702,6 @@ if response.status_code == 200:
4443
4702
  msg = self.bundle.get("features_info_zero_important_features")
4444
4703
  self.__log_warning(msg, show_support_link=True)
4445
4704
  except (ImportError, NameError):
4446
- print(msg)
4447
4705
  print(self._internal_features_info)
4448
4706
 
4449
4707
  def __show_report_button(self, display_id: Optional[str] = None, display_handle=None):
@@ -4462,40 +4720,14 @@ if response.status_code == 200:
4462
4720
  except Exception:
4463
4721
  pass
4464
4722
 
4465
- def __validate_importance_threshold(self, importance_threshold: Optional[float]) -> float:
4466
- try:
4467
- return float(importance_threshold) if importance_threshold is not None else 0.0
4468
- except ValueError:
4469
- self.logger.exception(f"Invalid importance_threshold provided: {importance_threshold}")
4470
- raise ValidationError(self.bundle.get("invalid_importance_threshold"))
4471
-
4472
- def __validate_max_features(self, max_features: Optional[int]) -> int:
4473
- try:
4474
- return int(max_features) if max_features is not None else 400
4475
- except ValueError:
4476
- self.logger.exception(f"Invalid max_features provided: {max_features}")
4477
- raise ValidationError(self.bundle.get("invalid_max_features"))
4478
-
4479
- def __filtered_enriched_features(
4480
- self,
4481
- importance_threshold: Optional[float],
4482
- max_features: Optional[int],
4483
- trace_id: str,
4484
- df: pd.DataFrame,
4485
- ) -> List[str]:
4486
- importance_threshold = self.__validate_importance_threshold(importance_threshold)
4487
- max_features = self.__validate_max_features(max_features)
4488
-
4489
- return self.__filtered_importance_names(importance_threshold, max_features, trace_id, df)
4490
-
4491
4723
  def __detect_missing_search_keys(
4492
4724
  self,
4493
4725
  df: pd.DataFrame,
4494
- search_keys: Dict[str, SearchKey],
4726
+ search_keys: dict[str, SearchKey],
4495
4727
  is_demo_dataset: bool,
4496
4728
  silent_mode=False,
4497
4729
  is_transform=False,
4498
- ) -> Dict[str, SearchKey]:
4730
+ ) -> dict[str, SearchKey]:
4499
4731
  sample = df.head(100)
4500
4732
 
4501
4733
  def check_need_detect(search_key: SearchKey):
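Missing-search-key detection still runs on a small head sample (`df.head(100)`) rather than the full frame. The sketch below illustrates the general idea with a hypothetical e-mail heuristic; the function name, regex and 90% threshold are assumptions of this sketch, not the detectors upgini actually uses.

```python
# Illustrative: guess whether a column is an e-mail search key from a small head sample.
import pandas as pd


def looks_like_email(series: pd.Series) -> bool:
    sample = series.dropna().astype(str).head(100)
    if sample.empty:
        return False
    # Share of values matching a simple e-mail pattern.
    return sample.str.contains(r"^[^@\s]+@[^@\s]+\.[^@\s]+$", regex=True).mean() > 0.9
```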
@@ -4641,7 +4873,7 @@ if response.status_code == 200:
4641
4873
  trace_id: str,
4642
4874
  X: Union[pd.DataFrame, pd.Series],
4643
4875
  y: Union[pd.DataFrame, pd.Series, None] = None,
4644
- eval_set: Union[Tuple, None] = None,
4876
+ eval_set: Union[tuple, None] = None,
4645
4877
  ):
4646
4878
  def dump_task(X_, y_, eval_set_):
4647
4879
  with MDC(trace_id=trace_id):
@@ -4651,7 +4883,7 @@ if response.status_code == 200:
4651
4883
 
4652
4884
  with tempfile.TemporaryDirectory() as tmp_dir:
4653
4885
  X_.to_parquet(f"{tmp_dir}/x.parquet", compression="zstd")
4654
- x_digest_sha256 = self.rest_client.compute_file_digest(f"{tmp_dir}/x.parquet")
4886
+ x_digest_sha256 = file_hash(f"{tmp_dir}/x.parquet")
4655
4887
  if self.rest_client.is_file_uploaded(trace_id, x_digest_sha256):
4656
4888
  self.logger.info(
4657
4889
  f"File x.parquet was already uploaded with digest {x_digest_sha256}, skipping"
@@ -4665,7 +4897,7 @@ if response.status_code == 200:
4665
4897
  if isinstance(y_, pd.Series):
4666
4898
  y_ = y_.to_frame()
4667
4899
  y_.to_parquet(f"{tmp_dir}/y.parquet", compression="zstd")
4668
- y_digest_sha256 = self.rest_client.compute_file_digest(f"{tmp_dir}/y.parquet")
4900
+ y_digest_sha256 = file_hash(f"{tmp_dir}/y.parquet")
4669
4901
  if self.rest_client.is_file_uploaded(trace_id, y_digest_sha256):
4670
4902
  self.logger.info(
4671
4903
  f"File y.parquet was already uploaded with digest {y_digest_sha256}, skipping"
@@ -4680,9 +4912,7 @@ if response.status_code == 200:
4680
4912
  if isinstance(eval_x_, pd.Series):
4681
4913
  eval_x_ = eval_x_.to_frame()
4682
4914
  eval_x_.to_parquet(f"{tmp_dir}/eval_x_{idx}.parquet", compression="zstd")
4683
- eval_x_digest_sha256 = self.rest_client.compute_file_digest(
4684
- f"{tmp_dir}/eval_x_{idx}.parquet"
4685
- )
4915
+ eval_x_digest_sha256 = file_hash(f"{tmp_dir}/eval_x_{idx}.parquet")
4686
4916
  if self.rest_client.is_file_uploaded(trace_id, eval_x_digest_sha256):
4687
4917
  self.logger.info(
4688
4918
  f"File eval_x_{idx}.parquet was already uploaded with"
@@ -4699,9 +4929,7 @@ if response.status_code == 200:
4699
4929
  if isinstance(eval_y_, pd.Series):
4700
4930
  eval_y_ = eval_y_.to_frame()
4701
4931
  eval_y_.to_parquet(f"{tmp_dir}/eval_y_{idx}.parquet", compression="zstd")
4702
- eval_y_digest_sha256 = self.rest_client.compute_file_digest(
4703
- f"{tmp_dir}/eval_y_{idx}.parquet"
4704
- )
4932
+ eval_y_digest_sha256 = file_hash(f"{tmp_dir}/eval_y_{idx}.parquet")
4705
4933
  if self.rest_client.is_file_uploaded(trace_id, eval_y_digest_sha256):
4706
4934
  self.logger.info(
4707
4935
  f"File eval_y_{idx}.parquet was already uploaded"
@@ -4736,28 +4964,10 @@ def is_frames_equal(first, second, bundle: ResourceBundle) -> bool:
4736
4964
  raise ValidationError(bundle.get("x_and_eval_x_diff_types").format(type(first), type(second)))
4737
4965
 
4738
4966
 
4739
- def drop_duplicates(df: Union[pd.DataFrame, np.ndarray]) -> pd.DataFrame:
4967
+ def drop_duplicates(df: Union[pd.DataFrame, np.ndarray, Any]) -> pd.DataFrame:
4740
4968
  if isinstance(df, pd.DataFrame):
4741
4969
  return df.drop_duplicates()
4742
4970
  elif isinstance(df, np.ndarray):
4743
4971
  return pd.DataFrame(df).drop_duplicates()
4744
4972
  else:
4745
4973
  return df
4746
-
4747
-
4748
- def hash_input(X: pd.DataFrame, y: Optional[pd.Series] = None, eval_set: Optional[List[Tuple]] = None) -> str:
4749
- hashed_objects = []
4750
- try:
4751
- hashed_objects.append(pd.util.hash_pandas_object(X, index=False).values)
4752
- if y is not None:
4753
- hashed_objects.append(pd.util.hash_pandas_object(y, index=False).values)
4754
- if eval_set is not None:
4755
- if isinstance(eval_set, tuple):
4756
- eval_set = [eval_set]
4757
- for eval_X, eval_y in eval_set:
4758
- hashed_objects.append(pd.util.hash_pandas_object(eval_X, index=False).values)
4759
- hashed_objects.append(pd.util.hash_pandas_object(eval_y, index=False).values)
4760
- common_hash = hashlib.sha256(np.concatenate(hashed_objects)).hexdigest()
4761
- return common_hash
4762
- except Exception:
4763
- return ""