upgini 1.2.114a5__py3-none-any.whl → 1.2.115a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.114a5"
1
+ __version__ = "1.2.115a1"
@@ -12,7 +12,7 @@ from collections import Counter
12
12
  from copy import deepcopy
13
13
  from dataclasses import dataclass
14
14
  from threading import Thread
15
- from typing import Any, Callable, Optional, Set, Union
15
+ from typing import Any, Callable
16
16
 
17
17
  import numpy as np
18
18
  import pandas as pd
@@ -207,34 +207,34 @@ class FeaturesEnricher(TransformerMixin):
207
207
 
208
208
  def __init__(
209
209
  self,
210
- search_keys: Optional[dict[str, SearchKey]] = None,
211
- country_code: Optional[str] = None,
212
- model_task_type: Optional[Union[ModelTaskType, str]] = None,
213
- api_key: Optional[str] = None,
214
- endpoint: Optional[str] = None,
215
- search_id: Optional[str] = None,
216
- shared_datasets: Optional[list[str]] = None,
217
- runtime_parameters: Optional[RuntimeParameters] = None,
218
- date_format: Optional[str] = None,
210
+ search_keys: dict[str, SearchKey] | None = None,
211
+ country_code: str | None = None,
212
+ model_task_type: ModelTaskType | str | None = None,
213
+ api_key: str | None = None,
214
+ endpoint: str | None = None,
215
+ search_id: str | None = None,
216
+ shared_datasets: list[str] | None = None,
217
+ runtime_parameters: RuntimeParameters | None = None,
218
+ date_format: str | None = None,
219
219
  random_state: int = 42,
220
- cv: Optional[CVType] = None,
221
- loss: Optional[str] = None,
220
+ cv: CVType | None = None,
221
+ loss: str | None = None,
222
222
  autodetect_search_keys: bool = True,
223
- generate_features: Optional[list[str]] = None,
224
- columns_for_online_api: Optional[list[str]] = None,
225
- round_embeddings: Optional[int] = None,
223
+ generate_features: list[str] | None = None,
224
+ columns_for_online_api: list[str] | None = None,
225
+ round_embeddings: int | None = None,
226
226
  logs_enabled: bool = True,
227
227
  raise_validation_error: bool = True,
228
- exclude_columns: Optional[list[str]] = None,
229
- baseline_score_column: Optional[Any] = None,
230
- client_ip: Optional[str] = None,
231
- client_visitorid: Optional[str] = None,
232
- custom_bundle_config: Optional[str] = None,
228
+ exclude_columns: list[str] | None = None,
229
+ baseline_score_column: Any | None = None,
230
+ client_ip: str | None = None,
231
+ client_visitorid: str | None = None,
232
+ custom_bundle_config: str | None = None,
233
233
  add_date_if_missing: bool = True,
234
234
  disable_force_downsampling: bool = False,
235
- id_columns: Optional[list[str]] = None,
235
+ id_columns: list[str] | None = None,
236
236
  generate_search_key_features: bool = True,
237
- sample_config: Optional[SampleConfig] = None,
237
+ sample_config: SampleConfig | None = None,
238
238
  print_trace_id: bool = False,
239
239
  **kwargs,
240
240
  ):
@@ -259,16 +259,16 @@ class FeaturesEnricher(TransformerMixin):
259
259
  print(msg)
260
260
 
261
261
  self.passed_features: list[str] = []
262
- self.df_with_original_index: Optional[pd.DataFrame] = None
263
- self.fit_columns_renaming: Optional[dict[str, str]] = None
262
+ self.df_with_original_index: pd.DataFrame | None = None
263
+ self.fit_columns_renaming: dict[str, str] | None = None
264
264
  self.country_added = False
265
265
  self.fit_generated_features: list[str] = []
266
- self.fit_dropped_features: Set[str] = set()
266
+ self.fit_dropped_features: set[str] = set()
267
267
  self.fit_search_keys = search_keys
268
268
  self.warning_counter = WarningCounter()
269
- self.X: Optional[pd.DataFrame] = None
270
- self.y: Optional[pd.Series] = None
271
- self.eval_set: Optional[list[tuple]] = None
269
+ self.X: pd.DataFrame | None = None
270
+ self.y: pd.Series | None = None
271
+ self.eval_set: list[tuple] | None = None
272
272
  self.autodetected_search_keys: dict[str, SearchKey] = {}
273
273
  self.imbalanced = False
274
274
  self.fit_select_features = True
@@ -288,17 +288,17 @@ class FeaturesEnricher(TransformerMixin):
288
288
  self.model_task_type = ModelTaskType.parse(model_task_type)
289
289
  self.model_task_type = model_task_type
290
290
  self.endpoint = endpoint
291
- self._search_task: Optional[SearchTask] = None
291
+ self._search_task: SearchTask | None = None
292
292
  self.features_info: pd.DataFrame = self.EMPTY_FEATURES_INFO
293
293
  self._features_info_without_links: pd.DataFrame = self.EMPTY_FEATURES_INFO
294
294
  self._internal_features_info: pd.DataFrame = self.EMPTY_INTERNAL_FEATURES_INFO
295
295
  self.relevant_data_sources: pd.DataFrame = self.EMPTY_DATA_SOURCES
296
296
  self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
297
- self.metrics: Optional[pd.DataFrame] = None
297
+ self.metrics: pd.DataFrame | None = None
298
298
  self.feature_names_ = []
299
299
  self.external_source_feature_names = []
300
300
  self.feature_importances_ = []
301
- self.psi_values: Optional[dict[str, float]] = None
301
+ self.psi_values: dict[str, float] | None = None
302
302
  self.search_id = search_id
303
303
  self.disable_force_downsampling = disable_force_downsampling
304
304
  self.print_trace_id = print_trace_id
@@ -375,7 +375,7 @@ class FeaturesEnricher(TransformerMixin):
375
375
  self.autofe_features_display_handle = None
376
376
  self.report_button_handle = None
377
377
 
378
- def _get_sample_config(self, sample_config: Optional[SampleConfig] = None):
378
+ def _get_sample_config(self, sample_config: SampleConfig | None = None):
379
379
  sample_config = sample_config or SampleConfig(force_sample_size=Dataset.FORCE_SAMPLE_SIZE)
380
380
 
381
381
  maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
@@ -430,20 +430,20 @@ class FeaturesEnricher(TransformerMixin):
430
430
 
431
431
  def fit(
432
432
  self,
433
- X: Union[pd.DataFrame, pd.Series, np.ndarray],
434
- y: Union[pd.Series, np.ndarray, list],
435
- eval_set: Optional[Union[list[tuple], tuple]] = None,
433
+ X: pd.DataFrame | pd.Series | np.ndarray,
434
+ y: pd.Series | np.ndarray | list,
435
+ eval_set: list[tuple] | tuple | None = None,
436
436
  *args,
437
- exclude_features_sources: Optional[list[str]] = None,
438
- calculate_metrics: Optional[bool] = None,
439
- estimator: Optional[Any] = None,
440
- scoring: Union[Callable, str, None] = None,
441
- remove_outliers_calc_metrics: Optional[bool] = None,
442
- progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
443
- search_id_callback: Optional[Callable[[str], Any]] = None,
437
+ exclude_features_sources: list[str] | None = None,
438
+ calculate_metrics: bool | None = None,
439
+ estimator: Any | None = None,
440
+ scoring: Callable | str | None = None,
441
+ remove_outliers_calc_metrics: bool | None = None,
442
+ progress_callback: Callable[[SearchProgress], Any] | None = None,
443
+ search_id_callback: Callable[[str], Any] | None = None,
444
444
  select_features: bool = True,
445
- auto_fe_parameters: Optional[AutoFEParameters] = None,
446
- stability_threshold: float = 0.15,
445
+ auto_fe_parameters: AutoFEParameters | None = None,
446
+ stability_threshold: float = 0.2,
447
447
  stability_agg_func: str = "max",
448
448
  **kwargs,
449
449
  ):
@@ -479,7 +479,7 @@ class FeaturesEnricher(TransformerMixin):
479
479
  If True, return only selected features both from input and data sources.
480
480
  Otherwise, return all features from input and only selected features from data sources.
481
481
 
482
- stability_threshold: float, optional (default=0.15)
482
+ stability_threshold: float, optional (default=0.2)
483
483
  Stability threshold for selected features PSI calculation. If PSI is less than this threshold,
484
484
  then feature will be dropped.
485
485
 
@@ -579,27 +579,26 @@ class FeaturesEnricher(TransformerMixin):
579
579
 
580
580
  def fit_transform(
581
581
  self,
582
- X: Union[pd.DataFrame, pd.Series, np.ndarray],
583
- y: Union[pd.DataFrame, pd.Series, np.ndarray, list],
584
- eval_set: Optional[Union[list[tuple], tuple]] = None,
582
+ X: pd.DataFrame | pd.Series | np.ndarray,
583
+ y: pd.DataFrame | pd.Series | np.ndarray | list,
584
+ eval_set: list[tuple] | tuple | None = None,
585
585
  *args,
586
- exclude_features_sources: Optional[list[str]] = None,
587
- keep_input: bool = True,
588
- calculate_metrics: Optional[bool] = None,
589
- scoring: Union[Callable, str, None] = None,
590
- estimator: Optional[Any] = None,
591
- remove_outliers_calc_metrics: Optional[bool] = None,
592
- progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
586
+ exclude_features_sources: list[str] | None | None = None,
587
+ keep_input: bool | None = None,
588
+ calculate_metrics: bool | None = None,
589
+ scoring: Callable | str | None = None,
590
+ estimator: Any | None = None,
591
+ remove_outliers_calc_metrics: bool | None = None,
592
+ progress_callback: Callable[[SearchProgress], Any] | None = None,
593
593
  select_features: bool = True,
594
- auto_fe_parameters: Optional[AutoFEParameters] = None,
595
- stability_threshold: float = 0.15,
594
+ auto_fe_parameters: AutoFEParameters | None = None,
595
+ stability_threshold: float = 0.2,
596
596
  stability_agg_func: str = "max",
597
597
  **kwargs,
598
598
  ) -> pd.DataFrame:
599
599
  """Fit to data, then transform it.
600
600
 
601
601
  Fits transformer to `X` and `y` and returns a transformed version of `X`.
602
- If keep_input is True, then all input columns are copied to the output dataframe.
603
602
 
604
603
  Parameters
605
604
  ----------
@@ -613,7 +612,10 @@ class FeaturesEnricher(TransformerMixin):
613
612
  list of pairs (X, y) for validation.
614
613
 
615
614
  keep_input: bool, optional (default=True)
615
+ keep_input: bool, optional (default=None)
616
616
  If True, copy original input columns to the output dataframe.
617
+ If False, then only enriched columns are returned.
618
+ If None, then all search keys, ID columns, selected client features and enriched columns will be returned.
617
619
 
618
620
  estimator: sklearn-compatible estimator, optional (default=None)
619
621
  Custom estimator for metrics calculation.
@@ -629,7 +631,7 @@ class FeaturesEnricher(TransformerMixin):
629
631
  If True, return only selected features both from input and data sources.
630
632
  Otherwise, return all features from input and only selected features from data sources.
631
633
 
632
- stability_threshold: float, optional (default=0.15)
634
+ stability_threshold: float, optional (default=0.2)
633
635
  Stability threshold for selected features PSI calculation. If PSI is less than this threshold,
634
636
  then feature will be dropped.
635
637
 
@@ -747,28 +749,29 @@ class FeaturesEnricher(TransformerMixin):
747
749
  self,
748
750
  X: pd.DataFrame,
749
751
  *args,
750
- y: Optional[pd.Series] = None,
751
- exclude_features_sources: Optional[list[str]] = None,
752
- keep_input: bool = True,
753
- trace_id: Optional[str] = None,
752
+ y: pd.Series | None = None,
753
+ exclude_features_sources: list[str] | None = None,
754
+ keep_input: bool | None = None,
755
+ trace_id: str | None = None,
754
756
  metrics_calculation: bool = False,
755
757
  silent_mode=False,
756
- progress_bar: Optional[ProgressBar] = None,
757
- progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
758
+ progress_bar: ProgressBar | None = None,
759
+ progress_callback: Callable[[SearchProgress], Any] | None = None,
758
760
  **kwargs,
759
761
  ) -> pd.DataFrame:
760
762
  """Transform `X`.
761
763
 
762
764
  Returns a transformed version of `X`.
763
- If keep_input is True, then all input columns are copied to the output dataframe.
764
765
 
765
766
  Parameters
766
767
  ----------
767
768
  X: pandas.DataFrame of shape (n_samples, n_features)
768
769
  Input samples.
769
770
 
770
- keep_input: bool, optional (default=True)
771
+ keep_input: bool, optional (default=None)
771
772
  If True, copy original input columns to the output dataframe.
773
+ If False, then only enriched columns are returned.
774
+ If None, then all search keys, ID columns, selected client features and enriched columns will be returned.
772
775
 
773
776
  Returns
774
777
  -------
@@ -809,6 +812,7 @@ class FeaturesEnricher(TransformerMixin):
809
812
  metrics_calculation=metrics_calculation,
810
813
  silent_mode=silent_mode,
811
814
  progress_bar=progress_bar,
815
+ keep_input=keep_input,
812
816
  )
813
817
  self.logger.info("Transform finished successfully")
814
818
  search_progress = SearchProgress(100.0, ProgressStage.FINISHED)
@@ -850,30 +854,26 @@ class FeaturesEnricher(TransformerMixin):
850
854
  raise e
851
855
  finally:
852
856
  self.logger.info(f"Transform elapsed time: {time.time() - start_time}")
853
-
854
- if result is not None:
855
- if keep_input:
856
- return result
857
- else:
858
- return result.drop(columns=X.columns, errors="ignore")
857
+
858
+ return result
859
859
 
860
860
  def calculate_metrics(
861
861
  self,
862
- X: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
863
- y: Union[pd.DataFrame, pd.Series, np.ndarray, list, None] = None,
864
- eval_set: Optional[Union[list[tuple], tuple]] = None,
862
+ X: pd.DataFrame | pd.Series | np.ndarray | None = None,
863
+ y: pd.DataFrame | pd.Series | np.ndarray | list | None = None,
864
+ eval_set: list[tuple] | tuple | None = None,
865
865
  *args,
866
- scoring: Union[Callable, str, None] = None,
867
- cv: Union[BaseCrossValidator, CVType, None] = None,
866
+ scoring: Callable | str | None = None,
867
+ cv: BaseCrossValidator | CVType | str | None = None,
868
868
  estimator=None,
869
- exclude_features_sources: Optional[list[str]] = None,
870
- remove_outliers_calc_metrics: Optional[bool] = None,
871
- trace_id: Optional[str] = None,
869
+ exclude_features_sources: list[str] | None = None,
870
+ remove_outliers_calc_metrics: bool | None = None,
871
+ trace_id: str | None = None,
872
872
  internal_call: bool = False,
873
- progress_bar: Optional[ProgressBar] = None,
874
- progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
873
+ progress_bar: ProgressBar | None = None,
874
+ progress_callback: Callable[[SearchProgress], Any] | None = None,
875
875
  **kwargs,
876
- ) -> Optional[pd.DataFrame]:
876
+ ) -> pd.DataFrame | None:
877
877
  """Calculate metrics
878
878
 
879
879
  Parameters
@@ -1311,16 +1311,16 @@ class FeaturesEnricher(TransformerMixin):
1311
1311
  def _select_features_by_psi(
1312
1312
  self,
1313
1313
  trace_id: str,
1314
- X: Union[pd.DataFrame, pd.Series, np.ndarray],
1315
- y: Union[pd.DataFrame, pd.Series, np.ndarray, list],
1316
- eval_set: Optional[Union[list[tuple], tuple]],
1314
+ X: pd.DataFrame | pd.Series | np.ndarray,
1315
+ y: pd.DataFrame | pd.Series | np.ndarray | list,
1316
+ eval_set: list[tuple] | tuple | None,
1317
1317
  stability_threshold: float,
1318
1318
  stability_agg_func: Callable,
1319
- cv: Union[BaseCrossValidator, CVType, str, None] = None,
1319
+ cv: BaseCrossValidator | CVType | str | None = None,
1320
1320
  estimator=None,
1321
- exclude_features_sources: Optional[list[str]] = None,
1321
+ exclude_features_sources: list[str] | None = None,
1322
1322
  progress_bar: bool = True,
1323
- progress_callback: Optional[Callable] = None,
1323
+ progress_callback: Callable | None = None,
1324
1324
  ):
1325
1325
  search_keys = self.search_keys.copy()
1326
1326
  validated_X, _, validated_eval_set = self._validate_train_eval(X, y, eval_set, silent=True)
@@ -1469,7 +1469,7 @@ class FeaturesEnricher(TransformerMixin):
1469
1469
 
1470
1470
  unstable_by_sparsity = [feature for feature, psi in psi_values_sparse.items() if psi > stability_threshold]
1471
1471
  if unstable_by_sparsity:
1472
- self.logger.info(f"Unstable by sparsity features: {sorted(unstable_by_sparsity)}")
1472
+ self.logger.info(f"Unstable by sparsity features ({stability_threshold}): {sorted(unstable_by_sparsity)}")
1473
1473
 
1474
1474
  psi_values = calculate_features_psi(
1475
1475
  checking_eval_set_df, cat_features, date_column, self.logger, model_task_type, stability_agg_func
@@ -1479,7 +1479,7 @@ class FeaturesEnricher(TransformerMixin):
1479
1479
 
1480
1480
  unstable_by_value = [feature for feature, psi in psi_values.items() if psi > stability_threshold]
1481
1481
  if unstable_by_value:
1482
- self.logger.info(f"Unstable by value features: {sorted(unstable_by_value)}")
1482
+ self.logger.info(f"Unstable by value features ({stability_threshold}): {sorted(unstable_by_value)}")
1483
1483
 
1484
1484
  self.psi_values = {
1485
1485
  feature: psi_value for feature, psi_value in psi_values.items() if psi_value <= stability_threshold
@@ -1557,12 +1557,12 @@ class FeaturesEnricher(TransformerMixin):
1557
1557
  self.logger.warning(msg)
1558
1558
 
1559
1559
  def _has_features_with_commercial_schema(
1560
- self, commercial_schema: str, exclude_features_sources: Optional[list[str]]
1560
+ self, commercial_schema: str, exclude_features_sources: list[str] | None
1561
1561
  ) -> bool:
1562
1562
  return len(self._get_features_with_commercial_schema(commercial_schema, exclude_features_sources)) > 0
1563
1563
 
1564
1564
  def _get_features_with_commercial_schema(
1565
- self, commercial_schema: str, exclude_features_sources: Optional[list[str]]
1565
+ self, commercial_schema: str, exclude_features_sources: list[str] | None
1566
1566
  ) -> list[str]:
1567
1567
  if exclude_features_sources:
1568
1568
  filtered_features_info = self._internal_features_info[
@@ -1577,14 +1577,14 @@ class FeaturesEnricher(TransformerMixin):
1577
1577
  ].values
1578
1578
  )
1579
1579
 
1580
- def _has_paid_features(self, exclude_features_sources: Optional[list[str]]) -> bool:
1580
+ def _has_paid_features(self, exclude_features_sources: list[str] | None) -> bool:
1581
1581
  return self._has_features_with_commercial_schema(CommercialSchema.PAID.value, exclude_features_sources)
1582
1582
 
1583
1583
  def _is_input_same_as_fit(
1584
1584
  self,
1585
- X: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
1586
- y: Union[pd.DataFrame, pd.Series, np.ndarray, list, None] = None,
1587
- eval_set: Optional[list[tuple]] = None,
1585
+ X: pd.DataFrame | pd.Series | np.ndarray | None = None,
1586
+ y: pd.DataFrame | pd.Series | np.ndarray | list | None = None,
1587
+ eval_set: list[tuple] | None = None,
1588
1588
  ) -> tuple:
1589
1589
  if X is None:
1590
1590
  return True, self.X, self.y, self.eval_set
@@ -1615,9 +1615,9 @@ class FeaturesEnricher(TransformerMixin):
1615
1615
  def _get_cv_and_groups(
1616
1616
  self,
1617
1617
  X: pd.DataFrame,
1618
- cv_override: Union[BaseCrossValidator, CVType, str, None],
1618
+ cv_override: BaseCrossValidator | CVType | str | None,
1619
1619
  search_keys: dict[str, SearchKey],
1620
- ) -> tuple[BaseCrossValidator, Optional[np.ndarray]]:
1620
+ ) -> tuple[BaseCrossValidator, np.ndarray] | None:
1621
1621
  _cv = cv_override or self.cv
1622
1622
  group_columns = sorted(self._get_group_columns(X, search_keys))
1623
1623
  groups = None
@@ -1645,8 +1645,8 @@ class FeaturesEnricher(TransformerMixin):
1645
1645
  return _cv, groups
1646
1646
 
1647
1647
  def _get_and_validate_client_cat_features(
1648
- self, estimator: Optional[Any], X: pd.DataFrame, search_keys: dict[str, SearchKey]
1649
- ) -> tuple[Optional[list[str]], list[str]]:
1648
+ self, estimator: Any | None, X: pd.DataFrame, search_keys: dict[str, SearchKey]
1649
+ ) -> tuple[list[str] | None, list[str]]:
1650
1650
  cat_features = []
1651
1651
  search_keys_for_metrics = []
1652
1652
  if (
@@ -1678,16 +1678,16 @@ class FeaturesEnricher(TransformerMixin):
1678
1678
  def _get_cached_enriched_data(
1679
1679
  self,
1680
1680
  trace_id: str,
1681
- X: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
1682
- y: Union[pd.DataFrame, pd.Series, np.ndarray, list, None] = None,
1683
- eval_set: Optional[Union[list[tuple], tuple]] = None,
1684
- exclude_features_sources: Optional[list[str]] = None,
1685
- remove_outliers_calc_metrics: Optional[bool] = None,
1686
- cv_override: Union[BaseCrossValidator, CVType, str, None] = None,
1687
- search_keys_for_metrics: Optional[list[str]] = None,
1688
- progress_bar: Optional[ProgressBar] = None,
1689
- progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
1690
- client_cat_features: Optional[list[str]] = None,
1681
+ X: pd.DataFrame | pd.Series | np.ndarray | None = None,
1682
+ y: pd.DataFrame | pd.Series | np.ndarray | list | None = None,
1683
+ eval_set: list[tuple] | tuple | None = None,
1684
+ exclude_features_sources: list[str] | None = None,
1685
+ remove_outliers_calc_metrics: bool | None = None,
1686
+ cv_override: BaseCrossValidator | CVType | str | None = None,
1687
+ search_keys_for_metrics: list[str] | None = None,
1688
+ progress_bar: ProgressBar | None = None,
1689
+ progress_callback: Callable[[SearchProgress], Any] | None = None,
1690
+ client_cat_features: list[str] | None = None,
1691
1691
  is_for_metrics: bool = False,
1692
1692
  ):
1693
1693
  is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
@@ -1893,15 +1893,15 @@ class FeaturesEnricher(TransformerMixin):
1893
1893
  def _get_enriched_datasets(
1894
1894
  self,
1895
1895
  trace_id: str,
1896
- validated_X: Union[pd.DataFrame, pd.Series, np.ndarray, None],
1897
- validated_y: Union[pd.DataFrame, pd.Series, np.ndarray, list, None],
1898
- eval_set: Optional[list[tuple]],
1899
- exclude_features_sources: Optional[list[str]],
1896
+ validated_X: pd.DataFrame | pd.Series | np.ndarray | None,
1897
+ validated_y: pd.DataFrame | pd.Series | np.ndarray | list | None,
1898
+ eval_set: list[tuple] | None,
1899
+ exclude_features_sources: list[str] | None,
1900
1900
  is_input_same_as_fit: bool,
1901
1901
  is_demo_dataset: bool,
1902
- remove_outliers_calc_metrics: Optional[bool],
1903
- progress_bar: Optional[ProgressBar],
1904
- progress_callback: Optional[Callable[[SearchProgress], Any]],
1902
+ remove_outliers_calc_metrics: bool | None,
1903
+ progress_bar: ProgressBar | None,
1904
+ progress_callback: Callable[[SearchProgress], Any] | None,
1905
1905
  is_for_metrics: bool = False,
1906
1906
  ) -> _EnrichedDataForMetrics:
1907
1907
  datasets_hash = hash_input(validated_X, validated_y, eval_set)
@@ -1939,7 +1939,7 @@ class FeaturesEnricher(TransformerMixin):
1939
1939
  )
1940
1940
 
1941
1941
  def __get_sampled_cached_enriched(
1942
- self, datasets_hash: str, exclude_features_sources: Optional[list[str]]
1942
+ self, datasets_hash: str, exclude_features_sources: list[str] | None
1943
1943
  ) -> _EnrichedDataForMetrics:
1944
1944
  X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming, generated_features = (
1945
1945
  self.__cached_sampled_datasets[datasets_hash]
@@ -1959,7 +1959,7 @@ class FeaturesEnricher(TransformerMixin):
1959
1959
  )
1960
1960
 
1961
1961
  def __get_enriched_as_input(
1962
- self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[list[tuple]], is_demo_dataset: bool
1962
+ self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: list[tuple] | None, is_demo_dataset: bool
1963
1963
  ) -> _EnrichedDataForMetrics:
1964
1964
  eval_set_sampled_dict = {}
1965
1965
 
@@ -2055,9 +2055,9 @@ class FeaturesEnricher(TransformerMixin):
2055
2055
 
2056
2056
  def __get_enriched_from_fit(
2057
2057
  self,
2058
- eval_set: Optional[list[tuple]],
2058
+ eval_set: list[tuple] | None,
2059
2059
  trace_id: str,
2060
- remove_outliers_calc_metrics: Optional[bool],
2060
+ remove_outliers_calc_metrics: bool | None,
2061
2061
  ) -> _EnrichedDataForMetrics:
2062
2062
  eval_set_sampled_dict = {}
2063
2063
  search_keys = self.fit_search_keys.copy()
@@ -2163,11 +2163,11 @@ class FeaturesEnricher(TransformerMixin):
2163
2163
  self,
2164
2164
  validated_X: pd.DataFrame,
2165
2165
  validated_y: pd.Series,
2166
- eval_set: Optional[list[tuple]],
2167
- exclude_features_sources: Optional[list[str]],
2166
+ eval_set: list[tuple] | None,
2167
+ exclude_features_sources: list[str] | None,
2168
2168
  trace_id: str,
2169
- progress_bar: Optional[ProgressBar],
2170
- progress_callback: Optional[Callable[[SearchProgress], Any]],
2169
+ progress_bar: ProgressBar | None,
2170
+ progress_callback: Callable[[SearchProgress], Any] | None,
2171
2171
  is_for_metrics: bool = False,
2172
2172
  ) -> _EnrichedDataForMetrics:
2173
2173
  has_eval_set = eval_set is not None
@@ -2231,7 +2231,7 @@ class FeaturesEnricher(TransformerMixin):
2231
2231
  )
2232
2232
 
2233
2233
  def __combine_train_and_eval_sets(
2234
- self, X: pd.DataFrame, y: Optional[pd.Series] = None, eval_set: Optional[list[tuple]] = None
2234
+ self, X: pd.DataFrame, y: pd.Series | None = None, eval_set: list[tuple] | None = None
2235
2235
  ) -> pd.DataFrame:
2236
2236
  df = X.copy()
2237
2237
  if y is not None:
@@ -2354,7 +2354,7 @@ class FeaturesEnricher(TransformerMixin):
2354
2354
  generated_features=generated_features,
2355
2355
  )
2356
2356
 
2357
- def get_search_id(self) -> Optional[str]:
2357
+ def get_search_id(self) -> str | None:
2358
2358
  """Returns search_id of the fitted enricher. Not available before a successful fit."""
2359
2359
  return self._search_task.search_task_id if self._search_task else None
2360
2360
 
@@ -2367,7 +2367,7 @@ class FeaturesEnricher(TransformerMixin):
2367
2367
 
2368
2368
  return self.features_info
2369
2369
 
2370
- def get_progress(self, trace_id: Optional[str] = None, search_task: Optional[SearchTask] = None) -> SearchProgress:
2370
+ def get_progress(self, trace_id: str | None = None, search_task: SearchTask | None = None) -> SearchProgress:
2371
2371
  search_task = search_task or self._search_task
2372
2372
  if search_task is not None:
2373
2373
  trace_id = trace_id or uuid.uuid4()
@@ -2475,13 +2475,14 @@ if response.status_code == 200:
2475
2475
  trace_id: str,
2476
2476
  X: pd.DataFrame,
2477
2477
  *,
2478
- y: Optional[pd.Series] = None,
2479
- exclude_features_sources: Optional[list[str]] = None,
2478
+ y: pd.Series | None = None,
2479
+ exclude_features_sources: list[str] | None = None,
2480
2480
  metrics_calculation: bool = False,
2481
2481
  silent_mode: bool = False,
2482
- progress_bar: Optional[ProgressBar] = None,
2483
- progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
2482
+ progress_bar: ProgressBar | None = None,
2483
+ progress_callback: Callable[[SearchProgress], Any] | None = None,
2484
2484
  add_fit_system_record_id: bool = False,
2485
+ keep_input: bool | None = None,
2485
2486
  ) -> tuple[pd.DataFrame, dict[str, str], list[str], dict[str, SearchKey]]:
2486
2487
  if self._search_task is None:
2487
2488
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
@@ -2775,7 +2776,7 @@ if response.status_code == 200:
2775
2776
  progress_bar.progress = progress.to_progress_bar()
2776
2777
  if progress_callback is not None:
2777
2778
  progress_callback(progress)
2778
- prev_progress: Optional[SearchProgress] = None
2779
+ prev_progress: SearchProgress | None = None
2779
2780
  polling_period_seconds = 1
2780
2781
  try:
2781
2782
  while progress.stage != ProgressStage.DOWNLOADING.value:
@@ -2833,18 +2834,25 @@ if response.status_code == 200:
2833
2834
  selected_generated_features = [
2834
2835
  c for c in generated_features if not self.fit_select_features or c in self.feature_names_
2835
2836
  ]
2836
- selected_input_columns = [
2837
- c
2838
- for c in validated_Xy.columns
2839
- if not self.fit_select_features
2840
- or c in self.feature_names_
2841
- or c in self.search_keys
2842
- or c in (self.id_columns or [])
2843
- or c in [EVAL_SET_INDEX, TARGET] # transform for metrics calculation
2844
- ]
2837
+ if keep_input is None:
2838
+ selected_input_columns = [
2839
+ c
2840
+ for c in validated_Xy.columns
2841
+ if not self.fit_select_features
2842
+ or c in self.feature_names_
2843
+ or c in self.search_keys
2844
+ or c in (self.id_columns or [])
2845
+ or c in [EVAL_SET_INDEX, TARGET] # transform for metrics calculation
2846
+ ]
2847
+ elif keep_input is True:
2848
+ selected_input_columns = validated_Xy.columns.to_list()
2849
+ else:
2850
+ selected_input_columns = []
2851
+
2845
2852
  selecting_columns = selected_input_columns + selected_generated_features
2846
2853
  selecting_columns.extend(
2847
- c for c in result.columns if c in self.feature_names_ and c not in selecting_columns
2854
+ c for c in result.columns
2855
+ if c in self.feature_names_ and c not in selecting_columns and c not in validated_Xy.columns
2848
2856
  )
2849
2857
  if add_fit_system_record_id:
2850
2858
  selecting_columns.append(SORT_ID)
@@ -2871,7 +2879,7 @@ if response.status_code == 200:
2871
2879
 
2872
2880
  return result, columns_renaming, generated_features, search_keys
2873
2881
 
2874
- def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id: Optional[str] = None):
2882
+ def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id: str | None = None):
2875
2883
  if (search_keys is None or len(search_keys) == 0) and self.country_code is None:
2876
2884
  if search_id:
2877
2885
  self.logger.debug(f"search_id {search_id} provided without search_keys")
@@ -2944,22 +2952,22 @@ if response.status_code == 200:
2944
2952
  def __inner_fit(
2945
2953
  self,
2946
2954
  trace_id: str,
2947
- X: Union[pd.DataFrame, pd.Series, np.ndarray],
2948
- y: Union[pd.DataFrame, pd.Series, np.ndarray, list, None],
2949
- eval_set: Optional[list[tuple]],
2950
- progress_bar: Optional[ProgressBar],
2955
+ X: pd.DataFrame | pd.Series | np.ndarray,
2956
+ y: pd.DataFrame | pd.Series | np.ndarray | list | None,
2957
+ eval_set: list[tuple] | None,
2958
+ progress_bar: ProgressBar | None,
2951
2959
  start_time: int,
2952
2960
  *,
2953
- exclude_features_sources: Optional[list[str]] = None,
2954
- calculate_metrics: Optional[bool],
2955
- scoring: Union[Callable, str, None],
2956
- estimator: Optional[Any],
2961
+ exclude_features_sources: list[str] | None = None,
2962
+ calculate_metrics: bool | None,
2963
+ scoring: Callable | str | None,
2964
+ estimator: Any | None,
2957
2965
  stability_threshold: float,
2958
2966
  stability_agg_func: str,
2959
- remove_outliers_calc_metrics: Optional[bool],
2967
+ remove_outliers_calc_metrics: bool | None,
2960
2968
  auto_fe_parameters: AutoFEParameters,
2961
- progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
2962
- search_id_callback: Optional[Callable[[str], Any]] = None,
2969
+ progress_callback: Callable[[SearchProgress], Any] | None = None,
2970
+ search_id_callback: Callable[[str], Any] | None = None,
2963
2971
  ):
2964
2972
  self._search_task = None
2965
2973
  self.warning_counter.reset()
@@ -3378,16 +3386,6 @@ if response.status_code == 200:
3378
3386
 
3379
3387
  self.__show_selected_features()
3380
3388
 
3381
- autofe_description = self.get_autofe_features_description()
3382
- if autofe_description is not None and len(autofe_description) > 0:
3383
- self.logger.info(f"AutoFE descriptions: {autofe_description}")
3384
- self.autofe_features_display_handle = display_html_dataframe(
3385
- df=autofe_description,
3386
- internal_df=autofe_description,
3387
- header=self.bundle.get("autofe_descriptions_header"),
3388
- display_id=f"autofe_descriptions_{uuid.uuid4()}",
3389
- )
3390
-
3391
3389
  if self._has_paid_features(exclude_features_sources):
3392
3390
  if calculate_metrics is not None and calculate_metrics:
3393
3391
  msg = self.bundle.get("metrics_with_paid_features")
@@ -3466,7 +3464,7 @@ if response.status_code == 200:
3466
3464
  def __should_add_date_column(self):
3467
3465
  return self.add_date_if_missing or (self.cv is not None and self.cv.is_time_series())
3468
3466
 
3469
- def __get_renamed_id_columns(self, renaming: Optional[dict[str, str]] = None):
3467
+ def __get_renamed_id_columns(self, renaming: dict[str, str] | None = None):
3470
3468
  renaming = renaming or self.fit_columns_renaming
3471
3469
  reverse_renaming = {v: k for k, v in renaming.items()}
3472
3470
  return None if self.id_columns is None else [reverse_renaming.get(c) or c for c in self.id_columns]
@@ -3511,11 +3509,11 @@ if response.status_code == 200:
3511
3509
  def _validate_train_eval(
3512
3510
  self,
3513
3511
  X: pd.DataFrame,
3514
- y: Optional[pd.Series] = None,
3515
- eval_set: Optional[list[tuple[pd.DataFrame, pd.Series]]] = None,
3512
+ y: pd.Series | None = None,
3513
+ eval_set: list[tuple[pd.DataFrame, pd.Series]] | None = None,
3516
3514
  is_transform: bool = False,
3517
3515
  silent: bool = False,
3518
- ) -> tuple[pd.DataFrame, pd.Series, Optional[list[tuple[pd.DataFrame, pd.Series]]]]:
3516
+ ) -> tuple[pd.DataFrame, pd.Series, list[tuple[pd.DataFrame, pd.Series]]] | None:
3519
3517
  validated_X = self._validate_X(X, is_transform)
3520
3518
  validated_y = self._validate_y(validated_X, y, enforce_y=not is_transform)
3521
3519
  validated_eval_set = self._validate_eval_set(validated_X, eval_set, silent=silent)
@@ -3594,7 +3592,7 @@ if response.status_code == 200:
3594
3592
 
3595
3593
  return validated_X
3596
3594
 
3597
- def _validate_y(self, X: pd.DataFrame, y, enforce_y: bool = True) -> Optional[pd.Series]:
3595
+ def _validate_y(self, X: pd.DataFrame, y, enforce_y: bool = True) -> pd.Series | None:
3598
3596
  if y is None and not enforce_y:
3599
3597
  return None
3600
3598
  if (
@@ -3644,7 +3642,7 @@ if response.status_code == 200:
3644
3642
  return validated_y
3645
3643
 
3646
3644
  def _validate_eval_set(
3647
- self, X: pd.DataFrame, eval_set: Optional[list[tuple[pd.DataFrame, pd.Series]]], silent: bool = False
3645
+ self, X: pd.DataFrame, eval_set: list[tuple[pd.DataFrame, pd.Series]] | None, silent: bool = False
3648
3646
  ):
3649
3647
  if eval_set is None:
3650
3648
  return None
@@ -3756,7 +3754,7 @@ if response.status_code == 200:
3756
3754
 
3757
3755
  return validated_eval_X, validated_eval_y
3758
3756
 
3759
- def _validate_baseline_score(self, X: pd.DataFrame, eval_set: Optional[list[tuple]]):
3757
+ def _validate_baseline_score(self, X: pd.DataFrame, eval_set: list[tuple] | None):
3760
3758
  if self.baseline_score_column is not None:
3761
3759
  if self.baseline_score_column not in X.columns:
3762
3760
  raise ValidationError(
@@ -3783,7 +3781,7 @@ if response.status_code == 200:
3783
3781
 
3784
3782
  @staticmethod
3785
3783
  def _sort_by_system_record_id(
3786
- X: pd.DataFrame, y: pd.Series, cv: Optional[CVType]
3784
+ X: pd.DataFrame, y: pd.Series, cv: CVType | None
3787
3785
  ) -> tuple[pd.DataFrame, pd.Series]:
3788
3786
  if cv not in [CVType.time_series, CVType.blocked_time_series]:
3789
3787
  record_id_column = ENTITY_SYSTEM_RECORD_ID if ENTITY_SYSTEM_RECORD_ID in X else SYSTEM_RECORD_ID
@@ -3801,7 +3799,7 @@ if response.status_code == 200:
3801
3799
  # Deprecated
3802
3800
  @staticmethod
3803
3801
  def _sort_by_keys(
3804
- X: pd.DataFrame, y: pd.Series, search_keys: dict[str, SearchKey], cv: Optional[CVType]
3802
+ X: pd.DataFrame, y: pd.Series, search_keys: dict[str, SearchKey], cv: CVType | None
3805
3803
  ) -> tuple[pd.DataFrame, pd.Series]:
3806
3804
  if cv not in [CVType.time_series, CVType.blocked_time_series]:
3807
3805
  if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
@@ -3841,14 +3839,14 @@ if response.status_code == 200:
3841
3839
  def __log_debug_information(
3842
3840
  self,
3843
3841
  X: pd.DataFrame,
3844
- y: Union[pd.Series, np.ndarray, list, None] = None,
3845
- eval_set: Optional[list[tuple]] = None,
3846
- exclude_features_sources: Optional[list[str]] = None,
3847
- calculate_metrics: Optional[bool] = None,
3848
- cv: Optional[Any] = None,
3849
- scoring: Optional[Any] = None,
3850
- estimator: Optional[Any] = None,
3851
- remove_outliers_calc_metrics: Optional[bool] = None,
3842
+ y: pd.Series | np.ndarray | list | None = None,
3843
+ eval_set: list[tuple] | None = None,
3844
+ exclude_features_sources: list[str] | None = None,
3845
+ calculate_metrics: bool | None = None,
3846
+ cv: Any | None = None,
3847
+ scoring: Any | None = None,
3848
+ estimator: Any | None = None,
3849
+ remove_outliers_calc_metrics: bool | None = None,
3852
3850
  ):
3853
3851
  try:
3854
3852
  resolved_api_key = self.api_key or os.environ.get(UPGINI_API_KEY)
@@ -3973,7 +3971,7 @@ if response.status_code == 200:
3973
3971
  ]
3974
3972
 
3975
3973
  @staticmethod
3976
- def _get_email_column(search_keys: dict[str, SearchKey]) -> Optional[str]:
3974
+ def _get_email_column(search_keys: dict[str, SearchKey]) -> str | None:
3977
3975
  cols = [col for col, t in search_keys.items() if t == SearchKey.EMAIL]
3978
3976
  if len(cols) > 1:
3979
3977
  raise Exception("More than one email column found after unnest")
@@ -3981,7 +3979,7 @@ if response.status_code == 200:
3981
3979
  return cols[0]
3982
3980
 
3983
3981
  @staticmethod
3984
- def _get_hem_column(search_keys: dict[str, SearchKey]) -> Optional[str]:
3982
+ def _get_hem_column(search_keys: dict[str, SearchKey]) -> str | None:
3985
3983
  cols = [col for col, t in search_keys.items() if t == SearchKey.HEM]
3986
3984
  if len(cols) > 1:
3987
3985
  raise Exception("More than one hem column found after unnest")
@@ -3989,7 +3987,7 @@ if response.status_code == 200:
3989
3987
  return cols[0]
3990
3988
 
3991
3989
  @staticmethod
3992
- def _get_ip_column(search_keys: dict[str, SearchKey]) -> Optional[str]:
3990
+ def _get_ip_column(search_keys: dict[str, SearchKey]) -> str | None:
3993
3991
  cols = [col for col, t in search_keys.items() if t == SearchKey.IP]
3994
3992
  if len(cols) > 1:
3995
3993
  raise Exception("More than one ip column found after unnest")
@@ -3997,25 +3995,25 @@ if response.status_code == 200:
3997
3995
  return cols[0]
3998
3996
 
3999
3997
  @staticmethod
4000
- def _get_phone_column(search_keys: dict[str, SearchKey]) -> Optional[str]:
3998
+ def _get_phone_column(search_keys: dict[str, SearchKey]) -> str | None:
4001
3999
  for col, t in search_keys.items():
4002
4000
  if t == SearchKey.PHONE:
4003
4001
  return col
4004
4002
 
4005
4003
  @staticmethod
4006
- def _get_country_column(search_keys: dict[str, SearchKey]) -> Optional[str]:
4004
+ def _get_country_column(search_keys: dict[str, SearchKey]) -> str | None:
4007
4005
  for col, t in search_keys.items():
4008
4006
  if t == SearchKey.COUNTRY:
4009
4007
  return col
4010
4008
 
4011
4009
  @staticmethod
4012
- def _get_postal_column(search_keys: dict[str, SearchKey]) -> Optional[str]:
4010
+ def _get_postal_column(search_keys: dict[str, SearchKey]) -> str | None:
4013
4011
  for col, t in search_keys.items():
4014
4012
  if t == SearchKey.POSTAL_CODE:
4015
4013
  return col
4016
4014
 
4017
4015
  @staticmethod
4018
- def _get_date_column(search_keys: dict[str, SearchKey]) -> Optional[str]:
4016
+ def _get_date_column(search_keys: dict[str, SearchKey]) -> str | None:
4019
4017
  return SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
4020
4018
 
4021
4019
  def _explode_multiple_search_keys(
@@ -4062,10 +4060,10 @@ if response.status_code == 200:
4062
4060
  id_name: str,
4063
4061
  target_name: str,
4064
4062
  columns_renaming: dict[str, str],
4065
- id_columns: Optional[list[str]],
4066
- cv: Optional[CVType],
4063
+ id_columns: list[str] | None,
4064
+ cv: CVType | None,
4067
4065
  model_task_type: ModelTaskType,
4068
- logger: Optional[logging.Logger] = None,
4066
+ logger: logging.Logger | None = None,
4069
4067
  bundle: ResourceBundle = bundle,
4070
4068
  ) -> pd.DataFrame:
4071
4069
  original_index_name = df.index.name
@@ -4201,7 +4199,7 @@ if response.status_code == 200:
4201
4199
  def __enrich(
4202
4200
  self,
4203
4201
  input_df: pd.DataFrame,
4204
- result_features: Optional[pd.DataFrame],
4202
+ result_features: pd.DataFrame | None,
4205
4203
  how: str = "inner",
4206
4204
  drop_system_record_id=True,
4207
4205
  ) -> pd.DataFrame:
@@ -4320,7 +4318,7 @@ if response.status_code == 200:
4320
4318
  self,
4321
4319
  trace_id: str,
4322
4320
  clients_features_df: pd.DataFrame,
4323
- updated_shaps: Optional[dict[str, float]] = None,
4321
+ updated_shaps: dict[str, float] | None = None,
4324
4322
  update_selected_features: bool = True,
4325
4323
  silent=False,
4326
4324
  ):
@@ -4659,12 +4657,12 @@ if response.status_code == 200:
4659
4657
 
4660
4658
  def __show_metrics(
4661
4659
  self,
4662
- scoring: Union[Callable, str, None],
4663
- estimator: Optional[Any],
4664
- remove_outliers_calc_metrics: Optional[bool],
4660
+ scoring: Callable | str | None,
4661
+ estimator: Any | None,
4662
+ remove_outliers_calc_metrics: bool | None,
4665
4663
  trace_id: str,
4666
- progress_bar: Optional[ProgressBar] = None,
4667
- progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
4664
+ progress_bar: ProgressBar | None = None,
4665
+ progress_callback: Callable[[SearchProgress], Any] | None = None,
4668
4666
  ):
4669
4667
  self.metrics = self.calculate_metrics(
4670
4668
  scoring=scoring,
@@ -4698,13 +4696,23 @@ if response.status_code == 200:
4698
4696
  self.bundle.get("relevant_data_sources_header"),
4699
4697
  display_id=f"data_sources_{uuid.uuid4()}",
4700
4698
  )
4699
+
4700
+ autofe_description = self.get_autofe_features_description()
4701
+ if autofe_description is not None and len(autofe_description) > 0:
4702
+ self.logger.info(f"AutoFE descriptions: {autofe_description}")
4703
+ self.autofe_features_display_handle = display_html_dataframe(
4704
+ df=autofe_description,
4705
+ internal_df=autofe_description,
4706
+ header=self.bundle.get("autofe_descriptions_header"),
4707
+ display_id=f"autofe_descriptions_{uuid.uuid4()}",
4708
+ )
4701
4709
  else:
4702
4710
  msg = self.bundle.get("features_info_zero_important_features")
4703
4711
  self.__log_warning(msg, show_support_link=True)
4704
4712
  except (ImportError, NameError):
4705
4713
  print(self._internal_features_info)
4706
4714
 
4707
- def __show_report_button(self, display_id: Optional[str] = None, display_handle=None):
4715
+ def __show_report_button(self, display_id: str | None = None, display_handle=None):
4708
4716
  try:
4709
4717
  return prepare_and_show_report(
4710
4718
  relevant_features_df=self._features_info_without_links,
@@ -4844,7 +4852,7 @@ if response.status_code == 200:
4844
4852
  except Exception:
4845
4853
  self.logger.exception("Failed to dump python libs")
4846
4854
 
4847
- def __display_support_link(self, link_text: Optional[str] = None):
4855
+ def __display_support_link(self, link_text: str | None = None):
4848
4856
  support_link = self.bundle.get("support_link")
4849
4857
  link_text = link_text or self.bundle.get("support_text")
4850
4858
  try:
@@ -4871,9 +4879,9 @@ if response.status_code == 200:
4871
4879
  def dump_input(
4872
4880
  self,
4873
4881
  trace_id: str,
4874
- X: Union[pd.DataFrame, pd.Series],
4875
- y: Union[pd.DataFrame, pd.Series, None] = None,
4876
- eval_set: Union[tuple, None] = None,
4882
+ X: pd.DataFrame | pd.Series,
4883
+ y: pd.DataFrame | pd.Series | None = None,
4884
+ eval_set: tuple | None = None,
4877
4885
  ):
4878
4886
  def dump_task(X_, y_, eval_set_):
4879
4887
  with MDC(trace_id=trace_id):
@@ -4964,7 +4972,7 @@ def is_frames_equal(first, second, bundle: ResourceBundle) -> bool:
4964
4972
  raise ValidationError(bundle.get("x_and_eval_x_diff_types").format(type(first), type(second)))
4965
4973
 
4966
4974
 
4967
- def drop_duplicates(df: Union[pd.DataFrame, np.ndarray, Any]) -> pd.DataFrame:
4975
+ def drop_duplicates(df: pd.DataFrame | np.ndarray | Any) -> pd.DataFrame:
4968
4976
  if isinstance(df, pd.DataFrame):
4969
4977
  return df.drop_duplicates()
4970
4978
  elif isinstance(df, np.ndarray):
upgini/http.py CHANGED
@@ -413,43 +413,9 @@ class _RestClient:
413
413
  with open(path, "rb") as file:
414
414
  files = {"file": (file_name, file, "application/octet-stream")}
415
415
  self._with_unauth_retry(
416
- lambda: self._send_post_file_req_v2(
417
- api_path, files, trace_id=trace_id, need_json_response=False
418
- )
416
+ lambda: self._send_post_file_req_v2(api_path, files, trace_id=trace_id, need_json_response=False)
419
417
  )
420
418
 
421
- def dump_input_files(
422
- self,
423
- trace_id: str,
424
- x_path: str,
425
- y_path: Optional[str] = None,
426
- eval_x_path: Optional[str] = None,
427
- eval_y_path: Optional[str] = None,
428
- ):
429
- api_path = self.SEARCH_DUMP_INPUT_FILE_FMT
430
-
431
- def upload_with_check(path: str, file_name: str):
432
- digest_sha256 = file_hash(path)
433
- if self.is_file_uploaded(trace_id, digest_sha256):
434
- # print(f"File {path} was already uploaded with digest {digest_sha256}, skipping")
435
- return
436
- else:
437
- with open(path, "rb") as file:
438
- files = {"file": (file_name, file, "application/octet-stream")}
439
- self._with_unauth_retry(
440
- lambda: self._send_post_file_req_v2(
441
- api_path, files, trace_id=trace_id, need_json_response=False
442
- )
443
- )
444
-
445
- upload_with_check(x_path, "x.parquet")
446
- if y_path:
447
- upload_with_check(y_path, "y.parquet")
448
- if eval_x_path:
449
- upload_with_check(eval_x_path, "eval_x.parquet")
450
- if eval_y_path:
451
- upload_with_check(eval_y_path, "eval_y.parquet")
452
-
453
419
  def initial_search_v2(
454
420
  self,
455
421
  trace_id: str,
@@ -1080,6 +1046,7 @@ class LoggerFactory:
1080
1046
 
1081
1047
  upgini_logger = logging.getLogger(f"upgini.{hash(key)}")
1082
1048
  upgini_logger.handlers.clear()
1049
+ upgini_logger.propagate = False # Prevent duplicate logging in Jupyter notebooks
1083
1050
  rest_client = get_rest_client(backend_url, api_token, client_ip, client_visitorid)
1084
1051
  datadog_handler = BackendLogHandler(rest_client, client_ip, client_visitorid)
1085
1052
  json_formatter = jsonlogger.JsonFormatter(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.114a5
3
+ Version: 1.2.115a1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -749,6 +749,36 @@ enricher.fit(
749
749
  )
750
750
  ```
751
751
 
752
+ ### Control feature stability with PSI parameters
753
+
754
+ `FeaturesEnricher` supports Population Stability Index (PSI) calculation on eval_set to evaluate feature stability over time. You can control this behavior using stability parameters in `fit` and `fit_transform` methods:
755
+
756
+ ```python
757
+ enricher = FeaturesEnricher(
758
+ search_keys={"registration_date": SearchKey.DATE}
759
+ )
760
+
761
+ # Control feature stability during fit
762
+ enricher.fit(
763
+ X, y,
764
+ stability_threshold=0.2, # PSI threshold: features with PSI above this value will be dropped
765
+ stability_agg_func="max" # Aggregation function for stability values: "max", "min", "mean"
766
+ )
767
+
768
+ # Same parameters work for fit_transform
769
+ enriched_df = enricher.fit_transform(
770
+ X, y,
771
+ stability_threshold=0.1, # Stricter threshold for more stable features
772
+ stability_agg_func="mean" # Use mean aggregation instead of max
773
+ )
774
+ ```
775
+
776
+ **Stability parameters:**
777
+ - `stability_threshold` (float, default=0.2): PSI threshold value. Features with PSI above this threshold will be excluded from the final feature set. Lower values mean stricter stability requirements.
778
+ - `stability_agg_func` (str, default="max"): Function to aggregate PSI values across time intervals. Options: "max" (most conservative), "min" (least conservative), "mean" (balanced approach).
779
+
780
+ **PSI (Population Stability Index)** measures how much feature distribution changes over time. Lower PSI values indicate more stable features, which are generally more reliable for production ML models.
781
+
752
782
  ### Use custom loss function in feature selection & metrics calculation
753
783
 
754
784
  `FeaturesEnricher` can be initialized with additional string parameter `loss`.
@@ -1,10 +1,10 @@
1
- upgini/__about__.py,sha256=iwE4cHR_k5DhwOJ-LokS39KSTP25EzJzsOu-OvIvCHA,26
1
+ upgini/__about__.py,sha256=bgpppKWVKHgLo8IRKBM8YYuR4qMETYP4hSkfrlgcwgU,26
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=pQ8JQe0cdygD-W9GefJmfE6bnj4EYzXsjlgWdIS9nS8,31578
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=YHfuasHkeUZqexG46fOFviPv7SYuavT6Bk0HGsHAnqs,228847
7
- upgini/http.py,sha256=OuvlySDc8CRgDvB8o1lqRbmsp2Gi1WWRRTYElu1D5nc,45531
6
+ upgini/features_enricher.py,sha256=Ogye2TBqV-k1Znbc3ffmRKwsff00P7ea5useiuOsZIc,228799
7
+ upgini/http.py,sha256=-J_wOpnwVnT0ebPC6sOs6fN3AWtCD0LJLu6nlYmxaqk,44348
8
8
  upgini/metadata.py,sha256=VzgtgEbPPtNxTrj9LM5qSDP3DujHwAXqbUSKBjPcb9c,12477
9
9
  upgini/metrics.py,sha256=gjJDtlV6JrhUJumbNipdzjY4ojEupHGPihb9_VxjtWc,45939
10
10
  upgini/search_task.py,sha256=SAiUd1AytbA2Q6PSnnztr7oTRKpud1wQZ5YtKjsmQHU,18256
@@ -74,7 +74,7 @@ upgini/utils/target_utils.py,sha256=GCPn4QeJ83JJ_vyBJ3IhY5fyIRkLC9q9BE59S2FRO1I,
74
74
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
75
75
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
76
76
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
77
- upgini-1.2.114a5.dist-info/METADATA,sha256=TLzfyEHQdjQYUgRQhHilQdMtpIIOreYLC8Tb0SqQIwg,49197
78
- upgini-1.2.114a5.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
79
- upgini-1.2.114a5.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
80
- upgini-1.2.114a5.dist-info/RECORD,,
77
+ upgini-1.2.115a1.dist-info/METADATA,sha256=_aYBoX0V8yPCap7XV0FQp_5_RIJWH1aVe0S5jDvDGXw,50695
78
+ upgini-1.2.115a1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
79
+ upgini-1.2.115a1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
80
+ upgini-1.2.115a1.dist-info/RECORD,,