upgini 1.2.114a5__py3-none-any.whl → 1.2.116__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
- __version__ = "1.2.114a5"
+ __version__ = "1.2.116"
upgini/features_enricher.py CHANGED
@@ -12,7 +12,7 @@ from collections import Counter
  from copy import deepcopy
  from dataclasses import dataclass
  from threading import Thread
- from typing import Any, Callable, Optional, Set, Union
+ from typing import Any, Callable
 
  import numpy as np
  import pandas as pd
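
The bulk of this release is a mechanical modernization: `typing.Optional[...]` and `typing.Union[...]` annotations are rewritten as PEP 604 unions (`X | None`), which, without `from __future__ import annotations`, requires Python 3.10+. A minimal illustration of the equivalence (a generic sketch, not code from the package):

```python
from typing import Optional, Union

# Spelling used in 1.2.114a5:
def fetch(url: str, timeout: Optional[float] = None) -> Union[bytes, None]: ...

# PEP 604 spelling used in 1.2.116 -- identical semantics on Python 3.10+:
def fetch(url: str, timeout: float | None = None) -> bytes | None: ...
```
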
@@ -207,34 +207,34 @@ class FeaturesEnricher(TransformerMixin):
 
  def __init__(
  self,
- search_keys: Optional[dict[str, SearchKey]] = None,
- country_code: Optional[str] = None,
- model_task_type: Optional[Union[ModelTaskType, str]] = None,
- api_key: Optional[str] = None,
- endpoint: Optional[str] = None,
- search_id: Optional[str] = None,
- shared_datasets: Optional[list[str]] = None,
- runtime_parameters: Optional[RuntimeParameters] = None,
- date_format: Optional[str] = None,
+ search_keys: dict[str, SearchKey] | None = None,
+ country_code: str | None = None,
+ model_task_type: ModelTaskType | str | None = None,
+ api_key: str | None = None,
+ endpoint: str | None = None,
+ search_id: str | None = None,
+ shared_datasets: list[str] | None = None,
+ runtime_parameters: RuntimeParameters | None = None,
+ date_format: str | None = None,
  random_state: int = 42,
- cv: Optional[CVType] = None,
- loss: Optional[str] = None,
+ cv: CVType | None = None,
+ loss: str | None = None,
  autodetect_search_keys: bool = True,
- generate_features: Optional[list[str]] = None,
- columns_for_online_api: Optional[list[str]] = None,
- round_embeddings: Optional[int] = None,
+ generate_features: list[str] | None = None,
+ columns_for_online_api: list[str] | None = None,
+ round_embeddings: int | None = None,
  logs_enabled: bool = True,
  raise_validation_error: bool = True,
- exclude_columns: Optional[list[str]] = None,
- baseline_score_column: Optional[Any] = None,
- client_ip: Optional[str] = None,
- client_visitorid: Optional[str] = None,
- custom_bundle_config: Optional[str] = None,
+ exclude_columns: list[str] | None = None,
+ baseline_score_column: Any | None = None,
+ client_ip: str | None = None,
+ client_visitorid: str | None = None,
+ custom_bundle_config: str | None = None,
  add_date_if_missing: bool = True,
  disable_force_downsampling: bool = False,
- id_columns: Optional[list[str]] = None,
+ id_columns: list[str] | None = None,
  generate_search_key_features: bool = True,
- sample_config: Optional[SampleConfig] = None,
+ sample_config: SampleConfig | None = None,
  print_trace_id: bool = False,
  **kwargs,
  ):
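
For context, a construction call that touches several of the re-annotated parameters (a sketch; the argument values are illustrative, and passing `model_task_type` as the plain string `"BINARY"` assumes `ModelTaskType.parse` accepts enum member names, as the constructor body below suggests):

```python
from upgini import FeaturesEnricher, SearchKey

enricher = FeaturesEnricher(
    search_keys={"registration_date": SearchKey.DATE},  # dict[str, SearchKey] | None
    country_code="US",                                  # str | None
    model_task_type="BINARY",                           # ModelTaskType | str | None
)
```
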
@@ -259,16 +259,16 @@ class FeaturesEnricher(TransformerMixin):
  print(msg)
 
  self.passed_features: list[str] = []
- self.df_with_original_index: Optional[pd.DataFrame] = None
- self.fit_columns_renaming: Optional[dict[str, str]] = None
+ self.df_with_original_index: pd.DataFrame | None = None
+ self.fit_columns_renaming: dict[str, str] | None = None
  self.country_added = False
  self.fit_generated_features: list[str] = []
- self.fit_dropped_features: Set[str] = set()
+ self.fit_dropped_features: set[str] = set()
  self.fit_search_keys = search_keys
  self.warning_counter = WarningCounter()
- self.X: Optional[pd.DataFrame] = None
- self.y: Optional[pd.Series] = None
- self.eval_set: Optional[list[tuple]] = None
+ self.X: pd.DataFrame | None = None
+ self.y: pd.Series | None = None
+ self.eval_set: list[tuple] | None = None
  self.autodetected_search_keys: dict[str, SearchKey] = {}
  self.imbalanced = False
  self.fit_select_features = True
@@ -288,17 +288,17 @@ class FeaturesEnricher(TransformerMixin):
  self.model_task_type = ModelTaskType.parse(model_task_type)
  self.model_task_type = model_task_type
  self.endpoint = endpoint
- self._search_task: Optional[SearchTask] = None
+ self._search_task: SearchTask | None = None
  self.features_info: pd.DataFrame = self.EMPTY_FEATURES_INFO
  self._features_info_without_links: pd.DataFrame = self.EMPTY_FEATURES_INFO
  self._internal_features_info: pd.DataFrame = self.EMPTY_INTERNAL_FEATURES_INFO
  self.relevant_data_sources: pd.DataFrame = self.EMPTY_DATA_SOURCES
  self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
- self.metrics: Optional[pd.DataFrame] = None
+ self.metrics: pd.DataFrame | None = None
  self.feature_names_ = []
  self.external_source_feature_names = []
  self.feature_importances_ = []
- self.psi_values: Optional[dict[str, float]] = None
+ self.psi_values: dict[str, float] | None = None
  self.search_id = search_id
  self.disable_force_downsampling = disable_force_downsampling
  self.print_trace_id = print_trace_id
@@ -375,7 +375,7 @@ class FeaturesEnricher(TransformerMixin):
  self.autofe_features_display_handle = None
  self.report_button_handle = None
 
- def _get_sample_config(self, sample_config: Optional[SampleConfig] = None):
+ def _get_sample_config(self, sample_config: SampleConfig | None = None):
  sample_config = sample_config or SampleConfig(force_sample_size=Dataset.FORCE_SAMPLE_SIZE)
 
  maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
@@ -430,20 +430,20 @@ class FeaturesEnricher(TransformerMixin):
 
  def fit(
  self,
- X: Union[pd.DataFrame, pd.Series, np.ndarray],
- y: Union[pd.Series, np.ndarray, list],
- eval_set: Optional[Union[list[tuple], tuple]] = None,
+ X: pd.DataFrame | pd.Series | np.ndarray,
+ y: pd.Series | np.ndarray | list,
+ eval_set: list[tuple] | tuple | None = None,
  *args,
- exclude_features_sources: Optional[list[str]] = None,
- calculate_metrics: Optional[bool] = None,
- estimator: Optional[Any] = None,
- scoring: Union[Callable, str, None] = None,
- remove_outliers_calc_metrics: Optional[bool] = None,
- progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
- search_id_callback: Optional[Callable[[str], Any]] = None,
+ exclude_features_sources: list[str] | None = None,
+ calculate_metrics: bool | None = None,
+ estimator: Any | None = None,
+ scoring: Callable | str | None = None,
+ remove_outliers_calc_metrics: bool | None = None,
+ progress_callback: Callable[[SearchProgress], Any] | None = None,
+ search_id_callback: Callable[[str], Any] | None = None,
  select_features: bool = True,
- auto_fe_parameters: Optional[AutoFEParameters] = None,
- stability_threshold: float = 0.15,
+ auto_fe_parameters: AutoFEParameters | None = None,
+ stability_threshold: float = 0.2,
  stability_agg_func: str = "max",
  **kwargs,
  ):
@@ -479,7 +479,7 @@ class FeaturesEnricher(TransformerMixin):
  If True, return only selected features both from input and data sources.
  Otherwise, return all features from input and only selected features from data sources.
 
- stability_threshold: float, optional (default=0.15)
+ stability_threshold: float, optional (default=0.2)
  Stability threshold for selected features PSI calculation. If PSI is greater than this threshold,
  then the feature will be dropped.
 
@@ -579,27 +579,26 @@ class FeaturesEnricher(TransformerMixin):
 
  def fit_transform(
  self,
- X: Union[pd.DataFrame, pd.Series, np.ndarray],
- y: Union[pd.DataFrame, pd.Series, np.ndarray, list],
- eval_set: Optional[Union[list[tuple], tuple]] = None,
+ X: pd.DataFrame | pd.Series | np.ndarray,
+ y: pd.DataFrame | pd.Series | np.ndarray | list,
+ eval_set: list[tuple] | tuple | None = None,
  *args,
- exclude_features_sources: Optional[list[str]] = None,
+ exclude_features_sources: list[str] | None = None,
  keep_input: bool = True,
- calculate_metrics: Optional[bool] = None,
- scoring: Union[Callable, str, None] = None,
- estimator: Optional[Any] = None,
- remove_outliers_calc_metrics: Optional[bool] = None,
- progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
+ calculate_metrics: bool | None = None,
+ scoring: Callable | str | None = None,
+ estimator: Any | None = None,
+ remove_outliers_calc_metrics: bool | None = None,
+ progress_callback: Callable[[SearchProgress], Any] | None = None,
  select_features: bool = True,
- auto_fe_parameters: Optional[AutoFEParameters] = None,
- stability_threshold: float = 0.15,
+ auto_fe_parameters: AutoFEParameters | None = None,
+ stability_threshold: float = 0.2,
  stability_agg_func: str = "max",
  **kwargs,
  ) -> pd.DataFrame:
  """Fit to data, then transform it.
 
  Fits transformer to `X` and `y` and returns a transformed version of `X`.
- If keep_input is True, then all input columns are copied to the output dataframe.
 
  Parameters
  ----------
@@ -613,7 +612,9 @@ class FeaturesEnricher(TransformerMixin):
  list of pairs (X, y) for validation.
 
  keep_input: bool, optional (default=True)
- If True, copy original input columns to the output dataframe.
+ If True, then all search keys, ID columns, selected client features and enriched columns will be returned.
+ If False, then only enriched columns are returned.
 
  estimator: sklearn-compatible estimator, optional (default=None)
  Custom estimator for metrics calculation.
@@ -629,7 +630,7 @@ class FeaturesEnricher(TransformerMixin):
  If True, return only selected features both from input and data sources.
  Otherwise, return all features from input and only selected features from data sources.
 
- stability_threshold: float, optional (default=0.15)
+ stability_threshold: float, optional (default=0.2)
  Stability threshold for selected features PSI calculation. If PSI is greater than this threshold,
  then the feature will be dropped.
 
@@ -747,20 +748,19 @@ class FeaturesEnricher(TransformerMixin):
  self,
  X: pd.DataFrame,
  *args,
- y: Optional[pd.Series] = None,
- exclude_features_sources: Optional[list[str]] = None,
+ y: pd.Series | None = None,
+ exclude_features_sources: list[str] | None = None,
  keep_input: bool = True,
- trace_id: Optional[str] = None,
+ trace_id: str | None = None,
  metrics_calculation: bool = False,
  silent_mode=False,
- progress_bar: Optional[ProgressBar] = None,
- progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
+ progress_bar: ProgressBar | None = None,
+ progress_callback: Callable[[SearchProgress], Any] | None = None,
  **kwargs,
  ) -> pd.DataFrame:
  """Transform `X`.
 
  Returns a transformed version of `X`.
- If keep_input is True, then all input columns are copied to the output dataframe.
 
  Parameters
  ----------
@@ -768,7 +768,10 @@ class FeaturesEnricher(TransformerMixin):
  Input samples.
 
  keep_input: bool, optional (default=True)
- If True, copy original input columns to the output dataframe.
+ If True, then all search keys, ID columns, selected client features, enriched columns and input columns
+ that were not present on fit will be returned.
+ If False, then only enriched columns are returned.
 
  Returns
  -------
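
The reworked docstring describes the new column-selection behavior that the body changes below implement. A usage sketch of the two modes (assuming a fitted `enricher` and an input frame `X`):

```python
# keep_input=True (default): search keys, ID columns, selected client features,
# enriched columns, and any columns that were absent at fit time come back
enriched_full = enricher.transform(X, keep_input=True)

# keep_input=False: only the enriched columns come back
enriched_only = enricher.transform(X, keep_input=False)
```
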
@@ -809,6 +812,7 @@ class FeaturesEnricher(TransformerMixin):
  metrics_calculation=metrics_calculation,
  silent_mode=silent_mode,
  progress_bar=progress_bar,
+ keep_input=keep_input,
  )
  self.logger.info("Transform finished successfully")
  search_progress = SearchProgress(100.0, ProgressStage.FINISHED)
@@ -850,30 +854,26 @@ class FeaturesEnricher(TransformerMixin):
  raise e
  finally:
  self.logger.info(f"Transform elapsed time: {time.time() - start_time}")
-
- if result is not None:
- if keep_input:
- return result
- else:
- return result.drop(columns=X.columns, errors="ignore")
+
+ return result
 
  def calculate_metrics(
  self,
- X: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
- y: Union[pd.DataFrame, pd.Series, np.ndarray, list, None] = None,
- eval_set: Optional[Union[list[tuple], tuple]] = None,
+ X: pd.DataFrame | pd.Series | np.ndarray | None = None,
+ y: pd.DataFrame | pd.Series | np.ndarray | list | None = None,
+ eval_set: list[tuple] | tuple | None = None,
  *args,
- scoring: Union[Callable, str, None] = None,
- cv: Union[BaseCrossValidator, CVType, None] = None,
+ scoring: Callable | str | None = None,
+ cv: BaseCrossValidator | CVType | str | None = None,
  estimator=None,
- exclude_features_sources: Optional[list[str]] = None,
- remove_outliers_calc_metrics: Optional[bool] = None,
- trace_id: Optional[str] = None,
+ exclude_features_sources: list[str] | None = None,
+ remove_outliers_calc_metrics: bool | None = None,
+ trace_id: str | None = None,
  internal_call: bool = False,
- progress_bar: Optional[ProgressBar] = None,
- progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
+ progress_bar: ProgressBar | None = None,
+ progress_callback: Callable[[SearchProgress], Any] | None = None,
  **kwargs,
- ) -> Optional[pd.DataFrame]:
+ ) -> pd.DataFrame | None:
  """Calculate metrics
 
  Parameters
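
Note that `calculate_metrics` now formally returns `pd.DataFrame | None` and, when called without arguments, reuses the `X`, `y` and `eval_set` stored during fit (see `_is_input_same_as_fit` below). A brief usage sketch:

```python
metrics = enricher.calculate_metrics()  # reuses the data passed to fit
if metrics is not None:  # None is possible per the new return annotation
    print(metrics)
```
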
@@ -1311,16 +1311,16 @@ class FeaturesEnricher(TransformerMixin):
  def _select_features_by_psi(
  self,
  trace_id: str,
- X: Union[pd.DataFrame, pd.Series, np.ndarray],
- y: Union[pd.DataFrame, pd.Series, np.ndarray, list],
- eval_set: Optional[Union[list[tuple], tuple]],
+ X: pd.DataFrame | pd.Series | np.ndarray,
+ y: pd.DataFrame | pd.Series | np.ndarray | list,
+ eval_set: list[tuple] | tuple | None,
  stability_threshold: float,
  stability_agg_func: Callable,
- cv: Union[BaseCrossValidator, CVType, str, None] = None,
+ cv: BaseCrossValidator | CVType | str | None = None,
  estimator=None,
- exclude_features_sources: Optional[list[str]] = None,
+ exclude_features_sources: list[str] | None = None,
  progress_bar: bool = True,
- progress_callback: Optional[Callable] = None,
+ progress_callback: Callable | None = None,
  ):
  search_keys = self.search_keys.copy()
  validated_X, _, validated_eval_set = self._validate_train_eval(X, y, eval_set, silent=True)
@@ -1469,7 +1469,7 @@ class FeaturesEnricher(TransformerMixin):
 
  unstable_by_sparsity = [feature for feature, psi in psi_values_sparse.items() if psi > stability_threshold]
  if unstable_by_sparsity:
- self.logger.info(f"Unstable by sparsity features: {sorted(unstable_by_sparsity)}")
+ self.logger.info(f"Unstable by sparsity features ({stability_threshold}): {sorted(unstable_by_sparsity)}")
 
  psi_values = calculate_features_psi(
  checking_eval_set_df, cat_features, date_column, self.logger, model_task_type, stability_agg_func
@@ -1479,7 +1479,7 @@ class FeaturesEnricher(TransformerMixin):
 
  unstable_by_value = [feature for feature, psi in psi_values.items() if psi > stability_threshold]
  if unstable_by_value:
- self.logger.info(f"Unstable by value features: {sorted(unstable_by_value)}")
+ self.logger.info(f"Unstable by value features ({stability_threshold}): {sorted(unstable_by_value)}")
 
  self.psi_values = {
  feature: psi_value for feature, psi_value in psi_values.items() if psi_value <= stability_threshold
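
For intuition on how `stability_agg_func` interacts with `stability_threshold` in the filter above, here is a self-contained sketch (illustrative only, not the package's implementation):

```python
psi_per_interval = [0.05, 0.12, 0.30]  # one feature's PSI across eval-set time intervals
stability_threshold = 0.2              # the new default

for name, agg in {"max": max, "min": min, "mean": lambda v: sum(v) / len(v)}.items():
    psi = agg(psi_per_interval)
    print(f"{name}: psi={psi:.3f} -> {'dropped' if psi > stability_threshold else 'kept'}")
# max: 0.300 -> dropped; min: 0.050 -> kept; mean: 0.157 -> kept
```
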
@@ -1557,12 +1557,12 @@ class FeaturesEnricher(TransformerMixin):
  self.logger.warning(msg)
 
  def _has_features_with_commercial_schema(
- self, commercial_schema: str, exclude_features_sources: Optional[list[str]]
+ self, commercial_schema: str, exclude_features_sources: list[str] | None
  ) -> bool:
  return len(self._get_features_with_commercial_schema(commercial_schema, exclude_features_sources)) > 0
 
  def _get_features_with_commercial_schema(
- self, commercial_schema: str, exclude_features_sources: Optional[list[str]]
+ self, commercial_schema: str, exclude_features_sources: list[str] | None
  ) -> list[str]:
  if exclude_features_sources:
  filtered_features_info = self._internal_features_info[
@@ -1577,14 +1577,14 @@ class FeaturesEnricher(TransformerMixin):
  ].values
  )
 
- def _has_paid_features(self, exclude_features_sources: Optional[list[str]]) -> bool:
+ def _has_paid_features(self, exclude_features_sources: list[str] | None) -> bool:
  return self._has_features_with_commercial_schema(CommercialSchema.PAID.value, exclude_features_sources)
 
  def _is_input_same_as_fit(
  self,
- X: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
- y: Union[pd.DataFrame, pd.Series, np.ndarray, list, None] = None,
- eval_set: Optional[list[tuple]] = None,
+ X: pd.DataFrame | pd.Series | np.ndarray | None = None,
+ y: pd.DataFrame | pd.Series | np.ndarray | list | None = None,
+ eval_set: list[tuple] | None = None,
  ) -> tuple:
  if X is None:
  return True, self.X, self.y, self.eval_set
@@ -1615,9 +1615,9 @@ class FeaturesEnricher(TransformerMixin):
  def _get_cv_and_groups(
  self,
  X: pd.DataFrame,
- cv_override: Union[BaseCrossValidator, CVType, str, None],
+ cv_override: BaseCrossValidator | CVType | str | None,
  search_keys: dict[str, SearchKey],
- ) -> tuple[BaseCrossValidator, Optional[np.ndarray]]:
+ ) -> tuple[BaseCrossValidator, np.ndarray] | None:
  _cv = cv_override or self.cv
  group_columns = sorted(self._get_group_columns(X, search_keys))
  groups = None
@@ -1645,8 +1645,8 @@ class FeaturesEnricher(TransformerMixin):
  return _cv, groups
 
  def _get_and_validate_client_cat_features(
- self, estimator: Optional[Any], X: pd.DataFrame, search_keys: dict[str, SearchKey]
- ) -> tuple[Optional[list[str]], list[str]]:
+ self, estimator: Any | None, X: pd.DataFrame, search_keys: dict[str, SearchKey]
+ ) -> tuple[list[str] | None, list[str]]:
  cat_features = []
  search_keys_for_metrics = []
  if (
@@ -1678,16 +1678,16 @@ class FeaturesEnricher(TransformerMixin):
  def _get_cached_enriched_data(
  self,
  trace_id: str,
- X: Union[pd.DataFrame, pd.Series, np.ndarray, None] = None,
- y: Union[pd.DataFrame, pd.Series, np.ndarray, list, None] = None,
- eval_set: Optional[Union[list[tuple], tuple]] = None,
- exclude_features_sources: Optional[list[str]] = None,
- remove_outliers_calc_metrics: Optional[bool] = None,
- cv_override: Union[BaseCrossValidator, CVType, str, None] = None,
- search_keys_for_metrics: Optional[list[str]] = None,
- progress_bar: Optional[ProgressBar] = None,
- progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
- client_cat_features: Optional[list[str]] = None,
+ X: pd.DataFrame | pd.Series | np.ndarray | None = None,
+ y: pd.DataFrame | pd.Series | np.ndarray | list | None = None,
+ eval_set: list[tuple] | tuple | None = None,
+ exclude_features_sources: list[str] | None = None,
+ remove_outliers_calc_metrics: bool | None = None,
+ cv_override: BaseCrossValidator | CVType | str | None = None,
+ search_keys_for_metrics: list[str] | None = None,
+ progress_bar: ProgressBar | None = None,
+ progress_callback: Callable[[SearchProgress], Any] | None = None,
+ client_cat_features: list[str] | None = None,
  is_for_metrics: bool = False,
  ):
  is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
@@ -1893,15 +1893,15 @@ class FeaturesEnricher(TransformerMixin):
  def _get_enriched_datasets(
  self,
  trace_id: str,
- validated_X: Union[pd.DataFrame, pd.Series, np.ndarray, None],
- validated_y: Union[pd.DataFrame, pd.Series, np.ndarray, list, None],
- eval_set: Optional[list[tuple]],
- exclude_features_sources: Optional[list[str]],
+ validated_X: pd.DataFrame | pd.Series | np.ndarray | None,
+ validated_y: pd.DataFrame | pd.Series | np.ndarray | list | None,
+ eval_set: list[tuple] | None,
+ exclude_features_sources: list[str] | None,
  is_input_same_as_fit: bool,
  is_demo_dataset: bool,
- remove_outliers_calc_metrics: Optional[bool],
- progress_bar: Optional[ProgressBar],
- progress_callback: Optional[Callable[[SearchProgress], Any]],
+ remove_outliers_calc_metrics: bool | None,
+ progress_bar: ProgressBar | None,
+ progress_callback: Callable[[SearchProgress], Any] | None,
  is_for_metrics: bool = False,
  ) -> _EnrichedDataForMetrics:
  datasets_hash = hash_input(validated_X, validated_y, eval_set)
@@ -1939,7 +1939,7 @@ class FeaturesEnricher(TransformerMixin):
  )
 
  def __get_sampled_cached_enriched(
- self, datasets_hash: str, exclude_features_sources: Optional[list[str]]
+ self, datasets_hash: str, exclude_features_sources: list[str] | None
  ) -> _EnrichedDataForMetrics:
  X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys, columns_renaming, generated_features = (
  self.__cached_sampled_datasets[datasets_hash]
@@ -1959,7 +1959,7 @@ class FeaturesEnricher(TransformerMixin):
  )
 
  def __get_enriched_as_input(
- self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[list[tuple]], is_demo_dataset: bool
+ self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: list[tuple] | None, is_demo_dataset: bool
  ) -> _EnrichedDataForMetrics:
  eval_set_sampled_dict = {}
 
@@ -2055,9 +2055,9 @@ class FeaturesEnricher(TransformerMixin):
 
  def __get_enriched_from_fit(
  self,
- eval_set: Optional[list[tuple]],
+ eval_set: list[tuple] | None,
  trace_id: str,
- remove_outliers_calc_metrics: Optional[bool],
+ remove_outliers_calc_metrics: bool | None,
  ) -> _EnrichedDataForMetrics:
  eval_set_sampled_dict = {}
  search_keys = self.fit_search_keys.copy()
@@ -2163,11 +2163,11 @@ class FeaturesEnricher(TransformerMixin):
  self,
  validated_X: pd.DataFrame,
  validated_y: pd.Series,
- eval_set: Optional[list[tuple]],
- exclude_features_sources: Optional[list[str]],
+ eval_set: list[tuple] | None,
+ exclude_features_sources: list[str] | None,
  trace_id: str,
- progress_bar: Optional[ProgressBar],
- progress_callback: Optional[Callable[[SearchProgress], Any]],
+ progress_bar: ProgressBar | None,
+ progress_callback: Callable[[SearchProgress], Any] | None,
  is_for_metrics: bool = False,
  ) -> _EnrichedDataForMetrics:
  has_eval_set = eval_set is not None
@@ -2178,7 +2178,7 @@ class FeaturesEnricher(TransformerMixin):
  df = self.__combine_train_and_eval_sets(validated_X, validated_y, eval_set)
 
  # Exclude OOT eval sets from transform because they are not used for metrics calculation
- if not is_for_metrics and EVAL_SET_INDEX in df.columns:
+ if is_for_metrics and EVAL_SET_INDEX in df.columns:
  for eval_index in df[EVAL_SET_INDEX].unique():
  if eval_index == 0:
  continue
@@ -2231,7 +2231,7 @@ class FeaturesEnricher(TransformerMixin):
  )
 
  def __combine_train_and_eval_sets(
- self, X: pd.DataFrame, y: Optional[pd.Series] = None, eval_set: Optional[list[tuple]] = None
+ self, X: pd.DataFrame, y: pd.Series | None = None, eval_set: list[tuple] | None = None
  ) -> pd.DataFrame:
  df = X.copy()
  if y is not None:
@@ -2354,7 +2354,7 @@ class FeaturesEnricher(TransformerMixin):
  generated_features=generated_features,
  )
 
- def get_search_id(self) -> Optional[str]:
+ def get_search_id(self) -> str | None:
  """Returns search_id of the fitted enricher. Not available before a successful fit."""
  return self._search_task.search_task_id if self._search_task else None
 
@@ -2367,7 +2367,7 @@ class FeaturesEnricher(TransformerMixin):
 
  return self.features_info
 
- def get_progress(self, trace_id: Optional[str] = None, search_task: Optional[SearchTask] = None) -> SearchProgress:
+ def get_progress(self, trace_id: str | None = None, search_task: SearchTask | None = None) -> SearchProgress:
  search_task = search_task or self._search_task
  if search_task is not None:
  trace_id = trace_id or uuid.uuid4()
@@ -2475,13 +2475,14 @@ if response.status_code == 200:
  trace_id: str,
  X: pd.DataFrame,
  *,
- y: Optional[pd.Series] = None,
- exclude_features_sources: Optional[list[str]] = None,
+ y: pd.Series | None = None,
+ exclude_features_sources: list[str] | None = None,
  metrics_calculation: bool = False,
  silent_mode: bool = False,
- progress_bar: Optional[ProgressBar] = None,
- progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
+ progress_bar: ProgressBar | None = None,
+ progress_callback: Callable[[SearchProgress], Any] | None = None,
  add_fit_system_record_id: bool = False,
+ keep_input: bool = True,
  ) -> tuple[pd.DataFrame, dict[str, str], list[str], dict[str, SearchKey]]:
  if self._search_task is None:
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
@@ -2775,7 +2776,7 @@ if response.status_code == 200:
  progress_bar.progress = progress.to_progress_bar()
  if progress_callback is not None:
  progress_callback(progress)
- prev_progress: Optional[SearchProgress] = None
+ prev_progress: SearchProgress | None = None
  polling_period_seconds = 1
  try:
  while progress.stage != ProgressStage.DOWNLOADING.value:
@@ -2830,21 +2831,30 @@ if response.status_code == 200:
  how="left",
  )
 
+ fit_input_columns = [c.originalName for c in self._search_task.get_file_metadata(trace_id).columns]
+ new_columns_on_transform = [c for c in validated_Xy.columns if c not in fit_input_columns]
+
  selected_generated_features = [
  c for c in generated_features if not self.fit_select_features or c in self.feature_names_
  ]
- selected_input_columns = [
- c
- for c in validated_Xy.columns
- if not self.fit_select_features
- or c in self.feature_names_
- or c in self.search_keys
- or c in (self.id_columns or [])
- or c in [EVAL_SET_INDEX, TARGET] # transform for metrics calculation
- ]
+ if keep_input is True:
+ selected_input_columns = [
+ c
+ for c in validated_Xy.columns
+ if not self.fit_select_features
+ or c in self.feature_names_
+ or c in new_columns_on_transform
+ or c in self.search_keys
+ or c in (self.id_columns or [])
+ or c in [EVAL_SET_INDEX, TARGET] # transform for metrics calculation
+ ]
+ else:
+ selected_input_columns = []
+
  selecting_columns = selected_input_columns + selected_generated_features
  selecting_columns.extend(
- c for c in result.columns if c in self.feature_names_ and c not in selecting_columns
+ c for c in result.columns
+ if c in self.feature_names_ and c not in selecting_columns and c not in validated_Xy.columns
  )
  if add_fit_system_record_id:
  selecting_columns.append(SORT_ID)
@@ -2871,7 +2881,7 @@ if response.status_code == 200:
 
  return result, columns_renaming, generated_features, search_keys
 
- def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id: Optional[str] = None):
+ def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id: str | None = None):
  if (search_keys is None or len(search_keys) == 0) and self.country_code is None:
  if search_id:
  self.logger.debug(f"search_id {search_id} provided without search_keys")
@@ -2944,22 +2954,22 @@ if response.status_code == 200:
  def __inner_fit(
  self,
  trace_id: str,
- X: Union[pd.DataFrame, pd.Series, np.ndarray],
- y: Union[pd.DataFrame, pd.Series, np.ndarray, list, None],
- eval_set: Optional[list[tuple]],
- progress_bar: Optional[ProgressBar],
+ X: pd.DataFrame | pd.Series | np.ndarray,
+ y: pd.DataFrame | pd.Series | np.ndarray | list | None,
+ eval_set: list[tuple] | None,
+ progress_bar: ProgressBar | None,
  start_time: int,
  *,
- exclude_features_sources: Optional[list[str]] = None,
- calculate_metrics: Optional[bool],
- scoring: Union[Callable, str, None],
- estimator: Optional[Any],
+ exclude_features_sources: list[str] | None = None,
+ calculate_metrics: bool | None,
+ scoring: Callable | str | None,
+ estimator: Any | None,
  stability_threshold: float,
  stability_agg_func: str,
- remove_outliers_calc_metrics: Optional[bool],
+ remove_outliers_calc_metrics: bool | None,
  auto_fe_parameters: AutoFEParameters,
- progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
- search_id_callback: Optional[Callable[[str], Any]] = None,
+ progress_callback: Callable[[SearchProgress], Any] | None = None,
+ search_id_callback: Callable[[str], Any] | None = None,
  ):
  self._search_task = None
  self.warning_counter.reset()
@@ -3378,16 +3388,6 @@ if response.status_code == 200:
 
  self.__show_selected_features()
 
- autofe_description = self.get_autofe_features_description()
- if autofe_description is not None and len(autofe_description) > 0:
- self.logger.info(f"AutoFE descriptions: {autofe_description}")
- self.autofe_features_display_handle = display_html_dataframe(
- df=autofe_description,
- internal_df=autofe_description,
- header=self.bundle.get("autofe_descriptions_header"),
- display_id=f"autofe_descriptions_{uuid.uuid4()}",
- )
-
  if self._has_paid_features(exclude_features_sources):
  if calculate_metrics is not None and calculate_metrics:
  msg = self.bundle.get("metrics_with_paid_features")
@@ -3466,7 +3466,7 @@ if response.status_code == 200:
  def __should_add_date_column(self):
  return self.add_date_if_missing or (self.cv is not None and self.cv.is_time_series())
 
- def __get_renamed_id_columns(self, renaming: Optional[dict[str, str]] = None):
+ def __get_renamed_id_columns(self, renaming: dict[str, str] | None = None):
  renaming = renaming or self.fit_columns_renaming
  reverse_renaming = {v: k for k, v in renaming.items()}
  return None if self.id_columns is None else [reverse_renaming.get(c) or c for c in self.id_columns]
@@ -3511,11 +3511,11 @@ if response.status_code == 200:
  def _validate_train_eval(
  self,
  X: pd.DataFrame,
- y: Optional[pd.Series] = None,
- eval_set: Optional[list[tuple[pd.DataFrame, pd.Series]]] = None,
+ y: pd.Series | None = None,
+ eval_set: list[tuple[pd.DataFrame, pd.Series]] | None = None,
  is_transform: bool = False,
  silent: bool = False,
- ) -> tuple[pd.DataFrame, pd.Series, Optional[list[tuple[pd.DataFrame, pd.Series]]]]:
+ ) -> tuple[pd.DataFrame, pd.Series, list[tuple[pd.DataFrame, pd.Series]]] | None:
  validated_X = self._validate_X(X, is_transform)
  validated_y = self._validate_y(validated_X, y, enforce_y=not is_transform)
  validated_eval_set = self._validate_eval_set(validated_X, eval_set, silent=silent)
@@ -3594,7 +3594,7 @@ if response.status_code == 200:
 
  return validated_X
 
- def _validate_y(self, X: pd.DataFrame, y, enforce_y: bool = True) -> Optional[pd.Series]:
+ def _validate_y(self, X: pd.DataFrame, y, enforce_y: bool = True) -> pd.Series | None:
  if y is None and not enforce_y:
  return None
  if (
@@ -3644,7 +3644,7 @@ if response.status_code == 200:
  return validated_y
 
  def _validate_eval_set(
- self, X: pd.DataFrame, eval_set: Optional[list[tuple[pd.DataFrame, pd.Series]]], silent: bool = False
+ self, X: pd.DataFrame, eval_set: list[tuple[pd.DataFrame, pd.Series]] | None, silent: bool = False
  ):
  if eval_set is None:
  return None
@@ -3756,7 +3756,7 @@ if response.status_code == 200:
 
  return validated_eval_X, validated_eval_y
 
- def _validate_baseline_score(self, X: pd.DataFrame, eval_set: Optional[list[tuple]]):
+ def _validate_baseline_score(self, X: pd.DataFrame, eval_set: list[tuple] | None):
  if self.baseline_score_column is not None:
  if self.baseline_score_column not in X.columns:
  raise ValidationError(
@@ -3783,7 +3783,7 @@ if response.status_code == 200:
 
  @staticmethod
  def _sort_by_system_record_id(
- X: pd.DataFrame, y: pd.Series, cv: Optional[CVType]
+ X: pd.DataFrame, y: pd.Series, cv: CVType | None
  ) -> tuple[pd.DataFrame, pd.Series]:
  if cv not in [CVType.time_series, CVType.blocked_time_series]:
  record_id_column = ENTITY_SYSTEM_RECORD_ID if ENTITY_SYSTEM_RECORD_ID in X else SYSTEM_RECORD_ID
@@ -3801,7 +3801,7 @@ if response.status_code == 200:
  # Deprecated
  @staticmethod
  def _sort_by_keys(
- X: pd.DataFrame, y: pd.Series, search_keys: dict[str, SearchKey], cv: Optional[CVType]
+ X: pd.DataFrame, y: pd.Series, search_keys: dict[str, SearchKey], cv: CVType | None
  ) -> tuple[pd.DataFrame, pd.Series]:
  if cv not in [CVType.time_series, CVType.blocked_time_series]:
  if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
@@ -3841,14 +3841,14 @@ if response.status_code == 200:
  def __log_debug_information(
  self,
  X: pd.DataFrame,
- y: Union[pd.Series, np.ndarray, list, None] = None,
- eval_set: Optional[list[tuple]] = None,
- exclude_features_sources: Optional[list[str]] = None,
- calculate_metrics: Optional[bool] = None,
- cv: Optional[Any] = None,
- scoring: Optional[Any] = None,
- estimator: Optional[Any] = None,
- remove_outliers_calc_metrics: Optional[bool] = None,
+ y: pd.Series | np.ndarray | list | None = None,
+ eval_set: list[tuple] | None = None,
+ exclude_features_sources: list[str] | None = None,
+ calculate_metrics: bool | None = None,
+ cv: Any | None = None,
+ scoring: Any | None = None,
+ estimator: Any | None = None,
+ remove_outliers_calc_metrics: bool | None = None,
  ):
  try:
  resolved_api_key = self.api_key or os.environ.get(UPGINI_API_KEY)
@@ -3973,7 +3973,7 @@ if response.status_code == 200:
  ]
 
  @staticmethod
- def _get_email_column(search_keys: dict[str, SearchKey]) -> Optional[str]:
+ def _get_email_column(search_keys: dict[str, SearchKey]) -> str | None:
  cols = [col for col, t in search_keys.items() if t == SearchKey.EMAIL]
  if len(cols) > 1:
  raise Exception("More than one email column found after unnest")
@@ -3981,7 +3981,7 @@ if response.status_code == 200:
  return cols[0]
 
  @staticmethod
- def _get_hem_column(search_keys: dict[str, SearchKey]) -> Optional[str]:
+ def _get_hem_column(search_keys: dict[str, SearchKey]) -> str | None:
  cols = [col for col, t in search_keys.items() if t == SearchKey.HEM]
  if len(cols) > 1:
  raise Exception("More than one hem column found after unnest")
@@ -3989,7 +3989,7 @@ if response.status_code == 200:
  return cols[0]
 
  @staticmethod
- def _get_ip_column(search_keys: dict[str, SearchKey]) -> Optional[str]:
+ def _get_ip_column(search_keys: dict[str, SearchKey]) -> str | None:
  cols = [col for col, t in search_keys.items() if t == SearchKey.IP]
  if len(cols) > 1:
  raise Exception("More than one ip column found after unnest")
@@ -3997,25 +3997,25 @@ if response.status_code == 200:
  return cols[0]
 
  @staticmethod
- def _get_phone_column(search_keys: dict[str, SearchKey]) -> Optional[str]:
+ def _get_phone_column(search_keys: dict[str, SearchKey]) -> str | None:
  for col, t in search_keys.items():
  if t == SearchKey.PHONE:
  return col
 
  @staticmethod
- def _get_country_column(search_keys: dict[str, SearchKey]) -> Optional[str]:
+ def _get_country_column(search_keys: dict[str, SearchKey]) -> str | None:
  for col, t in search_keys.items():
  if t == SearchKey.COUNTRY:
  return col
 
  @staticmethod
- def _get_postal_column(search_keys: dict[str, SearchKey]) -> Optional[str]:
+ def _get_postal_column(search_keys: dict[str, SearchKey]) -> str | None:
  for col, t in search_keys.items():
  if t == SearchKey.POSTAL_CODE:
  return col
 
  @staticmethod
- def _get_date_column(search_keys: dict[str, SearchKey]) -> Optional[str]:
+ def _get_date_column(search_keys: dict[str, SearchKey]) -> str | None:
  return SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
 
  def _explode_multiple_search_keys(
@@ -4062,10 +4062,10 @@ if response.status_code == 200:
  id_name: str,
  target_name: str,
  columns_renaming: dict[str, str],
- id_columns: Optional[list[str]],
- cv: Optional[CVType],
+ id_columns: list[str] | None,
+ cv: CVType | None,
  model_task_type: ModelTaskType,
- logger: Optional[logging.Logger] = None,
+ logger: logging.Logger | None = None,
  bundle: ResourceBundle = bundle,
  ) -> pd.DataFrame:
  original_index_name = df.index.name
@@ -4201,7 +4201,7 @@ if response.status_code == 200:
  def __enrich(
  self,
  input_df: pd.DataFrame,
- result_features: Optional[pd.DataFrame],
+ result_features: pd.DataFrame | None,
  how: str = "inner",
  drop_system_record_id=True,
  ) -> pd.DataFrame:
@@ -4320,7 +4320,7 @@ if response.status_code == 200:
  self,
  trace_id: str,
  clients_features_df: pd.DataFrame,
- updated_shaps: Optional[dict[str, float]] = None,
+ updated_shaps: dict[str, float] | None = None,
  update_selected_features: bool = True,
  silent=False,
  ):
@@ -4659,12 +4659,12 @@ if response.status_code == 200:
 
  def __show_metrics(
  self,
- scoring: Union[Callable, str, None],
- estimator: Optional[Any],
- remove_outliers_calc_metrics: Optional[bool],
+ scoring: Callable | str | None,
+ estimator: Any | None,
+ remove_outliers_calc_metrics: bool | None,
  trace_id: str,
- progress_bar: Optional[ProgressBar] = None,
- progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
+ progress_bar: ProgressBar | None = None,
+ progress_callback: Callable[[SearchProgress], Any] | None = None,
  ):
  self.metrics = self.calculate_metrics(
  scoring=scoring,
@@ -4698,13 +4698,23 @@ if response.status_code == 200:
  self.bundle.get("relevant_data_sources_header"),
  display_id=f"data_sources_{uuid.uuid4()}",
  )
+
+ autofe_description = self.get_autofe_features_description()
+ if autofe_description is not None and len(autofe_description) > 0:
+ self.logger.info(f"AutoFE descriptions: {autofe_description}")
+ self.autofe_features_display_handle = display_html_dataframe(
+ df=autofe_description,
+ internal_df=autofe_description,
+ header=self.bundle.get("autofe_descriptions_header"),
+ display_id=f"autofe_descriptions_{uuid.uuid4()}",
+ )
  else:
  msg = self.bundle.get("features_info_zero_important_features")
  self.__log_warning(msg, show_support_link=True)
  except (ImportError, NameError):
  print(self._internal_features_info)
 
- def __show_report_button(self, display_id: Optional[str] = None, display_handle=None):
+ def __show_report_button(self, display_id: str | None = None, display_handle=None):
  try:
  return prepare_and_show_report(
  relevant_features_df=self._features_info_without_links,
@@ -4844,7 +4854,7 @@ if response.status_code == 200:
  except Exception:
  self.logger.exception("Failed to dump python libs")
 
- def __display_support_link(self, link_text: Optional[str] = None):
+ def __display_support_link(self, link_text: str | None = None):
  support_link = self.bundle.get("support_link")
  link_text = link_text or self.bundle.get("support_text")
  try:
@@ -4871,9 +4881,9 @@ if response.status_code == 200:
  def dump_input(
  self,
  trace_id: str,
- X: Union[pd.DataFrame, pd.Series],
- y: Union[pd.DataFrame, pd.Series, None] = None,
- eval_set: Union[tuple, None] = None,
+ X: pd.DataFrame | pd.Series,
+ y: pd.DataFrame | pd.Series | None = None,
+ eval_set: tuple | None = None,
  ):
  def dump_task(X_, y_, eval_set_):
  with MDC(trace_id=trace_id):
@@ -4964,7 +4974,7 @@ def is_frames_equal(first, second, bundle: ResourceBundle) -> bool:
  raise ValidationError(bundle.get("x_and_eval_x_diff_types").format(type(first), type(second)))
 
 
- def drop_duplicates(df: Union[pd.DataFrame, np.ndarray, Any]) -> pd.DataFrame:
+ def drop_duplicates(df: pd.DataFrame | np.ndarray | Any) -> pd.DataFrame:
  if isinstance(df, pd.DataFrame):
  return df.drop_duplicates()
  elif isinstance(df, np.ndarray):
upgini/http.py CHANGED
@@ -413,43 +413,9 @@ class _RestClient:
  with open(path, "rb") as file:
  files = {"file": (file_name, file, "application/octet-stream")}
  self._with_unauth_retry(
- lambda: self._send_post_file_req_v2(
- api_path, files, trace_id=trace_id, need_json_response=False
- )
+ lambda: self._send_post_file_req_v2(api_path, files, trace_id=trace_id, need_json_response=False)
  )
 
- def dump_input_files(
- self,
- trace_id: str,
- x_path: str,
- y_path: Optional[str] = None,
- eval_x_path: Optional[str] = None,
- eval_y_path: Optional[str] = None,
- ):
- api_path = self.SEARCH_DUMP_INPUT_FILE_FMT
-
- def upload_with_check(path: str, file_name: str):
- digest_sha256 = file_hash(path)
- if self.is_file_uploaded(trace_id, digest_sha256):
- # print(f"File {path} was already uploaded with digest {digest_sha256}, skipping")
- return
- else:
- with open(path, "rb") as file:
- files = {"file": (file_name, file, "application/octet-stream")}
- self._with_unauth_retry(
- lambda: self._send_post_file_req_v2(
- api_path, files, trace_id=trace_id, need_json_response=False
- )
- )
-
- upload_with_check(x_path, "x.parquet")
- if y_path:
- upload_with_check(y_path, "y.parquet")
- if eval_x_path:
- upload_with_check(eval_x_path, "eval_x.parquet")
- if eval_y_path:
- upload_with_check(eval_y_path, "eval_y.parquet")
-
  def initial_search_v2(
  self,
  trace_id: str,
@@ -1080,6 +1046,7 @@ class LoggerFactory:
 
  upgini_logger = logging.getLogger(f"upgini.{hash(key)}")
  upgini_logger.handlers.clear()
+ upgini_logger.propagate = False  # Prevent duplicate logging in Jupyter notebooks
  rest_client = get_rest_client(backend_url, api_token, client_ip, client_visitorid)
  datadog_handler = BackendLogHandler(rest_client, client_ip, client_visitorid)
  json_formatter = jsonlogger.JsonFormatter(
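
Setting `propagate = False` detaches the upgini logger from the root logger, so each record is handled once by the library's own handlers instead of also bubbling up to the root handler that Jupyter environments typically install. A generic illustration of the mechanism (standard `logging`, not upgini's setup):

```python
import logging

logging.basicConfig(level=logging.INFO)  # root handler, as in many notebooks

lib_logger = logging.getLogger("mylib")
lib_logger.setLevel(logging.INFO)
lib_logger.addHandler(logging.StreamHandler())

lib_logger.info("hello")        # emitted twice: own handler + propagated to root
lib_logger.propagate = False
lib_logger.info("hello again")  # emitted once: own handler only
```
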
upgini/metrics.py CHANGED
@@ -815,9 +815,9 @@ class CatBoostWrapper(EstimatorWrapper):
  encoded = cat_encoder.transform(x[self.cat_features]).astype(int)
  else:
  encoded = cat_encoder.transform(x[self.cat_features])
- cat_features = encoded.columns.to_list()
- x.drop(columns=encoded.columns, inplace=True, errors="ignore")
- x[encoded.columns] = encoded
+ cat_features = self.cat_features
+ x = x.drop(columns=self.cat_features, errors="ignore")
+ x[self.cat_features] = encoded
 
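
Besides keeping the original `cat_features` names, the fix replaces `inplace=True` with reassignment, so the caller's dataframe is no longer mutated as a side effect. A generic before/after illustration (not upgini code):

```python
import pandas as pd

def drops_inplace(df: pd.DataFrame) -> pd.DataFrame:
    df.drop(columns=["a"], inplace=True)  # mutates the caller's frame
    return df

def drops_pure(df: pd.DataFrame) -> pd.DataFrame:
    df = df.drop(columns=["a"])  # rebinds locally; caller's frame untouched
    return df

frame = pd.DataFrame({"a": [1], "b": [2]})
drops_pure(frame)
assert "a" in frame.columns      # still intact
drops_inplace(frame)
assert "a" not in frame.columns  # side effect visible to the caller
```
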
{upgini-1.2.114a5.dist-info → upgini-1.2.116.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: upgini
- Version: 1.2.114a5
+ Version: 1.2.116
  Summary: Intelligent data search & enrichment for Machine Learning
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
  Project-URL: Homepage, https://upgini.com/
@@ -749,6 +749,36 @@ enricher.fit(
  )
  ```
 
+ ### Control feature stability with PSI parameters
+
+ `FeaturesEnricher` supports Population Stability Index (PSI) calculation on eval_set to evaluate feature stability over time. You can control this behavior with the stability parameters of the `fit` and `fit_transform` methods:
+
+ ```python
+ enricher = FeaturesEnricher(
+     search_keys={"registration_date": SearchKey.DATE}
+ )
+
+ # Control feature stability during fit
+ enricher.fit(
+     X, y,
+     stability_threshold=0.2,  # PSI threshold: features with PSI above this value will be dropped
+     stability_agg_func="max"  # Aggregation function for stability values: "max", "min", "mean"
+ )
+
+ # Same parameters work for fit_transform
+ enriched_df = enricher.fit_transform(
+     X, y,
+     stability_threshold=0.1,  # Stricter threshold for more stable features
+     stability_agg_func="mean"  # Use mean aggregation instead of max
+ )
+ ```
+
+ **Stability parameters:**
+ - `stability_threshold` (float, default=0.2): PSI threshold value. Features with PSI above this threshold will be excluded from the final feature set. Lower values mean stricter stability requirements.
+ - `stability_agg_func` (str, default="max"): Function to aggregate PSI values across time intervals. Options: "max" (most conservative), "min" (least conservative), "mean" (balanced approach).
+
+ **PSI (Population Stability Index)** measures how much a feature's distribution changes over time. Lower PSI values indicate more stable features, which are generally more reliable for production ML models.
+
 
  ### Use custom loss function in feature selection & metrics calculation
  `FeaturesEnricher` can be initialized with additional string parameter `loss`.
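
For readers who want to see what PSI actually computes, a minimal self-contained sketch of the standard binned formula follows (illustrative only; upgini's internal `calculate_features_psi` may bin and aggregate differently):

```python
import numpy as np

def psi(expected: np.ndarray, actual: np.ndarray, bins: int = 10) -> float:
    """Population Stability Index between a baseline sample and a later sample."""
    edges = np.quantile(expected, np.linspace(0, 1, bins + 1))  # bins from the baseline
    e_frac = np.histogram(expected, bins=edges)[0] / len(expected)
    a_frac = np.histogram(actual, bins=edges)[0] / len(actual)
    e_frac = np.clip(e_frac, 1e-6, None)  # avoid log(0)
    a_frac = np.clip(a_frac, 1e-6, None)
    return float(np.sum((a_frac - e_frac) * np.log(a_frac / e_frac)))

rng = np.random.default_rng(0)
baseline = rng.normal(size=5_000)
print(psi(baseline, rng.normal(size=5_000)))          # ~0: stable, kept at threshold 0.2
print(psi(baseline, rng.normal(0.5, 1, size=5_000)))  # typically >0.2: shifted, dropped
```
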
@@ -874,7 +904,7 @@ Some convenient ways to start contributing are:
  ⚙️ **Gitpod** [![Gitpod Ready-to-Code](https://img.shields.io/badge/Gitpod-Ready--to--Code-blue?logo=gitpod)](https://gitpod.io/#https://github.com/upgini/upgini) You can use Gitpod to launch a fully functional development environment right in your browser.
 
  ## 🔗 Useful links
- - [Simple sales predictions as a template notebook](#-simple-sales-predictions-use-as-a-template)
+ - [Simple sales predictions as a template notebook](#-simple-sales-prediction-for-retail-stores)
  - [Full list of Kaggle Guides & Examples](https://www.kaggle.com/romaupgini/code)
  - [Project on PyPI](https://pypi.org/project/upgini)
  - [More perks for registered users](https://profile.upgini.com)
{upgini-1.2.114a5.dist-info → upgini-1.2.116.dist-info}/RECORD RENAMED
@@ -1,12 +1,12 @@
- upgini/__about__.py,sha256=iwE4cHR_k5DhwOJ-LokS39KSTP25EzJzsOu-OvIvCHA,26
+ upgini/__about__.py,sha256=uljlkRI1AQACoivbMb5ybeY28dehm84UBPMxgB7YFtM,24
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
  upgini/dataset.py,sha256=pQ8JQe0cdygD-W9GefJmfE6bnj4EYzXsjlgWdIS9nS8,31578
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
- upgini/features_enricher.py,sha256=YHfuasHkeUZqexG46fOFviPv7SYuavT6Bk0HGsHAnqs,228847
- upgini/http.py,sha256=OuvlySDc8CRgDvB8o1lqRbmsp2Gi1WWRRTYElu1D5nc,45531
+ upgini/features_enricher.py,sha256=iYay-Ye5WGntieg3X7uyg9W3x_1FUELrmhJnJIvQMeI,228897
+ upgini/http.py,sha256=-J_wOpnwVnT0ebPC6sOs6fN3AWtCD0LJLu6nlYmxaqk,44348
  upgini/metadata.py,sha256=VzgtgEbPPtNxTrj9LM5qSDP3DujHwAXqbUSKBjPcb9c,12477
- upgini/metrics.py,sha256=gjJDtlV6JrhUJumbNipdzjY4ojEupHGPihb9_VxjtWc,45939
+ upgini/metrics.py,sha256=_kBg6gSXx82LRlRadg-Qggm-GtcPBLbtK3nGCKQjELo,45925
  upgini/search_task.py,sha256=SAiUd1AytbA2Q6PSnnztr7oTRKpud1wQZ5YtKjsmQHU,18256
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
  upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -74,7 +74,7 @@ upgini/utils/target_utils.py,sha256=GCPn4QeJ83JJ_vyBJ3IhY5fyIRkLC9q9BE59S2FRO1I,
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
- upgini-1.2.114a5.dist-info/METADATA,sha256=TLzfyEHQdjQYUgRQhHilQdMtpIIOreYLC8Tb0SqQIwg,49197
- upgini-1.2.114a5.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
- upgini-1.2.114a5.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
- upgini-1.2.114a5.dist-info/RECORD,,
+ upgini-1.2.116.dist-info/METADATA,sha256=w7UkEj5YDpGksjUI1ii61tcOjPenO4XNEDrvPnaJVj4,50692
+ upgini-1.2.116.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+ upgini-1.2.116.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+ upgini-1.2.116.dist-info/RECORD,,