upgini 1.2.113a3974.dev2__py3-none-any.whl → 1.2.114__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/utils/psi.py ADDED
@@ -0,0 +1,300 @@
+ import itertools
+ import logging
+ import operator
+ from functools import reduce
+ from typing import Callable, Dict, Optional
+
+ import more_itertools
+ import numpy as np
+ import pandas as pd
+ from pandas.api.types import is_numeric_dtype
+ from pydantic import BaseModel
+
+ from upgini.metadata import TARGET, ModelTaskType
+
+
+ class StabilityParams(BaseModel):
+     threshold: float = 999
+     n_intervals: int = 12
+     min_intervals: int = 10
+     max_intervals: Optional[int] = None
+     min_values_in_interval: Optional[int] = None
+     n_bins: int = 10
+     min_values_in_bin: Optional[int] = None
+     cat_top_pct: float = 0.7
+     agg: str = "max"
+
+
+ DEFAULT_TARGET_PARAMS = StabilityParams(
+     n_intervals=12,
+     min_intervals=10,
+     max_intervals=None,
+     min_values_in_interval=None,
+     n_bins=5,
+ )
+
+ DEFAULT_FEATURES_PARAMS = StabilityParams(
+     n_intervals=12,
+     min_intervals=10,
+     max_intervals=None,
+     min_values_in_interval=None,
+     n_bins=10,
+ )
+
+
+ def calculate_sparsity_psi(
+     df: pd.DataFrame,
+     cat_features: list[str],
+     date_column: str,
+     logger: logging.Logger,
+     model_task_type: ModelTaskType,
+     stability_agg_func: str | None = None,
+     psi_features_params: StabilityParams = DEFAULT_FEATURES_PARAMS,
+     psi_target_params: StabilityParams = DEFAULT_TARGET_PARAMS,
+ ) -> Dict[str, float]:
+     sparse_features = df.columns[df.isna().sum() > 0].to_list()
+     if len(sparse_features) > 0:
+         logger.info(f"Calculating sparsity stability for {len(sparse_features)} sparse features")
+         sparse_df = df[sparse_features].notna()
+         sparse_df[date_column] = df[date_column]
+         return calculate_features_psi(
+             sparse_df,
+             cat_features,
+             date_column,
+             logger,
+             model_task_type,
+             stability_agg_func,
+             psi_target_params,
+             psi_features_params,
+         )
+     return {}
+
+
+ def calculate_features_psi(
+     df: pd.DataFrame,
+     cat_features: list[str],
+     date_column: str,
+     logger: logging.Logger,
+     model_task_type: ModelTaskType,
+     stability_agg_func: str | None = None,
+     psi_features_params: StabilityParams = DEFAULT_FEATURES_PARAMS,
+     psi_target_params: StabilityParams = DEFAULT_TARGET_PARAMS,
+ ) -> dict[str, float]:
+     empty_res = {col: 0.0 for col in df.columns if col not in [TARGET, date_column]}
+
+     if not is_numeric_dtype(df[date_column]):
+         df[date_column] = pd.to_datetime(df[date_column]).dt.floor("D").astype(np.int64) / 10**6
+
+     # Filter out rows with missing dates
+     df = df[df[date_column].notna()].copy()
+
+     n_months = pd.to_datetime(df[date_column], unit="ms").dt.month.nunique()
+
+     if TARGET in df.columns:
+         psi_target_params.n_intervals = min(
+             psi_target_params.max_intervals or np.inf, max(psi_target_params.min_intervals, n_months)
+         )
+         logger.info(f"Setting {psi_target_params.n_intervals} intervals for target PSI check")
+
+         logger.info(f"Calculating target PSI for {psi_target_params.n_intervals} intervals")
+         reference_mask, current_masks = _split_intervals(df, date_column, psi_target_params.n_intervals, logger)
+
+         if psi_target_params.min_values_in_interval is not None and any(
+             len(mask) < psi_target_params.min_values_in_interval
+             for mask in itertools.chain(current_masks, [reference_mask])
+         ):
+             logger.info(
+                 f"Some intervals have less than {psi_target_params.min_values_in_interval} values. Skip PSI check"
+             )
+             return empty_res
+
+         target_agg_func = _get_agg_func(stability_agg_func or psi_target_params.agg)
+         logger.info(f"Calculating target PSI with agg function {target_agg_func}")
+         target_psi = _stability_agg(
+             [df[TARGET][cur] for cur in current_masks],
+             reference_data=df[TARGET][reference_mask],
+             is_numerical=model_task_type == ModelTaskType.REGRESSION,
+             min_values_in_bin=psi_target_params.min_values_in_bin,
+             n_bins=psi_target_params.n_bins,
+             cat_top_pct=psi_target_params.cat_top_pct,
+             agg_func=target_agg_func,
+         )
+         if target_psi is None or np.isnan(target_psi):
+             logger.info("Cannot determine target PSI. Skip feature PSI check")
+             return empty_res
+
+         if target_psi > psi_target_params.threshold:
+             logger.info(
+                 f"Target PSI {target_psi} is more than threshold {psi_target_params.threshold}. Skip feature PSI check"
+             )
+             return empty_res
+
+     psi_features_params.n_intervals = min(
+         psi_features_params.max_intervals or np.inf, max(psi_features_params.min_intervals, n_months)
+     )
+     logger.info(f"Setting {psi_features_params.n_intervals} intervals for features PSI check")
+
+     logger.info(f"Calculating PSI for {len(df.columns)} features")
+     reference_mask, current_masks = _split_intervals(df, date_column, psi_features_params.n_intervals, logger)
+     features_agg_func = _get_agg_func(stability_agg_func or psi_features_params.agg)
+     logger.info(f"Calculating features PSI with agg function {features_agg_func}")
+     psi_values = [
+         _stability_agg(
+             [df[feature][cur] for cur in current_masks],
+             reference_data=df[feature][reference_mask],
+             is_numerical=feature not in cat_features,
+             min_values_in_bin=psi_features_params.min_values_in_bin,
+             n_bins=psi_features_params.n_bins,
+             cat_top_pct=psi_features_params.cat_top_pct,
+             agg_func=features_agg_func,
+         )
+         for feature in df.columns
+         if feature not in [TARGET, date_column]
+     ]
+     return {feature: psi for feature, psi in zip(df.columns, psi_values)}
+
+
+ def _split_intervals(
+     df: pd.DataFrame, date_column: str, n_intervals: int, logger: logging.Logger
+ ) -> tuple[pd.Series, list[pd.Series]]:
+     date_series = df[date_column]
+
+     # Check if we have enough unique values for the requested number of intervals
+     unique_values = date_series.nunique()
+
+     # If we have fewer unique values than requested intervals, adjust n_intervals
+     if unique_values < n_intervals:
+         logger.warning(f"Date column '{date_column}' has only {unique_values} unique values")
+
+     time_intervals = pd.qcut(date_series, q=n_intervals, duplicates="drop")
+     interval_labels = time_intervals.unique()
+     reference_mask = time_intervals == interval_labels[0]
+     current_masks = [time_intervals == label for label in interval_labels[1:]]
+     return reference_mask, current_masks
+
+
+ def _get_agg_func(agg: str):
+     np_agg = getattr(np, agg, None)
+     if np_agg is None and agg.startswith("q"):
+         q = int(agg[1:])
+         return lambda x: np.quantile(list(x), q / 100, method="higher")
+     return np_agg
+
+
+ def _psi(reference_percent: np.ndarray, current_percent: np.ndarray) -> float:
+     return np.sum((reference_percent - current_percent) * np.log(reference_percent / current_percent))
+
+
+ def _stability_agg(
+     current_data: list[pd.Series],
+     reference_data: pd.Series,
+     is_numerical: bool = True,
+     min_values_in_bin: int | None = None,
+     n_bins: int = 10,
+     cat_top_pct: float = 0.7,
+     agg_func: Callable = max,
+ ) -> float | None:
+     """Calculate the PSI
+     Args:
+         current_data: current data
+         reference_data: reference data
+         is_numerical: whether the feature is numerical
+         reference_ratio: ratio of current data to use as reference if reference_data is not provided
+         min_values_in_bin: minimum number of values in a bin to calculate PSI
+         n_bins: number of bins to use for numerical features
+     Returns:
+         psi_value: calculated PSI
+     """
+     reference, current = _get_binned_data(reference_data, current_data, is_numerical, n_bins, cat_top_pct)
+
+     if len(reference) == 0 or len(current) == 0:
+         return None
+
+     nonempty_current = [i for i, c in enumerate(current) if len(c) > 0]
+     current = [current[i] for i in nonempty_current]
+     current_data = [current_data[i] for i in nonempty_current]
+
+     if len(current) == 0:
+         return None
+
+     if min_values_in_bin is not None and (
+         np.array(reference).min() < min_values_in_bin or any(np.array(c).min() < min_values_in_bin for c in current)
+     ):
+         return None
+
+     reference = _fill_zeroes(reference / len(reference_data))
+     current = [_fill_zeroes(c / len(d)) for c, d in zip(current, current_data)]
+
+     psi_value = agg_func([_psi(reference, c) for c in current])
+
+     return float(psi_value)
+
+
+ def _get_binned_data(
+     reference_data: pd.Series,
+     current_data: list[pd.Series],
+     is_numerical: bool,
+     n_bins: int,
+     cat_top_pct: float,
+ ):
+     """Split variable into n buckets based on reference quantiles
+     Args:
+         reference_data: reference data
+         current_data: current data
+         feature_type: feature type
+         n: number of quantiles
+     Returns:
+         reference_counts: number of records in each bucket for reference
+         current_counts: number of records in each bucket for current
+     """
+     n_vals = reference_data.nunique()
+
+     if is_numerical and n_vals > 20:
+         bins = _get_bin_edges(reference_data, n_bins)
+         reference_counts = np.histogram(reference_data, bins)[0]
+         current_counts = [np.histogram(d, bins)[0] for d in current_data]
+
+     else:
+         keys = _get_unique_not_nan_values_list_from_series([reference_data] + current_data)
+         ref_feature_dict = {**dict.fromkeys(keys, 0), **dict(reference_data.value_counts())}
+         current_feature_dict = [{**dict.fromkeys(keys, 0), **dict(d.value_counts())} for d in current_data]
+         key_dict = more_itertools.map_reduce(
+             itertools.chain(ref_feature_dict.items(), *(d.items() for d in current_feature_dict)),
+             keyfunc=operator.itemgetter(0),
+             valuefunc=operator.itemgetter(1),
+             reducefunc=sum,
+         )
+         key_dict = pd.Series(key_dict)
+         keys = key_dict.index[key_dict.rank(pct=True) >= cat_top_pct]
+         reference_counts = np.array([ref_feature_dict[key] for key in keys])
+         current_counts = [np.array([current_feature_dict[i][key] for key in keys]) for i in range(len(current_data))]
+
+     reference_counts = np.append(reference_counts, reference_data.isna().sum())
+     current_counts = [np.append(d, current_data[i].isna().sum()) for i, d in enumerate(current_counts)]
+
+     return reference_counts, current_counts
+
+
+ def _fill_zeroes(percents: np.ndarray) -> np.ndarray:
+     eps = 0.0001
+     if (percents == 0).all():
+         np.place(percents, percents == 0, eps)
+     else:
+         min_value = min(percents[percents != 0])
+         if min_value <= eps:
+             np.place(percents, percents == 0, eps)
+         else:
+             np.place(percents, percents == 0, min_value / 10**6)
+     return percents
+
+
+ def _get_bin_edges(data: pd.Series, n_bins: int) -> np.ndarray:
+     bins = np.nanquantile(data, np.linspace(0, 1, n_bins + 1))
+     bins[0] = -np.inf
+     bins[-1] = np.inf
+     return bins
+
+
+ def _get_unique_not_nan_values_list_from_series(series: list[pd.Series]) -> list:
+     """Get unique values from current and reference series, drop NaNs"""
+     return list(reduce(set.union, (set(s.dropna().unique()) for s in series)))
@@ -1,55 +1,30 @@
- from dataclasses import dataclass, field
  import logging
  import numbers
+ from dataclasses import dataclass
  from typing import Callable, List, Optional
+
  import numpy as np
  import pandas as pd

- from upgini.metadata import SYSTEM_RECORD_ID, CVType, ModelTaskType
+ from upgini.metadata import (
+     EVAL_SET_INDEX,
+     SYSTEM_RECORD_ID,
+     TARGET,
+     CVType,
+     ModelTaskType,
+ )
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
+ from upgini.utils.config import (
+     TS_DEFAULT_HIGH_FREQ_TRUNC_LENGTHS,
+     TS_DEFAULT_LOW_FREQ_TRUNC_LENGTHS,
+     TS_DEFAULT_TIME_UNIT_THRESHOLD,
+     TS_MIN_DIFFERENT_IDS_RATIO,
+     SampleConfig,
+ )
  from upgini.utils.target_utils import balance_undersample
  from upgini.utils.ts_utils import get_most_frequent_time_unit, trunc_datetime


- TS_MIN_DIFFERENT_IDS_RATIO = 0.2
- TS_DEFAULT_HIGH_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=2, months=6), pd.DateOffset(years=2, days=7)]
- TS_DEFAULT_LOW_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=7), pd.DateOffset(years=5)]
- TS_DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)
- FIT_SAMPLE_ROWS_TS = 100_000
-
- BINARY_MIN_SAMPLE_THRESHOLD = 5_000
- MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
- BINARY_BOOTSTRAP_LOOPS = 5
- MULTICLASS_BOOTSTRAP_LOOPS = 2
-
- FIT_SAMPLE_THRESHOLD = 100_000
- FIT_SAMPLE_ROWS = 100_000
- FIT_SAMPLE_ROWS_WITH_EVAL_SET = 100_000
- FIT_SAMPLE_THRESHOLD_WITH_EVAL_SET = 100_000
-
-
- @dataclass
- class SampleConfig:
-     force_sample_size: int = 7000
-     ts_min_different_ids_ratio: float = TS_MIN_DIFFERENT_IDS_RATIO
-     ts_default_high_freq_trunc_lengths: List[pd.DateOffset] = field(
-         default_factory=TS_DEFAULT_HIGH_FREQ_TRUNC_LENGTHS.copy
-     )
-     ts_default_low_freq_trunc_lengths: List[pd.DateOffset] = field(
-         default_factory=TS_DEFAULT_LOW_FREQ_TRUNC_LENGTHS.copy
-     )
-     ts_default_time_unit_threshold: pd.Timedelta = TS_DEFAULT_TIME_UNIT_THRESHOLD
-     binary_min_sample_threshold: int = BINARY_MIN_SAMPLE_THRESHOLD
-     multiclass_min_sample_threshold: int = MULTICLASS_MIN_SAMPLE_THRESHOLD
-     binary_bootstrap_loops: int = BINARY_BOOTSTRAP_LOOPS
-     multiclass_bootstrap_loops: int = MULTICLASS_BOOTSTRAP_LOOPS
-     fit_sample_threshold: int = FIT_SAMPLE_THRESHOLD
-     fit_sample_rows: int = FIT_SAMPLE_ROWS
-     fit_sample_rows_with_eval_set: int = FIT_SAMPLE_ROWS_WITH_EVAL_SET
-     fit_sample_threshold_with_eval_set: int = FIT_SAMPLE_THRESHOLD_WITH_EVAL_SET
-     fit_sample_rows_ts: int = FIT_SAMPLE_ROWS_TS
-
-
  @dataclass
  class SampleColumns:
      date: str
@@ -117,6 +92,22 @@ def sample(
              **kwargs,
          )

+     # separate OOT
+     oot_dfs = []
+     other_dfs = []
+     if EVAL_SET_INDEX in df.columns:
+         for eval_set_index in df[EVAL_SET_INDEX].unique():
+             eval_df = df[df[EVAL_SET_INDEX] == eval_set_index]
+             if TARGET in eval_df.columns and eval_df[TARGET].isna().all():
+                 oot_dfs.append(eval_df)
+             else:
+                 other_dfs.append(eval_df)
+     if len(oot_dfs) > 0:
+         oot_df = pd.concat(oot_dfs, ignore_index=False)
+         df = pd.concat(other_dfs, ignore_index=False)
+     else:
+         oot_df = None
+
      num_samples = _num_samples(df)
      if num_samples > fit_sample_threshold:
          logger.info(
@@ -126,6 +117,18 @@ def sample(
          df = df.sample(n=fit_sample_rows, random_state=random_state)
          logger.info(f"Shape after threshold resampling: {df.shape}")

+     if oot_df is not None:
+         num_samples_oot = _num_samples(oot_df)
+         if num_samples_oot > fit_sample_threshold:
+             logger.info(
+                 f"OOT has size {num_samples_oot} more than threshold {fit_sample_threshold} "
+                 f"and will be downsampled to {fit_sample_rows}"
+             )
+             oot_df = oot_df.sample(n=fit_sample_rows, random_state=random_state)
+         df = pd.concat([df, oot_df], ignore_index=False)
+
+     logger.info(f"Dataset size after downsampling: {len(df)}")
+
      return df

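The OOT handling added above keys off the target column: any eval_set part whose target is entirely NaN is treated as out-of-time, split off before train downsampling, downsampled separately, and concatenated back. A small standalone illustration of just the detection rule, using hypothetical column names rather than the library's `sample` function:

```python
import numpy as np
import pandas as pd

EVAL_SET_INDEX, TARGET = "eval_set_index", "target"  # hypothetical names for this sketch

df = pd.DataFrame({
    EVAL_SET_INDEX: [0, 0, 1, 1, 2, 2],
    TARGET: [1, 0, 1, 0, np.nan, np.nan],  # eval part 2 carries no labels -> OOT
})

oot_parts, other_parts = [], []
for idx in df[EVAL_SET_INDEX].unique():
    part = df[df[EVAL_SET_INDEX] == idx]
    # A part is OOT when every target value in it is missing.
    (oot_parts if part[TARGET].isna().all() else other_parts).append(part)

print([int(p[EVAL_SET_INDEX].iloc[0]) for p in oot_parts])  # [2]
```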
@@ -175,7 +178,7 @@ def sample_time_series_train_eval(
        )
        if logger is not None:
            logger.info(f"Eval set size: {len(eval_df)}")
-         df = pd.concat([train_df, eval_df])
+         df = pd.concat([train_df, eval_df], ignore_index=False)

    elif len(train_df) > max_rows:
        df = sample_time_series_trunc(
@@ -6,9 +6,14 @@ import pandas as pd
  from pandas.api.types import is_bool_dtype, is_datetime64_any_dtype, is_numeric_dtype

  from upgini.errors import ValidationError
- from upgini.metadata import SYSTEM_RECORD_ID, ModelTaskType
- from upgini.resource_bundle import ResourceBundle, get_custom_bundle, bundle
+ from upgini.metadata import EVAL_SET_INDEX, SYSTEM_RECORD_ID, TARGET, ModelTaskType
+ from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
  from upgini.sampler.random_under_sampler import RandomUnderSampler
+ from upgini.utils.config import SampleConfig
+
+ MAX_MULTICLASS_CLASS_COUNT = 100
+ MIN_TARGET_CLASS_ROWS = 100
+ IMBALANCE_THESHOLD = 0.6


  def prepare_target(y: Union[pd.Series, np.ndarray], target_type: ModelTaskType) -> Union[pd.Series, np.ndarray]:
@@ -106,6 +111,47 @@ def define_task(
      return task


+ def is_imbalanced(
+     data: pd.DataFrame,
+     task_type: ModelTaskType,
+     sample_config: SampleConfig,
+     bundle: ResourceBundle,
+ ) -> bool:
+     if task_type is None or not task_type.is_classification():
+         return False
+
+     data = data.drop_duplicates(keep="first")
+     columns_without_target = [col for col in data.columns if col != TARGET]
+     data = data.drop_duplicates(subset=columns_without_target, keep=False)
+
+     if task_type == ModelTaskType.BINARY and len(data) <= sample_config.binary_min_sample_threshold:
+         return False
+
+     count = len(data)
+     target = data[TARGET]
+     target_classes_count = target.nunique()
+
+     if target_classes_count > MAX_MULTICLASS_CLASS_COUNT:
+         msg = bundle.get("dataset_to_many_multiclass_targets").format(target_classes_count, MAX_MULTICLASS_CLASS_COUNT)
+         raise ValidationError(msg)
+
+     vc = target.value_counts()
+     min_class_value = vc.index[len(vc) - 1]
+     min_class_count = vc[min_class_value]
+
+     if min_class_count < MIN_TARGET_CLASS_ROWS:
+         msg = bundle.get("dataset_rarest_class_less_min").format(
+             min_class_value, min_class_count, MIN_TARGET_CLASS_ROWS
+         )
+         raise ValidationError(msg)
+
+     min_class_percent = IMBALANCE_THESHOLD / target_classes_count
+     min_class_threshold = min_class_percent * count
+
+     # If min class count less than 30% for binary or (60 / classes_count)% for multiclass
+     return bool(min_class_count < min_class_threshold)
+
+
  def is_int_encoding(unique_values):
      return set(unique_values) == set(range(len(unique_values))) or set(unique_values) == set(
          range(1, len(unique_values) + 1)
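To make the new imbalance rule concrete: with `IMBALANCE_THESHOLD = 0.6`, `is_imbalanced` flags a classification dataset when its rarest class holds less than 0.6 / n_classes of the (deduplicated) rows — 30% for binary, 20% for three classes, and so on. A small illustrative check of just that final rule, not the library function itself (which also validates class counts and minimum sample size):

```python
def looks_imbalanced(class_counts: list[int], imbalance_threshold: float = 0.6) -> bool:
    # The rarest class must hold at least imbalance_threshold / n_classes of all rows.
    total = sum(class_counts)
    min_required_share = imbalance_threshold / len(class_counts)
    return min(class_counts) < min_required_share * total

print(looks_imbalanced([700, 300]))  # False: rarest class holds 30%, exactly the binary cut-off
print(looks_imbalanced([900, 100]))  # True: rarest class holds 10% < 30%
```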
@@ -132,6 +178,11 @@ def balance_undersample(
      if SYSTEM_RECORD_ID not in df.columns:
          raise Exception("System record id must be presented for undersampling")

+     # Rebalance and send to server only train data
+     # because eval set data will be sent separately in transform for metrics
+     if EVAL_SET_INDEX in df.columns:
+         df = df[df[EVAL_SET_INDEX] == 0]
+
      target = df[target_column].copy()

      vc = target.value_counts()
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: upgini
- Version: 1.2.113a3974.dev2
+ Version: 1.2.114
  Summary: Intelligent data search & enrichment for Machine Learning
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
  Project-URL: Homepage, https://upgini.com/
@@ -38,7 +38,7 @@ Requires-Dist: python-bidi==0.4.2
  Requires-Dist: python-dateutil>=2.8.0
  Requires-Dist: python-json-logger>=3.3.0
  Requires-Dist: requests>=2.8.0
- Requires-Dist: scikit-learn>=1.3.0
+ Requires-Dist: scikit-learn<1.8.0,>=1.3.0
  Requires-Dist: scipy>=1.10.0
  Requires-Dist: shap>=0.44.0
  Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
@@ -270,9 +270,9 @@ from upgini.metadata import SearchKey
  enricher = FeaturesEnricher(
  search_keys={
  "subscription_activation_date": SearchKey.DATE,
- "country": SearchKey.COUNTRY,
- "zip_code": SearchKey.POSTAL_CODE,
- "hashed_email": SearchKey.HEM,
+ "country": SearchKey.COUNTRY,
+ "zip_code": SearchKey.POSTAL_CODE,
+ "hashed_email": SearchKey.HEM,
  "last_visit_ip_address": SearchKey.IP,
  "registered_with_phone": SearchKey.PHONE
  })
@@ -358,9 +358,9 @@ from upgini.metadata import SearchKey
  enricher = FeaturesEnricher(
  search_keys={
  "subscription_activation_date": SearchKey.DATE,
- "country": SearchKey.COUNTRY,
- "zip_code": SearchKey.POSTAL_CODE,
- "hashed_email": SearchKey.HEM,
+ "country": SearchKey.COUNTRY,
+ "zip_code": SearchKey.POSTAL_CODE,
+ "hashed_email": SearchKey.HEM,
  "last_visit_ip_address": SearchKey.IP,
  "registered_with_phone": SearchKey.PHONE
  },
@@ -381,7 +381,7 @@ from upgini.metadata import SearchKey
  enricher = FeaturesEnricher(
  search_keys={
  "subscription_activation_date": SearchKey.DATE,
- "zip_code": SearchKey.POSTAL_CODE,
+ "zip_code": SearchKey.POSTAL_CODE,
  },
  country_code = "US",
  date_format = "%Y-%d-%m"
@@ -409,8 +409,8 @@ y = train_df["churn_flag"]
  enricher = FeaturesEnricher(
  search_keys={
  "subscription_activation_date": SearchKey.DATE,
- "country": SearchKey.COUNTRY,
- "zip_code": SearchKey.POSTAL_CODE
+ "country": SearchKey.COUNTRY,
+ "zip_code": SearchKey.POSTAL_CODE
  })

  # everything is ready to fit! For 200k records fitting should take around 10 minutes,
@@ -464,8 +464,8 @@ And then, for `transform` in a production ML pipeline, you'll get enrichment wit
  enricher = FeaturesEnricher(
  search_keys={
  "subscription_activation_date": SearchKey.DATE,
- "country": SearchKey.COUNTRY,
- "zip_code": SearchKey.POSTAL_CODE,
+ "country": SearchKey.COUNTRY,
+ "zip_code": SearchKey.POSTAL_CODE,
  },
  )
  ```
@@ -516,8 +516,8 @@ enricher = FeaturesEnricher(
  If you're working with multivariate time series, you should specify id columns of individual univariate series in `FeaturesEnricher`. For example, if you have a dataset predicting sales for different stores and products, you should specify store and product id columns as follows:
  ```python
  enricher = FeaturesEnricher(
- search_keys={
- "sales_date": SearchKey.DATE,
+ search_keys={
+ "sales_date": SearchKey.DATE,
  },
  id_columns=["store_id", "product_id"],
  cv=CVType.time_series
@@ -733,9 +733,52 @@ enricher.fit(
  )
  ```
  #### ⚠️ Requirements for out-of-time dataset
- - Same data schema as for search initialization dataset
+ - Same data schema as for search initialization X dataset
  - Pandas dataframe representation

+ There are 3 options to pass out-of-time without labels:
+ ```python
+ enricher.fit(
+ train_ids_and_features,
+ train_label,
+ eval_set = [
+ (eval_ids_and_features_1,), # Just a tuple of 1 element
+ (eval_ids_and_features_2, None), # None as labels
+ (eval_ids_and_features_3, [np.nan] * len(eval_ids_and_features_3)), # List or Series of the same size as eval X
+ ]
+ )
+ ```
+
+ ### Control feature stability with PSI parameters
+
+ `FeaturesEnricher` supports Population Stability Index (PSI) calculation on eval_set to evaluate feature stability over time. You can control this behavior using stability parameters in the `fit` and `fit_transform` methods:
+
+ ```python
+ enricher = FeaturesEnricher(
+ search_keys={"registration_date": SearchKey.DATE}
+ )
+
+ # Control feature stability during fit
+ enricher.fit(
+ X, y,
+ stability_threshold=0.2, # PSI threshold: features with PSI above this value will be dropped
+ stability_agg_func="max" # Aggregation function for stability values: "max", "min", "mean"
+ )
+
+ # Same parameters work for fit_transform
+ enriched_df = enricher.fit_transform(
+ X, y,
+ stability_threshold=0.1, # Stricter threshold for more stable features
+ stability_agg_func="mean" # Use mean aggregation instead of max
+ )
+ ```
+
+ **Stability parameters:**
+ - `stability_threshold` (float, default=0.2): PSI threshold value. Features with PSI above this threshold will be excluded from the final feature set. Lower values mean stricter stability requirements.
+ - `stability_agg_func` (str, default="max"): Function to aggregate PSI values across time intervals. Options: "max" (most conservative), "min" (least conservative), "mean" (balanced approach).
+
+ **PSI (Population Stability Index)** measures how much a feature's distribution changes over time. Lower PSI values indicate more stable features, which are generally more reliable for production ML models.
+
  ### Use custom loss function in feature selection & metrics calculation

  `FeaturesEnricher` can be initialized with additional string parameter `loss`.
@@ -756,20 +799,6 @@ enricher = FeaturesEnricher(
  enriched_dataframe.fit(X, y)
  ```

- ### Return initial dataframe enriched with TOP external features by importance
-
- `transform` and `fit_transform` methods of `FeaturesEnricher` can be used with two additional parameters:
- - `importance_threshold`: float = 0 - only features with *importance >= threshold* will be added to the output dataframe
- - `max_features`: int - only first TOP N features by importance will be returned, where *N = max_features*
-
- And `keep_input=True` will keep all initial columns from search dataset X:
- ```python
- enricher = FeaturesEnricher(
- search_keys={"subscription_activation_date": SearchKey.DATE}
- )
- enriched_dataframe.fit_transform(X, y, keep_input=True, max_features=2)
- ```
-

  ### Exclude premium data sources from fit, transform and metrics calculation

@@ -797,7 +826,7 @@ enricher = FeaturesEnricher(
  enricher.fit(X, y)
  ```

- ## Turn off removing of target outliers
+ ### Turn off removing of target outliers
  Upgini detects rows with target outliers for regression tasks. By default such rows are dropped on metrics calculation. To turn off removing of target outlier rows, use the parameter `remove_outliers_calc_metrics=False` in fit, fit_transform or calculate_metrics methods:

  ```python
@@ -808,7 +837,7 @@ enricher = FeaturesEnricher(
  enricher.fit(X, y, remove_outliers_calc_metrics=False)
  ```

- ## Turn off generating features on search keys
+ ### Turn off generating features on search keys
  Upgini tries to generate features on email, date and datetime search keys. By default this generation is enabled. To disable it, use the parameter `generate_search_key_features` of the FeaturesEnricher constructor:

  ```python
@@ -816,6 +845,7 @@ enricher = FeaturesEnricher(
  search_keys={"date": SearchKey.DATE},
  generate_search_key_features=False,
  )
+ ```

  ## 🔑 Open up all capabilities of Upgini