upgini 1.2.113a3974.dev2__py3-none-any.whl → 1.2.114a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/metadata.py CHANGED
@@ -285,6 +285,7 @@ class FeaturesMetadataV2(BaseModel):
     doc_link: Optional[str] = None
     update_frequency: Optional[str] = None
     from_online_api: Optional[bool] = None
+    psi_value: Optional[float] = None
 
 
 class HitRateMetrics(BaseModel):
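The new `psi_value` field is Optional with a `None` default, so feature metadata produced by older backends still validates and the field simply stays unset. A minimal sketch of that behaviour (model trimmed to the fields visible in this hunk; the example payload is made up):

```python
from typing import Optional

from pydantic import BaseModel


class FeaturesMetadataV2(BaseModel):
    # trimmed to the fields shown in the hunk above
    doc_link: Optional[str] = None
    update_frequency: Optional[str] = None
    from_online_api: Optional[bool] = None
    psi_value: Optional[float] = None  # new in this release


# A payload without psi_value still parses; the field just stays None.
meta = FeaturesMetadataV2(doc_link="https://example.com/feature_doc")
assert meta.psi_value is None
```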
upgini/metrics.py CHANGED
@@ -816,7 +816,8 @@ class CatBoostWrapper(EstimatorWrapper):
             else:
                 encoded = cat_encoder.transform(x[self.cat_features])
                 cat_features = encoded.columns.to_list()
-                x.loc[:, self.cat_features] = encoded
+                x.drop(columns=encoded.columns, inplace=True, errors="ignore")
+                x[encoded.columns] = encoded
         else:
             cat_features = self.cat_features
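A plausible reason for replacing the `.loc` assignment with drop-and-reassign: `.loc[:, self.cat_features] = encoded` writes values into the existing columns (keeping their old dtype, and assuming the encoder returns exactly the same column set), whereas dropping the columns and re-adding them takes the encoder's output as-is. A toy illustration of the dtype difference, independent of Upgini's encoder:

```python
import pandas as pd

x = pd.DataFrame({"color": ["red", "blue", "red"]})         # object dtype
encoded = pd.DataFrame({"color": [0, 1, 0]}, dtype="int8")   # encoder output

a = x.copy()
a.loc[:, ["color"]] = encoded
print(a.dtypes)  # color stays object: the integer codes were written into the old column

b = x.copy()
b.drop(columns=encoded.columns, inplace=True, errors="ignore")
b[encoded.columns] = encoded
print(b.dtypes)  # color now carries the encoder's integer codes
```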
 
@@ -1175,7 +1176,10 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutp
     >>> mean_squared_log_error(y_true, y_pred, multioutput=[0.3, 0.7])
     0.060...
     """
-    _, y_true, y_pred, multioutput = _check_reg_targets(y_true, y_pred, multioutput)
+    try:
+        _, y_true, y_pred, multioutput = _check_reg_targets(y_true, y_pred, multioutput)
+    except TypeError:
+        _, y_true, y_pred, sample_weight, multioutput = _check_reg_targets(y_true, y_pred, sample_weight, multioutput)
    check_consistent_length(y_true, y_pred, sample_weight)
 
    if (y_true < 0).any():
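`_check_reg_targets` is private scikit-learn API and its positional signature differs between releases; the fallback branch handles the variant that also takes and returns `sample_weight`. An alternative sketch that picks the call form once via introspection instead of catching `TypeError` (the helper name `check_reg_targets_compat` is made up):

```python
import inspect

from sklearn.metrics._regression import _check_reg_targets


def check_reg_targets_compat(y_true, y_pred, sample_weight, multioutput):
    """Call the private helper with whichever signature this sklearn build exposes."""
    params = inspect.signature(_check_reg_targets).parameters
    if "sample_weight" in params:  # newer variant: sample_weight passes through
        _, y_true, y_pred, sample_weight, multioutput = _check_reg_targets(
            y_true, y_pred, sample_weight, multioutput
        )
    else:  # older variant
        _, y_true, y_pred, multioutput = _check_reg_targets(y_true, y_pred, multioutput)
    return y_true, y_pred, sample_weight, multioutput
```

Either way this leans on a private module path, so it can break again on a future scikit-learn release.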
@@ -123,7 +123,7 @@ train_unstable_target=Your training sample contains an unstable target event, PS
 eval_unstable_target=Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
 # eval set validation
 unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
-eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
+eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y or X only
 unsupported_x_type_eval_set=Unsupported type of X in eval_set: {}. Use pandas.DataFrame, pandas.Series or numpy.ndarray or list.
 eval_x_and_x_diff_shape=The column set in eval_set are differ from the column set in X
 unsupported_y_type_eval_set=Unsupported type of y in eval_set: {}. Use pandas.Series, numpy.ndarray or list
@@ -139,6 +139,8 @@ eval_x_is_empty=X in eval_set is empty.
 eval_y_is_empty=y in eval_set is empty.
 x_and_eval_x_diff_types=X and eval_set X has different types: {} and {}
 eval_x_has_train_samples=Eval set X has rows that are present in train set X
+oot_without_date_not_supported=Eval set {} provided as OOT but date column is missing. It will be ignored for stability check
+oot_with_online_sources_not_supported=Eval set {} provided as OOT and also provided columns for online API. It will be ignored for stability check
 
 baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
 baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and and retry
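Read together with the two OOT messages added here, the relaxed eval_set_invalid_tuple_size wording implies an eval_set entry may now be a one-element tuple carrying X only, treated as an out-of-time (OOT) slice used for the stability check rather than for metrics. A hypothetical usage sketch: the `FeaturesEnricher`/`SearchKey` call shape follows Upgini's documented `fit(X, y, eval_set=...)` API, but the one-element tuple form is inferred from these messages, and the data below is placeholder only:

```python
import pandas as pd

from upgini import FeaturesEnricher, SearchKey

X_train = pd.DataFrame({"rep_date": pd.date_range("2024-01-01", periods=6, freq="MS")})
y_train = pd.Series([0, 1, 0, 1, 0, 1])
X_val, y_val = X_train.copy(), y_train.copy()
X_oot = pd.DataFrame({"rep_date": pd.date_range("2024-07-01", periods=3, freq="MS")})

enricher = FeaturesEnricher(search_keys={"rep_date": SearchKey.DATE})
enricher.fit(
    X_train,
    y_train,
    eval_set=[
        (X_val, y_val),  # labelled eval pair: X and y
        (X_oot,),        # X only: out-of-time slice for the stability check
    ],
)
```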
@@ -163,6 +165,7 @@ dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample
 dataset_empty_column_names=Some column names are empty. Add names please
 dataset_full_duplicates={:.5f}% of the rows are fully duplicated
 dataset_diff_target_duplicates={:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nSample of incorrect row indexes: {}
+dataset_diff_target_duplicates_oot={:.4f}% of rows ({}) in OOT eval_set are duplicates with train or another eval_set. These rows will be deleted from OOT\nSample of incorrect row indexes: {}
 dataset_train_diff_target_duplicates_fintech={:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
 dataset_eval_diff_target_duplicates_fintech={:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
 dataset_drop_old_dates=We don't have data before '2000-01-01' and removed all earlier records from the search dataset
@@ -183,6 +186,7 @@ dataset_invalid_column_type=Unsupported data type of column {}: {}
 dataset_invalid_filter=Unknown field in filter_features. Should be {'min_importance', 'max_psi', 'max_count', 'selected_features'}.
 dataset_too_big_file=Too big size of dataframe X for processing. Please reduce number of rows or columns
 dataset_transform_diff_fit=You try to enrich dataset that column names are different from the train dataset column names that you used on the fit stage. Please make the column names the same as in the train dataset and restart.
+oot_eval_set_too_small_after_dedup=OOT eval set {} has less than 1000 rows after deduplication. It will be ignored for stability check
 binary_small_dataset=The least populated class in Target contains less than 1000 rows.\nSmall numbers of observations may negatively affect the number of selected features and quality of your ML model.\nUpgini recommends you increase the number of observations in the least populated class.\n
 all_search_keys_invalid=All search keys are invalid
 all_emails_invalid=All values in column {} are invalid emails # Metrics validation
@@ -255,6 +259,7 @@ features_info_provider=Provider
 features_info_source=Source
 features_info_name=Feature name
 features_info_shap=SHAP value
+features_info_psi=PSI value
 features_info_hitrate=Coverage %
 features_info_type=Type
 # Deprecated
upgini/sampler/base.py CHANGED
@@ -1,6 +1,7 @@
 """
 Base class for the under-sampling method.
 """
+
 # Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
 # License: MIT
 
@@ -12,6 +13,7 @@ import numpy as np
 from sklearn.base import BaseEstimator
 from sklearn.preprocessing import label_binarize
 from sklearn.utils.multiclass import check_classification_targets
+from sklearn.utils.validation import check_X_y
 
 from .utils import ArraysTransformer, check_sampling_strategy, check_target_type
 
@@ -125,7 +127,7 @@ class BaseSampler(SamplerMixin):
         if accept_sparse is None:
             accept_sparse = ["csr", "csc"]
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
-        X, y = self._validate_data(X, y, reset=True, accept_sparse=accept_sparse)
+        X, y = check_X_y(X, y, accept_sparse=accept_sparse, dtype=None, ensure_all_finite=False)
         return X, y, binarize_y
 
     def _more_tags(self):
@@ -80,14 +80,24 @@ RandomUnderSampler # doctest: +NORMALIZE_WHITESPACE
 
     def _check_X_y(self, X, y):
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
-        X, y = self._validate_data(
-            X,
-            y,
-            reset=True,
-            accept_sparse=["csr", "csc"],
-            dtype=None,
-            force_all_finite=False,
-        )
+        try:
+            X, y = self._validate_data(
+                X,
+                y,
+                reset=True,
+                accept_sparse=["csr", "csc"],
+                dtype=None,
+                force_all_finite=False,
+            )
+        except AttributeError:
+            from sklearn.utils.validation import check_X_y
+            X, y = check_X_y(
+                X,
+                y,
+                accept_sparse=["csr", "csc"],
+                dtype=None,
+                ensure_all_finite=False,
+            )
         return X, y, binarize_y
 
     def _fit_resample(self, X, y):
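Both sampler changes track scikit-learn version drift: the private `BaseEstimator._validate_data` method is no longer available in recent releases (hence the `AttributeError` fallback), and the public `check_X_y` helper renamed `force_all_finite` to `ensure_all_finite`. A version-tolerant sketch using only the public helper; detecting the parameter via introspection is an assumption of this sketch, not what the package does:

```python
import inspect

from sklearn.utils.validation import check_X_y


def validate_xy_compat(X, y):
    """Validate X/y, allowing sparse input and non-finite values, across
    scikit-learn releases that renamed force_all_finite to ensure_all_finite."""
    kwargs = {"accept_sparse": ["csr", "csc"], "dtype": None}
    finite_arg = (
        "ensure_all_finite"
        if "ensure_all_finite" in inspect.signature(check_X_y).parameters
        else "force_all_finite"
    )
    kwargs[finite_arg] = False  # let NaN/inf values pass through to the sampler
    return check_X_y(X, y, **kwargs)
```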
@@ -134,8 +134,13 @@ def remove_fintech_duplicates(
     logger.info(f"Train dataset shape after clean fintech duplicates: {train_df.shape}")
 
     # Process each eval_set part separately
+    oot_eval_dfs = []
     new_eval_dfs = []
     for i, eval_df in enumerate(eval_dfs, 1):
+        # Skip OOT
+        if eval_df[TARGET].isna().all():
+            oot_eval_dfs.append(eval_df)
+            continue
         logger.info(f"Eval {i} dataset shape before clean fintech duplicates: {eval_df.shape}")
         cleaned_eval_df, eval_warning = process_df(eval_df, i)
         if eval_warning:
@@ -145,8 +150,8 @@ def remove_fintech_duplicates(
 
     # Combine the processed train and eval parts back into one dataset
     logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
-    if new_eval_dfs:
-        df = pd.concat([train_df] + new_eval_dfs)
+    if new_eval_dfs or oot_eval_dfs:
+        df = pd.concat([train_df] + new_eval_dfs + oot_eval_dfs, ignore_index=False)
     else:
         df = train_df
     logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
@@ -190,16 +195,59 @@ def clean_full_duplicates(
     msg = None
     if TARGET in df.columns:
         unique_columns.remove(TARGET)
-        marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
+
+        # Separate rows to exclude from deduplication:
+        # for each eval_set_index != 0 check separately, all TARGET values are NaN
+        df_for_dedup = df
+        oot_df = None
+
+        if EVAL_SET_INDEX in df.columns:
+            oot_eval_dfs = []
+            other_dfs = []
+            for eval_idx in df[EVAL_SET_INDEX].unique():
+                eval_subset = df[df[EVAL_SET_INDEX] == eval_idx]
+                # Check that all TARGET values for this specific eval_set_index are NaN
+                if eval_idx != 0 and eval_subset[TARGET].isna().all():
+                    oot_eval_dfs.append(eval_subset)
+                    logger.info(
+                        f"Excluded {len(eval_subset)} rows from deduplication "
+                        f"(eval_set_index={eval_idx} and all TARGET values are NaN)"
+                    )
+                else:
+                    other_dfs.append(eval_subset)
+
+            if oot_eval_dfs:
+                oot_df = pd.concat(oot_eval_dfs, ignore_index=False)
+                df_for_dedup = pd.concat(other_dfs, ignore_index=False)
+            else:
+                df_for_dedup = df
+
+        marked_duplicates = df_for_dedup.duplicated(subset=unique_columns, keep=False)
         if marked_duplicates.sum() > 0:
-            dups_indices = df[marked_duplicates].index.to_list()[:100]
-            nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
-            num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
-            share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
+            dups_indices = df_for_dedup[marked_duplicates].index.to_list()[:100]
+            nrows_after_tgt_dedup = len(df_for_dedup.drop_duplicates(subset=unique_columns, keep=False))
+            num_dup_rows = len(df_for_dedup) - nrows_after_tgt_dedup
+            share_tgt_dedup = 100 * num_dup_rows / len(df_for_dedup)
 
             msg = bundle.get("dataset_diff_target_duplicates").format(share_tgt_dedup, num_dup_rows, dups_indices)
-            df = df.drop_duplicates(subset=unique_columns, keep=False)
-            logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")
+            df_for_dedup = df_for_dedup.drop_duplicates(subset=unique_columns, keep=False)
+            logger.info(f"Dataset shape after clean invalid target duplicates: {df_for_dedup.shape}")
+        # Combine back excluded rows
+        if oot_df is not None:
+            df = pd.concat([df_for_dedup, oot_df], ignore_index=False)
+            marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
+            if marked_duplicates.sum() > 0:
+                dups_indices = df[marked_duplicates].index.to_list()[:100]
+                nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
+                num_dup_rows = len(df) - nrows_after_tgt_dedup
+                share_tgt_dedup = 100 * num_dup_rows / len(df)
+                msg = bundle.get("dataset_diff_target_duplicates_oot").format(
+                    share_tgt_dedup, num_dup_rows, dups_indices
+                )
+                df = df.drop_duplicates(subset=unique_columns, keep="first")
+            logger.info(f"Final dataset shape after adding back excluded rows: {df.shape}")
+        else:
+            df = df_for_dedup
 
     return df, msg
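Net effect of the rework above: labelled rows that share feature values but disagree on the target are still dropped entirely (`keep=False`), while OOT parts (eval slices whose TARGET is all NaN, selected by eval_set_index in the real code) sit out that pass and are only deduplicated against the recombined frame with `keep="first"`, so an OOT duplicate is removed instead of erasing the labelled row. A toy illustration of the two `keep` modes in plain pandas (the NaN filter below stands in for the eval_set_index split):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "feature": [1, 1, 2, 2],
        "target": [0, 1, 1, np.nan],  # rows 0/1 conflict; the last row is an unlabelled OOT row
    }
)

# Labelled pass: keep=False drops every member of a conflicting group (rows 0 and 1).
labelled = df[df["target"].notna()]
deduped = labelled.drop_duplicates(subset=["feature"], keep=False)
print(deduped)  # only feature=2 / target=1 survives

# OOT pass: keep="first" keeps the labelled representative and drops the OOT duplicate.
combined = pd.concat([deduped, df[df["target"].isna()]])
print(combined.drop_duplicates(subset=["feature"], keep="first"))
```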
 
@@ -27,6 +27,7 @@ class FeatureInfo:
     doc_link: str
     data_provider_link: str
     data_source_link: str
+    psi_value: Optional[float] = None
 
     @staticmethod
     def from_metadata(
@@ -47,12 +48,14 @@ class FeatureInfo:
             doc_link=feature_meta.doc_link,
             data_provider_link=feature_meta.data_provider_link,
             data_source_link=feature_meta.data_source_link,
+            psi_value=feature_meta.psi_value,
         )
 
     def to_row(self, bundle: ResourceBundle) -> Dict[str, str]:
         return {
             bundle.get("features_info_name"): self.name,
             bundle.get("features_info_shap"): self.rounded_shap,
+            bundle.get("features_info_psi"): self.psi_value,
             bundle.get("features_info_hitrate"): self.hitrate,
             bundle.get("features_info_value_preview"): self.value_preview,
             bundle.get("features_info_provider"): self.provider,
@@ -64,6 +67,7 @@ class FeatureInfo:
         return {
             bundle.get("features_info_name"): self.internal_name,
             bundle.get("features_info_shap"): self.rounded_shap,
+            bundle.get("features_info_psi"): self.psi_value,
             bundle.get("features_info_hitrate"): self.hitrate,
             bundle.get("features_info_value_preview"): self.value_preview,
             bundle.get("features_info_provider"): self.internal_provider,
@@ -76,6 +80,7 @@ class FeatureInfo:
             bundle.get("features_info_name"): self.internal_name,
             "feature_link": self.doc_link,
             bundle.get("features_info_shap"): self.rounded_shap,
+            bundle.get("features_info_psi"): self.psi_value,
             bundle.get("features_info_hitrate"): self.hitrate,
             bundle.get("features_info_value_preview"): self.value_preview,
             bundle.get("features_info_provider"): self.internal_provider,
upgini/utils/psi.py ADDED
@@ -0,0 +1,294 @@
+import itertools
+import logging
+import operator
+from functools import reduce
+from typing import Callable, Dict, Optional
+
+import more_itertools
+import numpy as np
+import pandas as pd
+from pandas.api.types import is_numeric_dtype
+from pydantic import BaseModel
+
+from upgini.metadata import TARGET, ModelTaskType
+
+
+class StabilityParams(BaseModel):
+    threshold: float = 999
+    n_intervals: int = 12
+    min_intervals: int = 10
+    max_intervals: Optional[int] = None
+    min_values_in_interval: Optional[int] = None
+    n_bins: int = 10
+    min_values_in_bin: Optional[int] = None
+    cat_top_pct: float = 0.7
+    agg: str = "max"
+
+
+DEFAULT_TARGET_PARAMS = StabilityParams(
+    n_intervals=12,
+    min_intervals=10,
+    max_intervals=None,
+    min_values_in_interval=None,
+    n_bins=5,
+)
+
+DEFAULT_FEATURES_PARAMS = StabilityParams(
+    n_intervals=12,
+    min_intervals=10,
+    max_intervals=None,
+    min_values_in_interval=None,
+    n_bins=10,
+)
+
+
+def calculate_sparsity_psi(
+    df: pd.DataFrame,
+    cat_features: list[str],
+    date_column: str,
+    logger: logging.Logger,
+    model_task_type: ModelTaskType,
+    psi_features_params: StabilityParams = DEFAULT_FEATURES_PARAMS,
+    psi_target_params: StabilityParams = DEFAULT_TARGET_PARAMS,
+) -> Dict[str, float]:
+    sparse_features = df.columns[df.isna().sum() > 0].to_list()
+    if len(sparse_features) > 0:
+        logger.info(f"Calculating sparsity stability for {len(sparse_features)} sparse features")
+        sparse_df = df[sparse_features].notna()
+        sparse_df[date_column] = df[date_column]
+        return calculate_features_psi(
+            sparse_df,
+            cat_features,
+            date_column,
+            logger,
+            model_task_type,
+            psi_target_params,
+            psi_features_params,
+        )
+    return {}
+
+
+def calculate_features_psi(
+    df: pd.DataFrame,
+    cat_features: list[str],
+    date_column: str,
+    logger: logging.Logger,
+    model_task_type: ModelTaskType,
+    psi_features_params: StabilityParams = DEFAULT_FEATURES_PARAMS,
+    psi_target_params: StabilityParams = DEFAULT_TARGET_PARAMS,
+) -> Dict[str, float]:
+    empty_res = pd.Series(index=df.columns, data=0)
+
+    if not is_numeric_dtype(df[date_column]):
+        df[date_column] = pd.to_datetime(df[date_column]).dt.floor("D").astype(np.int64) / 10**6
+
+    n_months = pd.to_datetime(df[date_column], unit="ms").dt.month.nunique()
+
+    if TARGET in df.columns:
+        psi_target_params.n_intervals = min(
+            psi_target_params.max_intervals or np.inf, max(psi_target_params.min_intervals, n_months)
+        )
+        logger.info(f"Setting {psi_target_params.n_intervals} intervals for target PSI check")
+
+        logger.info(f"Calculating target PSI for {psi_target_params.n_intervals} intervals")
+        reference_mask, current_masks = _split_intervals(df, date_column, psi_target_params.n_intervals, logger)
+
+        if psi_target_params.min_values_in_interval is not None and any(
+            len(mask) < psi_target_params.min_values_in_interval
+            for mask in itertools.chain(current_masks, [reference_mask])
+        ):
+            logger.info(
+                f"Some intervals have less than {psi_target_params.min_values_in_interval} values. Skip PSI check"
+            )
+            return empty_res
+
+        target_agg_func = _get_agg_func(psi_target_params.agg)
+        logger.info(f"Calculating target PSI with agg function {target_agg_func}")
+        target_psi = _stability_agg(
+            [df[TARGET][cur] for cur in current_masks],
+            reference_data=df[TARGET][reference_mask],
+            is_numerical=model_task_type == ModelTaskType.REGRESSION,
+            min_values_in_bin=psi_target_params.min_values_in_bin,
+            n_bins=psi_target_params.n_bins,
+            cat_top_pct=psi_target_params.cat_top_pct,
+            agg_func=target_agg_func,
+        )
+        if target_psi is None:
+            logger.info("Cannot determine target PSI. Skip feature PSI check")
+            return pd.Series(index=df.columns, data=0)
+
+        if target_psi > psi_target_params.threshold:
+            logger.info(
+                f"Target PSI {target_psi} is more than threshold {psi_target_params.threshold}. Skip feature PSI check"
+            )
+            return empty_res
+
+    psi_features_params.n_intervals = min(
+        psi_features_params.max_intervals or np.inf, max(psi_features_params.min_intervals, n_months)
+    )
+    logger.info(f"Setting {psi_features_params.n_intervals} intervals for features PSI check")
+
+    logger.info(f"Calculating PSI for {len(df.columns)} features")
+    reference_mask, current_masks = _split_intervals(df, date_column, psi_features_params.n_intervals, logger)
+    features_agg_func = _get_agg_func(psi_features_params.agg)
+    logger.info(f"Calculating features PSI with agg function {features_agg_func}")
+    psi_values = [
+        _stability_agg(
+            [df[feature][cur] for cur in current_masks],
+            reference_data=df[feature][reference_mask],
+            is_numerical=feature not in cat_features,
+            min_values_in_bin=psi_features_params.min_values_in_bin,
+            n_bins=psi_features_params.n_bins,
+            cat_top_pct=psi_features_params.cat_top_pct,
+            agg_func=features_agg_func,
+        )
+        for feature in df.columns
+        if feature not in [TARGET, date_column]
+    ]
+    return {feature: psi for feature, psi in zip(df.columns, psi_values)}
+
+
+def _split_intervals(
+    df: pd.DataFrame, date_column: str, n_intervals: int, logger: logging.Logger
+) -> tuple[pd.Series, list[pd.Series]]:
+    date_series = df[date_column]
+
+    # Check if we have enough unique values for the requested number of intervals
+    unique_values = date_series.nunique()
+
+    # If we have fewer unique values than requested intervals, adjust n_intervals
+    if unique_values < n_intervals:
+        logger.warning(f"Date column '{date_column}' has only {unique_values} unique values")
+
+    time_intervals = pd.qcut(date_series, q=n_intervals, duplicates="drop")
+    interval_labels = time_intervals.unique()
+    reference_mask = time_intervals == interval_labels[0]
+    current_masks = [time_intervals == label for label in interval_labels[1:]]
+    return reference_mask, current_masks
+
+
+def _get_agg_func(agg: str):
+    np_agg = getattr(np, agg, None)
+    if np_agg is None and agg.startswith("q"):
+        q = int(agg[1:])
+        return lambda x: np.quantile(list(x), q / 100, method="higher")
+    return np_agg
+
+
+def _psi(reference_percent: np.ndarray, current_percent: np.ndarray) -> float:
+    return np.sum((reference_percent - current_percent) * np.log(reference_percent / current_percent))
+
+
+def _stability_agg(
+    current_data: list[pd.Series],
+    reference_data: pd.Series,
+    is_numerical: bool = True,
+    min_values_in_bin: int | None = None,
+    n_bins: int = 10,
+    cat_top_pct: float = 0.7,
+    agg_func: Callable = max,
+) -> float | None:
+    """Calculate the PSI
+    Args:
+        current_data: current data
+        reference_data: reference data
+        is_numerical: whether the feature is numerical
+        reference_ratio: ratio of current data to use as reference if reference_data is not provided
+        min_values_in_bin: minimum number of values in a bin to calculate PSI
+        n_bins: number of bins to use for numerical features
+    Returns:
+        psi_value: calculated PSI
+    """
+    reference, current = _get_binned_data(reference_data, current_data, is_numerical, n_bins, cat_top_pct)
+
+    if len(reference) == 0 or len(current) == 0:
+        return None
+
+    nonempty_current = [i for i, c in enumerate(current) if len(c) > 0]
+    current = [current[i] for i in nonempty_current]
+    current_data = [current_data[i] for i in nonempty_current]
+
+    if len(current) == 0:
+        return None
+
+    if min_values_in_bin is not None and (
+        np.array(reference).min() < min_values_in_bin or any(np.array(c).min() < min_values_in_bin for c in current)
+    ):
+        return None
+
+    reference = _fill_zeroes(reference / len(reference_data))
+    current = [_fill_zeroes(c / len(d)) for c, d in zip(current, current_data)]
+
+    psi_value = agg_func([_psi(reference, c) for c in current])
+
+    return psi_value
+
+
+def _get_binned_data(
+    reference_data: pd.Series,
+    current_data: list[pd.Series],
+    is_numerical: bool,
+    n_bins: int,
+    cat_top_pct: float,
+):
+    """Split variable into n buckets based on reference quantiles
+    Args:
+        reference_data: reference data
+        current_data: current data
+        feature_type: feature type
+        n: number of quantiles
+    Returns:
+        reference_counts: number of records in each bucket for reference
+        current_counts: number of records in each bucket for current
+    """
+    n_vals = reference_data.nunique()
+
+    if is_numerical and n_vals > 20:
+        bins = _get_bin_edges(reference_data, n_bins)
+        reference_counts = np.histogram(reference_data, bins)[0]
+        current_counts = [np.histogram(d, bins)[0] for d in current_data]
+
+    else:
+        keys = _get_unique_not_nan_values_list_from_series([reference_data] + current_data)
+        ref_feature_dict = {**dict.fromkeys(keys, 0), **dict(reference_data.value_counts())}
+        current_feature_dict = [{**dict.fromkeys(keys, 0), **dict(d.value_counts())} for d in current_data]
+        key_dict = more_itertools.map_reduce(
+            itertools.chain(ref_feature_dict.items(), *(d.items() for d in current_feature_dict)),
+            keyfunc=operator.itemgetter(0),
+            valuefunc=operator.itemgetter(1),
+            reducefunc=sum,
+        )
+        key_dict = pd.Series(key_dict)
+        keys = key_dict.index[key_dict.rank(pct=True) >= cat_top_pct]
+        reference_counts = np.array([ref_feature_dict[key] for key in keys])
+        current_counts = [np.array([current_feature_dict[i][key] for key in keys]) for i in range(len(current_data))]
+
+    reference_counts = np.append(reference_counts, reference_data.isna().sum())
+    current_counts = [np.append(d, current_data[i].isna().sum()) for i, d in enumerate(current_counts)]
+
+    return reference_counts, current_counts
+
+
+def _fill_zeroes(percents: np.ndarray) -> np.ndarray:
+    eps = 0.0001
+    if (percents == 0).all():
+        np.place(percents, percents == 0, eps)
+    else:
+        min_value = min(percents[percents != 0])
+        if min_value <= eps:
+            np.place(percents, percents == 0, eps)
+        else:
+            np.place(percents, percents == 0, min_value / 10**6)
+    return percents
+
+
+def _get_bin_edges(data: pd.Series, n_bins: int) -> np.ndarray:
+    bins = np.nanquantile(data, np.linspace(0, 1, n_bins + 1))
+    bins[0] = -np.inf
+    bins[-1] = np.inf
+    return bins
+
+
+def _get_unique_not_nan_values_list_from_series(series: list[pd.Series]) -> list:
+    """Get unique values from current and reference series, drop NaNs"""
+    return list(reduce(set.union, (set(s.dropna().unique()) for s in series)))
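For reference, the PSI computed by this module is the standard population stability index: values are bucketed on quantiles of the reference interval (plus a NaN bucket), counts are turned into proportions with zero-protection (`_fill_zeroes`), PSI = sum over buckets of (ref - cur) * ln(ref / cur), and the per-interval values are then aggregated (max by default). A small self-contained check of that formula in plain numpy, independent of the module above:

```python
import numpy as np

rng = np.random.default_rng(0)
reference = rng.normal(0.0, 1.0, 5_000)  # e.g. the first time interval
current = rng.normal(0.3, 1.0, 5_000)    # a later interval with a mild mean shift

# Decile edges from the reference, open-ended at both tails (as _get_bin_edges does)
edges = np.quantile(reference, np.linspace(0, 1, 11))
edges[0], edges[-1] = -np.inf, np.inf

ref_pct = np.histogram(reference, edges)[0] / reference.size
cur_pct = np.histogram(current, edges)[0] / current.size
cur_pct = np.clip(cur_pct, 1e-4, None)  # crude zero-protection, in the spirit of _fill_zeroes

psi = np.sum((ref_pct - cur_pct) * np.log(ref_pct / cur_pct))
print(f"PSI = {psi:.3f}")  # roughly 0.1; by the usual rule of thumb <0.1 is stable, >0.25 is a strong shift
```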
@@ -5,7 +5,7 @@ from typing import Callable, List, Optional
 import numpy as np
 import pandas as pd
 
-from upgini.metadata import SYSTEM_RECORD_ID, CVType, ModelTaskType
+from upgini.metadata import EVAL_SET_INDEX, SYSTEM_RECORD_ID, TARGET, CVType, ModelTaskType
 from upgini.resource_bundle import ResourceBundle, get_custom_bundle
 from upgini.utils.target_utils import balance_undersample
 from upgini.utils.ts_utils import get_most_frequent_time_unit, trunc_datetime
@@ -117,6 +117,22 @@ def sample(
         **kwargs,
     )
 
+    # separate OOT
+    oot_dfs = []
+    other_dfs = []
+    if EVAL_SET_INDEX in df.columns:
+        for eval_set_index in df[EVAL_SET_INDEX].unique():
+            eval_df = df[df[EVAL_SET_INDEX] == eval_set_index]
+            if TARGET in eval_df.columns and eval_df[TARGET].isna().all():
+                oot_dfs.append(eval_df)
+            else:
+                other_dfs.append(eval_df)
+    if len(oot_dfs) > 0:
+        oot_df = pd.concat(oot_dfs, ignore_index=False)
+        df = pd.concat(other_dfs, ignore_index=False)
+    else:
+        oot_df = None
+
     num_samples = _num_samples(df)
     if num_samples > fit_sample_threshold:
         logger.info(
@@ -126,6 +142,18 @@ def sample(
         df = df.sample(n=fit_sample_rows, random_state=random_state)
         logger.info(f"Shape after threshold resampling: {df.shape}")
 
+    if oot_df is not None:
+        num_samples_oot = _num_samples(oot_df)
+        if num_samples_oot > fit_sample_threshold:
+            logger.info(
+                f"OOT has size {num_samples_oot} more than threshold {fit_sample_threshold} "
+                f"and will be downsampled to {fit_sample_rows}"
+            )
+            oot_df = oot_df.sample(n=fit_sample_rows, random_state=random_state)
+        df = pd.concat([df, oot_df], ignore_index=False)
+
+    logger.info(f"Dataset size after downsampling: {len(df)}")
+
     return df
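The sampling change mirrors the dedup logic: eval parts whose TARGET is entirely NaN are treated as OOT, pulled out before the train/eval downsampling, downsampled separately against the same threshold, and concatenated back at the end. A toy sketch of the split criterion only (the string values assigned to the EVAL_SET_INDEX and TARGET constants below are illustrative):

```python
import numpy as np
import pandas as pd

EVAL_SET_INDEX, TARGET = "eval_set_index", "target"  # illustrative constant values

df = pd.DataFrame(
    {
        EVAL_SET_INDEX: [0, 0, 1, 1, 2, 2],
        TARGET: [0, 1, 1, 0, np.nan, np.nan],  # eval part 2 has no labels -> OOT
    }
)

oot_parts, other_parts = [], []
for idx in df[EVAL_SET_INDEX].unique():
    part = df[df[EVAL_SET_INDEX] == idx]
    (oot_parts if part[TARGET].isna().all() else other_parts).append(part)

print(pd.concat(other_parts)[EVAL_SET_INDEX].unique())  # [0 1] -> sampled together
print(pd.concat(oot_parts)[EVAL_SET_INDEX].unique())    # [2]   -> sampled separately
```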
 
 
@@ -175,7 +203,7 @@ def sample_time_series_train_eval(
         )
         if logger is not None:
             logger.info(f"Eval set size: {len(eval_df)}")
-        df = pd.concat([train_df, eval_df])
+        df = pd.concat([train_df, eval_df], ignore_index=False)
 
     elif len(train_df) > max_rows:
         df = sample_time_series_trunc(
@@ -6,7 +6,7 @@ import pandas as pd
 from pandas.api.types import is_bool_dtype, is_datetime64_any_dtype, is_numeric_dtype
 
 from upgini.errors import ValidationError
-from upgini.metadata import SYSTEM_RECORD_ID, ModelTaskType
+from upgini.metadata import EVAL_SET_INDEX, SYSTEM_RECORD_ID, ModelTaskType
 from upgini.resource_bundle import ResourceBundle, get_custom_bundle, bundle
 from upgini.sampler.random_under_sampler import RandomUnderSampler
 
@@ -132,6 +132,11 @@ def balance_undersample(
     if SYSTEM_RECORD_ID not in df.columns:
         raise Exception("System record id must be presented for undersampling")
 
+    # Rebalance and send to server only train data
+    # because eval set data will be sent separately in transform for metrics
+    if EVAL_SET_INDEX in df.columns:
+        df = df[df[EVAL_SET_INDEX] == 0]
+
     target = df[target_column].copy()
 
     vc = target.value_counts()