upgini 1.2.113a3974.dev2__py3-none-any.whl → 1.2.114a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/metadata.py CHANGED
@@ -285,6 +285,7 @@ class FeaturesMetadataV2(BaseModel):
     doc_link: Optional[str] = None
     update_frequency: Optional[str] = None
     from_online_api: Optional[bool] = None
+    psi_value: Optional[float] = None
 
 
 class HitRateMetrics(BaseModel):
upgini/metrics.py CHANGED
@@ -816,7 +816,8 @@ class CatBoostWrapper(EstimatorWrapper):
             else:
                 encoded = cat_encoder.transform(x[self.cat_features])
                 cat_features = encoded.columns.to_list()
-                x.loc[:, self.cat_features] = encoded
+                x.drop(columns=encoded.columns, inplace=True, errors="ignore")
+                x[encoded.columns] = encoded
         else:
             cat_features = self.cat_features
 
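The switch from `.loc` assignment to drop-and-reassign looks like a dtype fix: setting values into existing columns with `.loc` keeps those columns' original dtype, while dropping and re-adding the columns lets the encoder's numeric dtypes through. A minimal sketch with toy data and hypothetical column names (not the wrapper's actual inputs):

```python
import pandas as pd

# Toy stand-ins for x and cat_encoder.transform(...) from the hunk above
x = pd.DataFrame({"color": ["red", "blue", "red"], "size": ["S", "M", "L"]})
encoded = pd.DataFrame({"color": [0, 1, 0], "size": [2, 1, 0]}, index=x.index)

x_loc = x.copy()
x_loc.loc[:, ["color", "size"]] = encoded   # in-place set: columns stay object-typed
print(x_loc.dtypes.tolist())                # [dtype('O'), dtype('O')]

x_new = x.copy()
x_new.drop(columns=encoded.columns, inplace=True, errors="ignore")
x_new[encoded.columns] = encoded            # whole-column replacement: encoded dtypes win
print(x_new.dtypes.tolist())                # [dtype('int64'), dtype('int64')]
```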
@@ -1175,7 +1176,10 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutp
     >>> mean_squared_log_error(y_true, y_pred, multioutput=[0.3, 0.7])
     0.060...
     """
-    _, y_true, y_pred, multioutput = _check_reg_targets(y_true, y_pred, multioutput)
+    try:
+        _, y_true, y_pred, multioutput = _check_reg_targets(y_true, y_pred, multioutput)
+    except TypeError:
+        _, y_true, y_pred, sample_weight, multioutput = _check_reg_targets(y_true, y_pred, sample_weight, multioutput)
     check_consistent_length(y_true, y_pred, sample_weight)
 
     if (y_true < 0).any():
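The `try/except TypeError` appears to bridge two builds of scikit-learn's private `_check_reg_targets` helper: the first call uses the three-argument form, and if the installed helper requires (and returns) `sample_weight` as well, the extra-argument form is used instead. A self-contained toy version of the same dispatch pattern, with stand-in functions rather than the sklearn internals:

```python
def check_old(y_true, y_pred, multioutput):                   # hypothetical "old" signature
    return "continuous", y_true, y_pred, multioutput

def check_new(y_true, y_pred, sample_weight, multioutput):    # hypothetical "new" signature
    return "continuous", y_true, y_pred, sample_weight, multioutput

def call_compat(check_fn, y_true, y_pred, sample_weight, multioutput):
    try:
        _, y_true, y_pred, multioutput = check_fn(y_true, y_pred, multioutput)
    except TypeError:  # wrong number of positional args -> try the wider signature
        _, y_true, y_pred, sample_weight, multioutput = check_fn(y_true, y_pred, sample_weight, multioutput)
    return y_true, y_pred, sample_weight, multioutput

print(call_compat(check_old, [1, 2], [1, 2], None, "uniform_average"))
print(call_compat(check_new, [1, 2], [1, 2], None, "uniform_average"))
```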
@@ -123,7 +123,7 @@ train_unstable_target=Your training sample contains an unstable target event, PS
 eval_unstable_target=Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
 # eval set validation
 unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
-eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
+eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y or X only
 unsupported_x_type_eval_set=Unsupported type of X in eval_set: {}. Use pandas.DataFrame, pandas.Series or numpy.ndarray or list.
 eval_x_and_x_diff_shape=The column set in eval_set are differ from the column set in X
 unsupported_y_type_eval_set=Unsupported type of y in eval_set: {}. Use pandas.Series, numpy.ndarray or list
@@ -139,6 +139,8 @@ eval_x_is_empty=X in eval_set is empty.
 eval_y_is_empty=y in eval_set is empty.
 x_and_eval_x_diff_types=X and eval_set X has different types: {} and {}
 eval_x_has_train_samples=Eval set X has rows that are present in train set X
+oot_without_date_not_supported=Eval set {} provided as OOT but date column is missing. It will be ignored for stability check
+oot_with_online_sources_not_supported=Eval set {} provided as OOT and also provided columns for online API. It will be ignored for stability check
 
 baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
 baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and and retry
@@ -255,6 +257,7 @@ features_info_provider=Provider
 features_info_source=Source
 features_info_name=Feature name
 features_info_shap=SHAP value
+features_info_psi=PSI value
 features_info_hitrate=Coverage %
 features_info_type=Type
 # Deprecated
upgini/sampler/base.py CHANGED
@@ -1,6 +1,7 @@
 """
 Base class for the under-sampling method.
 """
+
 # Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
 # License: MIT
 
@@ -12,6 +13,7 @@ import numpy as np
 from sklearn.base import BaseEstimator
 from sklearn.preprocessing import label_binarize
 from sklearn.utils.multiclass import check_classification_targets
+from sklearn.utils.validation import check_X_y
 
 from .utils import ArraysTransformer, check_sampling_strategy, check_target_type
 
@@ -125,7 +127,7 @@ class BaseSampler(SamplerMixin):
         if accept_sparse is None:
             accept_sparse = ["csr", "csc"]
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
-        X, y = self._validate_data(X, y, reset=True, accept_sparse=accept_sparse)
+        X, y = check_X_y(X, y, accept_sparse=accept_sparse, dtype=None, ensure_all_finite=False)
         return X, y, binarize_y
 
     def _more_tags(self):
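Both replacements in this file track API changes around scikit-learn 1.6: the estimator-bound `_validate_data` helper is no longer available on newer releases, and the `force_all_finite` flag of `check_X_y`/`check_array` was renamed to `ensure_all_finite`. A quick way to see which spelling a given installation accepts:

```python
import inspect
from sklearn.utils.validation import check_X_y

params = inspect.signature(check_X_y).parameters
print("ensure_all_finite" in params)  # True on newer scikit-learn
print("force_all_finite" in params)   # True on older (and still present but deprecated on some newer) releases
```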
@@ -80,14 +80,24 @@ RandomUnderSampler # doctest: +NORMALIZE_WHITESPACE
 
     def _check_X_y(self, X, y):
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
-        X, y = self._validate_data(
-            X,
-            y,
-            reset=True,
-            accept_sparse=["csr", "csc"],
-            dtype=None,
-            force_all_finite=False,
-        )
+        try:
+            X, y = self._validate_data(
+                X,
+                y,
+                reset=True,
+                accept_sparse=["csr", "csc"],
+                dtype=None,
+                force_all_finite=False,
+            )
+        except AttributeError:
+            from sklearn.utils.validation import check_X_y
+            X, y = check_X_y(
+                X,
+                y,
+                accept_sparse=["csr", "csc"],
+                dtype=None,
+                ensure_all_finite=False,
+            )
         return X, y, binarize_y
 
     def _fit_resample(self, X, y):
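Here the old call is kept and only falls back when `self._validate_data` no longer exists on the estimator (hence `except AttributeError` rather than `TypeError`). An equivalent, version-agnostic sketch using `hasattr` instead of exception handling; this is a sketch of the idea, not the library's own helper, and it omits the finiteness flag whose name differs across versions (see the hunk above):

```python
from sklearn.utils.validation import check_X_y

def validate_compat(estimator, X, y):
    # Older scikit-learn: estimators expose _validate_data; newer ones do not.
    if hasattr(estimator, "_validate_data"):
        return estimator._validate_data(X, y, reset=True, accept_sparse=["csr", "csc"], dtype=None)
    return check_X_y(X, y, accept_sparse=["csr", "csc"], dtype=None)
```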
@@ -136,6 +136,9 @@ def remove_fintech_duplicates(
     # Process each eval_set part separately
     new_eval_dfs = []
     for i, eval_df in enumerate(eval_dfs, 1):
+        # Skip OOT
+        if eval_df[TARGET].isna().all():
+            continue
         logger.info(f"Eval {i} dataset shape before clean fintech duplicates: {eval_df.shape}")
         cleaned_eval_df, eval_warning = process_df(eval_df, i)
         if eval_warning:
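The new guard treats an eval part whose target is entirely NaN as an out-of-time (OOT) slice and leaves it out of the fintech-duplicate cleanup. A tiny illustration of the condition (here `TARGET` is a stand-in for the `upgini.metadata.TARGET` constant):

```python
import numpy as np
import pandas as pd

TARGET = "target"  # stand-in for upgini.metadata.TARGET

oot_eval = pd.DataFrame({"feature": [1, 2, 3], TARGET: [np.nan, np.nan, np.nan]})
labeled_eval = pd.DataFrame({"feature": [1, 2, 3], TARGET: [0, 1, np.nan]})

print(oot_eval[TARGET].isna().all())      # True  -> skipped as OOT
print(labeled_eval[TARGET].isna().all())  # False -> cleaned as before
```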
@@ -190,16 +193,49 @@ def clean_full_duplicates(
     msg = None
     if TARGET in df.columns:
         unique_columns.remove(TARGET)
-        marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
+
+        # Separate rows to exclude from deduplication:
+        # for each eval_set_index != 0 check separately, all TARGET values are NaN
+        excluded_from_dedup = pd.DataFrame()
+        df_for_dedup = df
+
+        if EVAL_SET_INDEX in df.columns:
+            excluded_parts = []
+            # Get all unique eval_set_index values, except 0
+            unique_eval_indices = df[df[EVAL_SET_INDEX] != 0][EVAL_SET_INDEX].unique()
+
+            for eval_idx in unique_eval_indices:
+                eval_subset = df[df[EVAL_SET_INDEX] == eval_idx]
+                # Check that all TARGET values for this specific eval_set_index are NaN
+                if len(eval_subset) > 0 and eval_subset[TARGET].isna().all():
+                    excluded_parts.append(eval_subset)
+                    logger.info(
+                        f"Excluded {len(eval_subset)} rows from deduplication "
+                        f"(eval_set_index={eval_idx} and all TARGET values are NaN)"
+                    )
+
+            # Combine all excluded parts
+            if excluded_parts:
+                excluded_from_dedup = pd.concat(excluded_parts, ignore_index=False)
+                # Remove excluded rows from dataframe for deduplication
+                excluded_indices = excluded_from_dedup.index
+                df_for_dedup = df[~df.index.isin(excluded_indices)]
+        marked_duplicates = df_for_dedup.duplicated(subset=unique_columns, keep=False)
         if marked_duplicates.sum() > 0:
-            dups_indices = df[marked_duplicates].index.to_list()[:100]
-            nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
-            num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
-            share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
+            dups_indices = df_for_dedup[marked_duplicates].index.to_list()[:100]
+            nrows_after_tgt_dedup = len(df_for_dedup.drop_duplicates(subset=unique_columns, keep=False))
+            num_dup_rows = len(df_for_dedup) - nrows_after_tgt_dedup
+            share_tgt_dedup = 100 * num_dup_rows / len(df_for_dedup)
 
             msg = bundle.get("dataset_diff_target_duplicates").format(share_tgt_dedup, num_dup_rows, dups_indices)
-            df = df.drop_duplicates(subset=unique_columns, keep=False)
-            logger.info(f"Dataset shape after clean invalid target duplicates: {df.shape}")
+            df_for_dedup = df_for_dedup.drop_duplicates(subset=unique_columns, keep=False)
+            logger.info(f"Dataset shape after clean invalid target duplicates: {df_for_dedup.shape}")
+        # Combine back excluded rows
+        if len(excluded_from_dedup) > 0:
+            df = pd.concat([df_for_dedup, excluded_from_dedup], ignore_index=False)
+            logger.info(f"Final dataset shape after adding back excluded rows: {df.shape}")
+        else:
+            df = df_for_dedup
 
     return df, msg
 
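Net effect of the rewrite: unlabeled eval parts (all-NaN target for a given `eval_set_index`) are set aside, the conflicting-target deduplication runs on the remaining rows only, and the excluded rows are concatenated back afterwards. A toy walk-through of that flow, using a hypothetical single `key` column in place of `unique_columns` and a groupby expression in place of the loop above:

```python
import numpy as np
import pandas as pd

EVAL_SET_INDEX, TARGET = "eval_set_index", "target"  # stand-ins for upgini.metadata constants

df = pd.DataFrame({
    "key": [1, 1, 2, 1, 1],
    TARGET: [0, 1, 0, np.nan, np.nan],
    EVAL_SET_INDEX: [0, 0, 0, 1, 1],
})

# 1. Set aside eval parts whose target is entirely NaN (the OOT rows)
is_oot = df[EVAL_SET_INDEX].ne(0) & df.groupby(EVAL_SET_INDEX)[TARGET].transform(lambda s: s.isna().all())
df_for_dedup, excluded = df[~is_oot], df[is_oot]

# 2. Drop every row whose key occurs more than once (in the real pipeline these are
#    rows with conflicting targets, since exact duplicates were removed earlier)
df_for_dedup = df_for_dedup.drop_duplicates(subset=["key"], keep=False)

# 3. Put the OOT rows back untouched
result = pd.concat([df_for_dedup, excluded])
print(result)
```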
@@ -27,6 +27,7 @@ class FeatureInfo:
     doc_link: str
     data_provider_link: str
     data_source_link: str
+    psi_value: Optional[float] = None
 
     @staticmethod
     def from_metadata(
@@ -47,12 +48,14 @@ class FeatureInfo:
             doc_link=feature_meta.doc_link,
             data_provider_link=feature_meta.data_provider_link,
             data_source_link=feature_meta.data_source_link,
+            psi_value=feature_meta.psi_value,
         )
 
     def to_row(self, bundle: ResourceBundle) -> Dict[str, str]:
         return {
             bundle.get("features_info_name"): self.name,
             bundle.get("features_info_shap"): self.rounded_shap,
+            bundle.get("features_info_psi"): self.psi_value,
             bundle.get("features_info_hitrate"): self.hitrate,
             bundle.get("features_info_value_preview"): self.value_preview,
             bundle.get("features_info_provider"): self.provider,
@@ -64,6 +67,7 @@ class FeatureInfo:
         return {
             bundle.get("features_info_name"): self.internal_name,
             bundle.get("features_info_shap"): self.rounded_shap,
+            bundle.get("features_info_psi"): self.psi_value,
             bundle.get("features_info_hitrate"): self.hitrate,
             bundle.get("features_info_value_preview"): self.value_preview,
             bundle.get("features_info_provider"): self.internal_provider,
@@ -76,6 +80,7 @@ class FeatureInfo:
             bundle.get("features_info_name"): self.internal_name,
             "feature_link": self.doc_link,
             bundle.get("features_info_shap"): self.rounded_shap,
+            bundle.get("features_info_psi"): self.psi_value,
             bundle.get("features_info_hitrate"): self.hitrate,
             bundle.get("features_info_value_preview"): self.value_preview,
             bundle.get("features_info_provider"): self.internal_provider,
upgini/utils/psi.py ADDED
@@ -0,0 +1,294 @@
+import itertools
+import logging
+import operator
+from functools import reduce
+from typing import Callable, Dict, Optional
+
+import more_itertools
+import numpy as np
+import pandas as pd
+from pandas.api.types import is_numeric_dtype
+from pydantic import BaseModel
+
+from upgini.metadata import TARGET, ModelTaskType
+
+
+class StabilityParams(BaseModel):
+    threshold: float = 999
+    n_intervals: int = 12
+    min_intervals: int = 10
+    max_intervals: Optional[int] = None
+    min_values_in_interval: Optional[int] = None
+    n_bins: int = 10
+    min_values_in_bin: Optional[int] = None
+    cat_top_pct: float = 0.7
+    agg: str = "max"
+
+
+DEFAULT_TARGET_PARAMS = StabilityParams(
+    n_intervals=12,
+    min_intervals=10,
+    max_intervals=None,
+    min_values_in_interval=None,
+    n_bins=5,
+)
+
+DEFAULT_FEATURES_PARAMS = StabilityParams(
+    n_intervals=12,
+    min_intervals=10,
+    max_intervals=None,
+    min_values_in_interval=None,
+    n_bins=10,
+)
+
+
+def calculate_sparsity_psi(
+    df: pd.DataFrame,
+    cat_features: list[str],
+    date_column: str,
+    logger: logging.Logger,
+    model_task_type: ModelTaskType,
+    psi_features_params: StabilityParams = DEFAULT_FEATURES_PARAMS,
+    psi_target_params: StabilityParams = DEFAULT_TARGET_PARAMS,
+) -> Dict[str, float]:
+    sparse_features = df.columns[df.isna().sum() > 0].to_list()
+    if len(sparse_features) > 0:
+        logger.info(f"Calculating sparsity stability for {len(sparse_features)} sparse features")
+        sparse_df = df[sparse_features].notna()
+        sparse_df[date_column] = df[date_column]
+        return calculate_features_psi(
+            sparse_df,
+            cat_features,
+            date_column,
+            logger,
+            model_task_type,
+            psi_target_params,
+            psi_features_params,
+        )
+    return {}
+
+
+def calculate_features_psi(
+    df: pd.DataFrame,
+    cat_features: list[str],
+    date_column: str,
+    logger: logging.Logger,
+    model_task_type: ModelTaskType,
+    psi_features_params: StabilityParams = DEFAULT_FEATURES_PARAMS,
+    psi_target_params: StabilityParams = DEFAULT_TARGET_PARAMS,
+) -> Dict[str, float]:
+    empty_res = pd.Series(index=df.columns, data=0)
+
+    if not is_numeric_dtype(df[date_column]):
+        df[date_column] = pd.to_datetime(df[date_column]).dt.floor("D").astype(np.int64) / 10**6
+
+    n_months = pd.to_datetime(df[date_column], unit="ms").dt.month.nunique()
+
+    if TARGET in df.columns:
+        psi_target_params.n_intervals = min(
+            psi_target_params.max_intervals or np.inf, max(psi_target_params.min_intervals, n_months)
+        )
+        logger.info(f"Setting {psi_target_params.n_intervals} intervals for target PSI check")
+
+        logger.info(f"Calculating target PSI for {psi_target_params.n_intervals} intervals")
+        reference_mask, current_masks = _split_intervals(df, date_column, psi_target_params.n_intervals, logger)
+
+        if psi_target_params.min_values_in_interval is not None and any(
+            len(mask) < psi_target_params.min_values_in_interval
+            for mask in itertools.chain(current_masks, [reference_mask])
+        ):
+            logger.info(
+                f"Some intervals have less than {psi_target_params.min_values_in_interval} values. Skip PSI check"
+            )
+            return empty_res
+
+        target_agg_func = _get_agg_func(psi_target_params.agg)
+        logger.info(f"Calculating target PSI with agg function {target_agg_func}")
+        target_psi = _stability_agg(
+            [df[TARGET][cur] for cur in current_masks],
+            reference_data=df[TARGET][reference_mask],
+            is_numerical=model_task_type == ModelTaskType.REGRESSION,
+            min_values_in_bin=psi_target_params.min_values_in_bin,
+            n_bins=psi_target_params.n_bins,
+            cat_top_pct=psi_target_params.cat_top_pct,
+            agg_func=target_agg_func,
+        )
+        if target_psi is None:
+            logger.info("Cannot determine target PSI. Skip feature PSI check")
+            return pd.Series(index=df.columns, data=0)
+
+        if target_psi > psi_target_params.threshold:
+            logger.info(
+                f"Target PSI {target_psi} is more than threshold {psi_target_params.threshold}. Skip feature PSI check"
+            )
+            return empty_res
+
+    psi_features_params.n_intervals = min(
+        psi_features_params.max_intervals or np.inf, max(psi_features_params.min_intervals, n_months)
+    )
+    logger.info(f"Setting {psi_features_params.n_intervals} intervals for features PSI check")
+
+    logger.info(f"Calculating PSI for {len(df.columns)} features")
+    reference_mask, current_masks = _split_intervals(df, date_column, psi_features_params.n_intervals, logger)
+    features_agg_func = _get_agg_func(psi_features_params.agg)
+    logger.info(f"Calculating features PSI with agg function {features_agg_func}")
+    psi_values = [
+        _stability_agg(
+            [df[feature][cur] for cur in current_masks],
+            reference_data=df[feature][reference_mask],
+            is_numerical=feature not in cat_features,
+            min_values_in_bin=psi_features_params.min_values_in_bin,
+            n_bins=psi_features_params.n_bins,
+            cat_top_pct=psi_features_params.cat_top_pct,
+            agg_func=features_agg_func,
+        )
+        for feature in df.columns
+        if feature not in [TARGET, date_column]
+    ]
+    return {feature: psi for feature, psi in zip(df.columns, psi_values)}
+
+
+def _split_intervals(
+    df: pd.DataFrame, date_column: str, n_intervals: int, logger: logging.Logger
+) -> tuple[pd.Series, list[pd.Series]]:
+    date_series = df[date_column]
+
+    # Check if we have enough unique values for the requested number of intervals
+    unique_values = date_series.nunique()
+
+    # If we have fewer unique values than requested intervals, adjust n_intervals
+    if unique_values < n_intervals:
+        logger.warning(f"Date column '{date_column}' has only {unique_values} unique values")
+
+    time_intervals = pd.qcut(date_series, q=n_intervals, duplicates="drop")
+    interval_labels = time_intervals.unique()
+    reference_mask = time_intervals == interval_labels[0]
+    current_masks = [time_intervals == label for label in interval_labels[1:]]
+    return reference_mask, current_masks
+
+
+def _get_agg_func(agg: str):
+    np_agg = getattr(np, agg, None)
+    if np_agg is None and agg.startswith("q"):
+        q = int(agg[1:])
+        return lambda x: np.quantile(list(x), q / 100, method="higher")
+    return np_agg
+
+
+def _psi(reference_percent: np.ndarray, current_percent: np.ndarray) -> float:
+    return np.sum((reference_percent - current_percent) * np.log(reference_percent / current_percent))
+
+
+def _stability_agg(
+    current_data: list[pd.Series],
+    reference_data: pd.Series,
+    is_numerical: bool = True,
+    min_values_in_bin: int | None = None,
+    n_bins: int = 10,
+    cat_top_pct: float = 0.7,
+    agg_func: Callable = max,
+) -> float | None:
+    """Calculate the PSI
+    Args:
+        current_data: current data
+        reference_data: reference data
+        is_numerical: whether the feature is numerical
+        reference_ratio: ratio of current data to use as reference if reference_data is not provided
+        min_values_in_bin: minimum number of values in a bin to calculate PSI
+        n_bins: number of bins to use for numerical features
+    Returns:
+        psi_value: calculated PSI
+    """
+    reference, current = _get_binned_data(reference_data, current_data, is_numerical, n_bins, cat_top_pct)
+
+    if len(reference) == 0 or len(current) == 0:
+        return None
+
+    nonempty_current = [i for i, c in enumerate(current) if len(c) > 0]
+    current = [current[i] for i in nonempty_current]
+    current_data = [current_data[i] for i in nonempty_current]
+
+    if len(current) == 0:
+        return None
+
+    if min_values_in_bin is not None and (
+        np.array(reference).min() < min_values_in_bin or any(np.array(c).min() < min_values_in_bin for c in current)
+    ):
+        return None
+
+    reference = _fill_zeroes(reference / len(reference_data))
+    current = [_fill_zeroes(c / len(d)) for c, d in zip(current, current_data)]
+
+    psi_value = agg_func([_psi(reference, c) for c in current])
+
+    return psi_value
+
+
+def _get_binned_data(
+    reference_data: pd.Series,
+    current_data: list[pd.Series],
+    is_numerical: bool,
+    n_bins: int,
+    cat_top_pct: float,
+):
+    """Split variable into n buckets based on reference quantiles
+    Args:
+        reference_data: reference data
+        current_data: current data
+        feature_type: feature type
+        n: number of quantiles
+    Returns:
+        reference_counts: number of records in each bucket for reference
+        current_counts: number of records in each bucket for current
+    """
+    n_vals = reference_data.nunique()
+
+    if is_numerical and n_vals > 20:
+        bins = _get_bin_edges(reference_data, n_bins)
+        reference_counts = np.histogram(reference_data, bins)[0]
+        current_counts = [np.histogram(d, bins)[0] for d in current_data]
+
+    else:
+        keys = _get_unique_not_nan_values_list_from_series([reference_data] + current_data)
+        ref_feature_dict = {**dict.fromkeys(keys, 0), **dict(reference_data.value_counts())}
+        current_feature_dict = [{**dict.fromkeys(keys, 0), **dict(d.value_counts())} for d in current_data]
+        key_dict = more_itertools.map_reduce(
+            itertools.chain(ref_feature_dict.items(), *(d.items() for d in current_feature_dict)),
+            keyfunc=operator.itemgetter(0),
+            valuefunc=operator.itemgetter(1),
+            reducefunc=sum,
+        )
+        key_dict = pd.Series(key_dict)
+        keys = key_dict.index[key_dict.rank(pct=True) >= cat_top_pct]
+        reference_counts = np.array([ref_feature_dict[key] for key in keys])
+        current_counts = [np.array([current_feature_dict[i][key] for key in keys]) for i in range(len(current_data))]
+
+    reference_counts = np.append(reference_counts, reference_data.isna().sum())
+    current_counts = [np.append(d, current_data[i].isna().sum()) for i, d in enumerate(current_counts)]
+
+    return reference_counts, current_counts
+
+
+def _fill_zeroes(percents: np.ndarray) -> np.ndarray:
+    eps = 0.0001
+    if (percents == 0).all():
+        np.place(percents, percents == 0, eps)
+    else:
+        min_value = min(percents[percents != 0])
+        if min_value <= eps:
+            np.place(percents, percents == 0, eps)
+        else:
+            np.place(percents, percents == 0, min_value / 10**6)
+    return percents
+
+
+def _get_bin_edges(data: pd.Series, n_bins: int) -> np.ndarray:
+    bins = np.nanquantile(data, np.linspace(0, 1, n_bins + 1))
+    bins[0] = -np.inf
+    bins[-1] = np.inf
+    return bins
+
+
+def _get_unique_not_nan_values_list_from_series(series: list[pd.Series]) -> list:
+    """Get unique values from current and reference series, drop NaNs"""
+    return list(reduce(set.union, (set(s.dropna().unique()) for s in series)))
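For reference, `_psi` above implements the usual Population Stability Index over binned percentages, PSI = Σ (p_ref − p_cur) · ln(p_ref / p_cur). A standalone sanity check with made-up bin shares (mirroring `_psi`, not importing it):

```python
import numpy as np

def psi(reference_percent: np.ndarray, current_percent: np.ndarray) -> float:
    # Same formula as upgini.utils.psi._psi above
    return float(np.sum((reference_percent - current_percent) * np.log(reference_percent / current_percent)))

ref = np.array([0.25, 0.25, 0.25, 0.25])      # reference bin shares
same = np.array([0.25, 0.25, 0.25, 0.25])     # identical distribution
shifted = np.array([0.10, 0.20, 0.30, 0.40])  # drifted distribution

print(psi(ref, same))     # 0.0
print(psi(ref, shifted))  # ~0.23; a common rule of thumb reads <0.1 as stable, >0.25 as a strong shift
```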
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.113a3974.dev2
+Version: 1.2.114a1
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
@@ -270,9 +270,9 @@ from upgini.metadata import SearchKey
 enricher = FeaturesEnricher(
     search_keys={
         "subscription_activation_date": SearchKey.DATE,
-        "country": SearchKey.COUNTRY,
-        "zip_code": SearchKey.POSTAL_CODE,
-        "hashed_email": SearchKey.HEM,
+        "country": SearchKey.COUNTRY,
+        "zip_code": SearchKey.POSTAL_CODE,
+        "hashed_email": SearchKey.HEM,
         "last_visit_ip_address": SearchKey.IP,
         "registered_with_phone": SearchKey.PHONE
     })
@@ -358,9 +358,9 @@ from upgini.metadata import SearchKey
 enricher = FeaturesEnricher(
     search_keys={
         "subscription_activation_date": SearchKey.DATE,
-        "country": SearchKey.COUNTRY,
-        "zip_code": SearchKey.POSTAL_CODE,
-        "hashed_email": SearchKey.HEM,
+        "country": SearchKey.COUNTRY,
+        "zip_code": SearchKey.POSTAL_CODE,
+        "hashed_email": SearchKey.HEM,
         "last_visit_ip_address": SearchKey.IP,
         "registered_with_phone": SearchKey.PHONE
     },
@@ -381,7 +381,7 @@ from upgini.metadata import SearchKey
 enricher = FeaturesEnricher(
     search_keys={
         "subscription_activation_date": SearchKey.DATE,
-        "zip_code": SearchKey.POSTAL_CODE,
+        "zip_code": SearchKey.POSTAL_CODE,
     },
     country_code = "US",
     date_format = "%Y-%d-%m"
@@ -409,8 +409,8 @@ y = train_df["churn_flag"]
 enricher = FeaturesEnricher(
     search_keys={
         "subscription_activation_date": SearchKey.DATE,
-        "country": SearchKey.COUNTRY,
-        "zip_code": SearchKey.POSTAL_CODE
+        "country": SearchKey.COUNTRY,
+        "zip_code": SearchKey.POSTAL_CODE
     })
 
 # everything is ready to fit! For 200к records fitting should take around 10 minutes,
@@ -464,8 +464,8 @@ And then, for `transform` in a production ML pipeline, you'll get enrichment wit
 enricher = FeaturesEnricher(
     search_keys={
         "subscription_activation_date": SearchKey.DATE,
-        "country": SearchKey.COUNTRY,
-        "zip_code": SearchKey.POSTAL_CODE,
+        "country": SearchKey.COUNTRY,
+        "zip_code": SearchKey.POSTAL_CODE,
     },
 )
 ```
@@ -516,8 +516,8 @@ enricher = FeaturesEnricher(
 If you're working with multivariate time series, you should specify id columns of individual univariate series in `FeaturesEnricher`. For example, if you have a dataset predicting sales for different stores and products, you should specify store and product id columns as follows:
 ```python
 enricher = FeaturesEnricher(
-    search_keys={
-        "sales_date": SearchKey.DATE,
+    search_keys={
+        "sales_date": SearchKey.DATE,
     },
     id_columns=["store_id", "product_id"],
     cv=CVType.time_series
@@ -733,9 +733,22 @@ enricher.fit(
 )
 ```
 #### ⚠️ Requirements for out-of-time dataset
-- Same data schema as for search initialization dataset
+- Same data schema as for search initialization X dataset
 - Pandas dataframe representation
 
+There are 3 options to pass out-of-time without labels:
+```python
+enricher.fit(
+    train_ids_and_features,
+    train_label,
+    eval_set = [
+        (eval_ids_and_features_1,), # Just tuple of 1 element
+        (eval_ids_and_features_2, None), # None as labels
+        (eval_ids_and_features_3, [np.nan] * len(eval_ids_and_features_3)), # List or Series of the same size as eval X
+    ]
+)
+```
+
 ### Use custom loss function in feature selection & metrics calculation
 
 `FeaturesEnricher` can be initialized with additional string parameter `loss`.
@@ -797,7 +810,7 @@ enricher = FeaturesEnricher(
 enricher.fit(X, y)
 ```
 
-## Turn off removing of target outliers
+### Turn off removing of target outliers
 Upgini detect rows with target outlier for regression tasks. By default such rows are dropped on metrics calculation. To turn off removing of target outlier rows use parameter `remove_outliers_calc_metrics=False` in fit, fit_transform or calculate_metrics methods:
 
 ```python
@@ -808,7 +821,7 @@ enricher = FeaturesEnricher(
 enricher.fit(X, y, remove_outliers_calc_metrics=False)
 ```
 
-## Turn off generating features on search keys
+### Turn off generating features on search keys
 Upgini tries to generate features on email, date and datetime search keys. By default this generation is enabled. To disable it use parameter `generate_search_key_features` of FeaturesEnricher constructor:
 
 ```python
@@ -816,6 +829,7 @@ enricher = FeaturesEnricher(
     search_keys={"date": SearchKey.DATE},
     generate_search_key_features=False,
 )
+```
 
 ## 🔑 Open up all capabilities of Upgini