upgini 1.2.113a3974.dev2__py3-none-any.whl → 1.2.114a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/dataset.py +40 -6
- upgini/features_enricher.py +489 -147
- upgini/metadata.py +1 -0
- upgini/metrics.py +6 -2
- upgini/resource_bundle/strings.properties +6 -1
- upgini/sampler/base.py +3 -1
- upgini/sampler/random_under_sampler.py +18 -8
- upgini/utils/deduplicate_utils.py +57 -9
- upgini/utils/feature_info.py +5 -0
- upgini/utils/psi.py +294 -0
- upgini/utils/sample_utils.py +30 -2
- upgini/utils/target_utils.py +6 -1
- {upgini-1.2.113a3974.dev2.dist-info → upgini-1.2.114a2.dist-info}/METADATA +31 -17
- {upgini-1.2.113a3974.dev2.dist-info → upgini-1.2.114a2.dist-info}/RECORD +17 -16
- {upgini-1.2.113a3974.dev2.dist-info → upgini-1.2.114a2.dist-info}/WHEEL +1 -1
- {upgini-1.2.113a3974.dev2.dist-info → upgini-1.2.114a2.dist-info}/licenses/LICENSE +0 -0
upgini/metadata.py
CHANGED
upgini/metrics.py
CHANGED
@@ -816,7 +816,8 @@ class CatBoostWrapper(EstimatorWrapper):
             else:
                 encoded = cat_encoder.transform(x[self.cat_features])
                 cat_features = encoded.columns.to_list()
-                x.
+                x.drop(columns=encoded.columns, inplace=True, errors="ignore")
+                x[encoded.columns] = encoded
         else:
             cat_features = self.cat_features
 
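The hunk above replaces the encoded categorical columns in x by dropping them first and then assigning the encoder output back. A minimal pandas sketch of that write-back pattern; the frame and encoder output below are invented for illustration:

    import pandas as pd

    # Made-up stand-ins for x and for cat_encoder.transform(...) output
    x = pd.DataFrame({"city": pd.Categorical(["a", "b", "a"]), "amount": [1.0, 2.0, 3.0]})
    encoded = pd.DataFrame({"city": [0, 1, 0]})

    # Drop the columns that are about to be overwritten, then insert the encoded
    # versions so they keep the encoder's dtypes rather than the original ones.
    x.drop(columns=encoded.columns, inplace=True, errors="ignore")
    x[encoded.columns] = encoded
    print(x.dtypes)  # city is now int64, amount stays float64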
@@ -1175,7 +1176,10 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutp
     >>> mean_squared_log_error(y_true, y_pred, multioutput=[0.3, 0.7])
     0.060...
     """
-
+    try:
+        _, y_true, y_pred, multioutput = _check_reg_targets(y_true, y_pred, multioutput)
+    except TypeError:
+        _, y_true, y_pred, sample_weight, multioutput = _check_reg_targets(y_true, y_pred, sample_weight, multioutput)
     check_consistent_length(y_true, y_pred, sample_weight)
 
     if (y_true < 0).any():
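The try/except above accommodates the two call shapes of scikit-learn's private _check_reg_targets helper that appear in this diff: one taking only multioutput, one also taking sample_weight. A generic sketch of the same fallback idiom, using a hypothetical helper rather than sklearn's private API:

    # Hypothetical helper whose signature changed between library versions.
    def _check_targets_v2(y_true, y_pred, sample_weight, multioutput):
        return y_true, y_pred, sample_weight, multioutput

    def call_with_fallback(y_true, y_pred, sample_weight, multioutput):
        try:
            # old-style call: no sample_weight argument
            return _check_targets_v2(y_true, y_pred, multioutput)
        except TypeError:
            # new-style call: sample_weight is passed as well
            return _check_targets_v2(y_true, y_pred, sample_weight, multioutput)

    print(call_with_fallback([1, 2], [1, 2], None, "uniform_average"))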
upgini/resource_bundle/strings.properties
CHANGED
@@ -123,7 +123,7 @@ train_unstable_target=Your training sample contains an unstable target event, PS
 eval_unstable_target=Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
 # eval set validation
 unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
-eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
+eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y or X only
 unsupported_x_type_eval_set=Unsupported type of X in eval_set: {}. Use pandas.DataFrame, pandas.Series or numpy.ndarray or list.
 eval_x_and_x_diff_shape=The column set in eval_set are differ from the column set in X
 unsupported_y_type_eval_set=Unsupported type of y in eval_set: {}. Use pandas.Series, numpy.ndarray or list
@@ -139,6 +139,8 @@ eval_x_is_empty=X in eval_set is empty.
 eval_y_is_empty=y in eval_set is empty.
 x_and_eval_x_diff_types=X and eval_set X has different types: {} and {}
 eval_x_has_train_samples=Eval set X has rows that are present in train set X
+oot_without_date_not_supported=Eval set {} provided as OOT but date column is missing. It will be ignored for stability check
+oot_with_online_sources_not_supported=Eval set {} provided as OOT and also provided columns for online API. It will be ignored for stability check
 
 baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
 baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and and retry
@@ -163,6 +165,7 @@ dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample
 dataset_empty_column_names=Some column names are empty. Add names please
 dataset_full_duplicates={:.5f}% of the rows are fully duplicated
 dataset_diff_target_duplicates={:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nSample of incorrect row indexes: {}
+dataset_diff_target_duplicates_oot={:.4f}% of rows ({}) in OOT eval_set are duplicates with train or another eval_set. These rows will be deleted from OOT\nSample of incorrect row indexes: {}
 dataset_train_diff_target_duplicates_fintech={:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
 dataset_eval_diff_target_duplicates_fintech={:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
 dataset_drop_old_dates=We don't have data before '2000-01-01' and removed all earlier records from the search dataset
@@ -183,6 +186,7 @@ dataset_invalid_column_type=Unsupported data type of column {}: {}
 dataset_invalid_filter=Unknown field in filter_features. Should be {'min_importance', 'max_psi', 'max_count', 'selected_features'}.
 dataset_too_big_file=Too big size of dataframe X for processing. Please reduce number of rows or columns
 dataset_transform_diff_fit=You try to enrich dataset that column names are different from the train dataset column names that you used on the fit stage. Please make the column names the same as in the train dataset and restart.
+oot_eval_set_too_small_after_dedup=OOT eval set {} has less than 1000 rows after deduplication. It will be ignored for stability check
 binary_small_dataset=The least populated class in Target contains less than 1000 rows.\nSmall numbers of observations may negatively affect the number of selected features and quality of your ML model.\nUpgini recommends you increase the number of observations in the least populated class.\n
 all_search_keys_invalid=All search keys are invalid
 all_emails_invalid=All values in column {} are invalid emails # Metrics validation
@@ -255,6 +259,7 @@ features_info_provider=Provider
 features_info_source=Source
 features_info_name=Feature name
 features_info_shap=SHAP value
+features_info_psi=PSI value
 features_info_hitrate=Coverage %
 features_info_type=Type
 # Deprecated
upgini/sampler/base.py
CHANGED
@@ -1,6 +1,7 @@
 """
 Base class for the under-sampling method.
 """
+
 # Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
 # License: MIT
 
@@ -12,6 +13,7 @@ import numpy as np
 from sklearn.base import BaseEstimator
 from sklearn.preprocessing import label_binarize
 from sklearn.utils.multiclass import check_classification_targets
+from sklearn.utils.validation import check_X_y
 
 from .utils import ArraysTransformer, check_sampling_strategy, check_target_type
 
@@ -125,7 +127,7 @@ class BaseSampler(SamplerMixin):
         if accept_sparse is None:
             accept_sparse = ["csr", "csc"]
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
-        X, y =
+        X, y = check_X_y(X, y, accept_sparse=accept_sparse, dtype=None, ensure_all_finite=False)
         return X, y, binarize_y
 
     def _more_tags(self):
upgini/sampler/random_under_sampler.py
CHANGED
@@ -80,14 +80,24 @@ RandomUnderSampler # doctest: +NORMALIZE_WHITESPACE
 
     def _check_X_y(self, X, y):
         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
-
-        X,
-
-
-
-
-
-
+        try:
+            X, y = self._validate_data(
+                X,
+                y,
+                reset=True,
+                accept_sparse=["csr", "csc"],
+                dtype=None,
+                force_all_finite=False,
+            )
+        except AttributeError:
+            from sklearn.utils.validation import check_X_y
+            X, y = check_X_y(
+                X,
+                y,
+                accept_sparse=["csr", "csc"],
+                dtype=None,
+                ensure_all_finite=False,
+            )
         return X, y, binarize_y
 
     def _fit_resample(self, X, y):
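Both sampler hunks follow the same compatibility pattern: prefer the estimator-level _validate_data where it is available, and fall back to the public check_X_y helper when it is not; note that the fallback path also switches the keyword from force_all_finite to ensure_all_finite. A standalone sketch of that pattern (sampler stands in for any estimator-like object; exact version boundaries are not asserted here):

    def validate_X_y_compat(sampler, X, y):
        try:
            # Older scikit-learn: estimators expose _validate_data.
            return sampler._validate_data(
                X, y, reset=True, accept_sparse=["csr", "csc"], dtype=None, force_all_finite=False
            )
        except AttributeError:
            # Newer scikit-learn: use the public validator, which spells the
            # "allow NaN/inf" switch as ensure_all_finite.
            from sklearn.utils.validation import check_X_y

            return check_X_y(X, y, accept_sparse=["csr", "csc"], dtype=None, ensure_all_finite=False)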
upgini/utils/deduplicate_utils.py
CHANGED
@@ -134,8 +134,13 @@ def remove_fintech_duplicates(
     logger.info(f"Train dataset shape after clean fintech duplicates: {train_df.shape}")
 
     # Process each eval_set part separately
+    oot_eval_dfs = []
     new_eval_dfs = []
     for i, eval_df in enumerate(eval_dfs, 1):
+        # Skip OOT
+        if eval_df[TARGET].isna().all():
+            oot_eval_dfs.append(eval_df)
+            continue
         logger.info(f"Eval {i} dataset shape before clean fintech duplicates: {eval_df.shape}")
         cleaned_eval_df, eval_warning = process_df(eval_df, i)
         if eval_warning:
@@ -145,8 +150,8 @@ def remove_fintech_duplicates(
 
     # Combine the processed train and eval parts back into one dataset
     logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
-    if new_eval_dfs:
-        df = pd.concat([train_df] + new_eval_dfs)
+    if new_eval_dfs or oot_eval_dfs:
+        df = pd.concat([train_df] + new_eval_dfs + oot_eval_dfs, ignore_index=False)
     else:
         df = train_df
     logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
@@ -190,16 +195,59 @@ def clean_full_duplicates(
     msg = None
     if TARGET in df.columns:
         unique_columns.remove(TARGET)
-
+
+        # Separate rows to exclude from deduplication:
+        # for each eval_set_index != 0 check separately, all TARGET values are NaN
+        df_for_dedup = df
+        oot_df = None
+
+        if EVAL_SET_INDEX in df.columns:
+            oot_eval_dfs = []
+            other_dfs = []
+            for eval_idx in df[EVAL_SET_INDEX].unique():
+                eval_subset = df[df[EVAL_SET_INDEX] == eval_idx]
+                # Check that all TARGET values for this specific eval_set_index are NaN
+                if eval_idx != 0 and eval_subset[TARGET].isna().all():
+                    oot_eval_dfs.append(eval_subset)
+                    logger.info(
+                        f"Excluded {len(eval_subset)} rows from deduplication "
+                        f"(eval_set_index={eval_idx} and all TARGET values are NaN)"
+                    )
+                else:
+                    other_dfs.append(eval_subset)
+
+            if oot_eval_dfs:
+                oot_df = pd.concat(oot_eval_dfs, ignore_index=False)
+                df_for_dedup = pd.concat(other_dfs, ignore_index=False)
+            else:
+                df_for_dedup = df
+
+        marked_duplicates = df_for_dedup.duplicated(subset=unique_columns, keep=False)
         if marked_duplicates.sum() > 0:
-            dups_indices =
-            nrows_after_tgt_dedup = len(
-            num_dup_rows =
-            share_tgt_dedup = 100 * num_dup_rows /
+            dups_indices = df_for_dedup[marked_duplicates].index.to_list()[:100]
+            nrows_after_tgt_dedup = len(df_for_dedup.drop_duplicates(subset=unique_columns, keep=False))
+            num_dup_rows = len(df_for_dedup) - nrows_after_tgt_dedup
+            share_tgt_dedup = 100 * num_dup_rows / len(df_for_dedup)
 
             msg = bundle.get("dataset_diff_target_duplicates").format(share_tgt_dedup, num_dup_rows, dups_indices)
-
-            logger.info(f"Dataset shape after clean invalid target duplicates: {
+            df_for_dedup = df_for_dedup.drop_duplicates(subset=unique_columns, keep=False)
+            logger.info(f"Dataset shape after clean invalid target duplicates: {df_for_dedup.shape}")
+        # Combine back excluded rows
+        if oot_df is not None:
+            df = pd.concat([df_for_dedup, oot_df], ignore_index=False)
+            marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
+            if marked_duplicates.sum() > 0:
+                dups_indices = df[marked_duplicates].index.to_list()[:100]
+                nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
+                num_dup_rows = len(df) - nrows_after_tgt_dedup
+                share_tgt_dedup = 100 * num_dup_rows / len(df)
+                msg = bundle.get("dataset_diff_target_duplicates_oot").format(
+                    share_tgt_dedup, num_dup_rows, dups_indices
+                )
+                df = df.drop_duplicates(subset=unique_columns, keep="first")
+            logger.info(f"Final dataset shape after adding back excluded rows: {df.shape}")
+        else:
+            df = df_for_dedup
 
     return df, msg
 
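A toy reproduction of the OOT handling above: eval-set slices whose target is entirely NaN sit out the target-based deduplication and are concatenated back afterwards. The column names "eval_set_index" and "target" are chosen here to mirror upgini's EVAL_SET_INDEX and TARGET constants and are purely illustrative:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        "eval_set_index": [0, 0, 1, 1],
        "feature": [1, 1, 2, 3],
        "target": [1.0, 0.0, np.nan, np.nan],
    })

    oot_parts, other_parts = [], []
    for idx in df["eval_set_index"].unique():
        part = df[df["eval_set_index"] == idx]
        if idx != 0 and part["target"].isna().all():
            oot_parts.append(part)      # out-of-time slice: kept as-is
        else:
            other_parts.append(part)    # participates in the duplicate check

    # Rows that are duplicates except for their y value are dropped entirely.
    deduped = pd.concat(other_parts).drop_duplicates(subset=["feature"], keep=False)
    result = pd.concat([deduped] + oot_parts, ignore_index=False)
    print(result)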
upgini/utils/feature_info.py
CHANGED
@@ -27,6 +27,7 @@ class FeatureInfo:
     doc_link: str
     data_provider_link: str
     data_source_link: str
+    psi_value: Optional[float] = None
 
     @staticmethod
     def from_metadata(
@@ -47,12 +48,14 @@ class FeatureInfo:
             doc_link=feature_meta.doc_link,
             data_provider_link=feature_meta.data_provider_link,
             data_source_link=feature_meta.data_source_link,
+            psi_value=feature_meta.psi_value,
         )
 
     def to_row(self, bundle: ResourceBundle) -> Dict[str, str]:
         return {
             bundle.get("features_info_name"): self.name,
             bundle.get("features_info_shap"): self.rounded_shap,
+            bundle.get("features_info_psi"): self.psi_value,
             bundle.get("features_info_hitrate"): self.hitrate,
             bundle.get("features_info_value_preview"): self.value_preview,
             bundle.get("features_info_provider"): self.provider,
@@ -64,6 +67,7 @@ class FeatureInfo:
         return {
             bundle.get("features_info_name"): self.internal_name,
             bundle.get("features_info_shap"): self.rounded_shap,
+            bundle.get("features_info_psi"): self.psi_value,
             bundle.get("features_info_hitrate"): self.hitrate,
             bundle.get("features_info_value_preview"): self.value_preview,
             bundle.get("features_info_provider"): self.internal_provider,
@@ -76,6 +80,7 @@ class FeatureInfo:
             bundle.get("features_info_name"): self.internal_name,
             "feature_link": self.doc_link,
             bundle.get("features_info_shap"): self.rounded_shap,
+            bundle.get("features_info_psi"): self.psi_value,
             bundle.get("features_info_hitrate"): self.hitrate,
             bundle.get("features_info_value_preview"): self.value_preview,
             bundle.get("features_info_provider"): self.internal_provider,
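The new field defaults to None, so feature metadata produced before the PSI check still loads; the value then flows into the report rows via the "features_info_psi" label added to strings.properties above. A small stand-in sketch of that optional-field behaviour (this is not the real FeatureInfo class):

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class FeatureInfoSketch:
        name: str
        rounded_shap: float
        psi_value: Optional[float] = None  # new, optional, backwards compatible

    print(FeatureInfoSketch("f_example", 0.042))                  # psi_value=None
    print(FeatureInfoSketch("f_example", 0.042, psi_value=0.08))  # populated when PSI was computed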
upgini/utils/psi.py
ADDED
@@ -0,0 +1,294 @@
+import itertools
+import logging
+import operator
+from functools import reduce
+from typing import Callable, Dict, Optional
+
+import more_itertools
+import numpy as np
+import pandas as pd
+from pandas.api.types import is_numeric_dtype
+from pydantic import BaseModel
+
+from upgini.metadata import TARGET, ModelTaskType
+
+
+class StabilityParams(BaseModel):
+    threshold: float = 999
+    n_intervals: int = 12
+    min_intervals: int = 10
+    max_intervals: Optional[int] = None
+    min_values_in_interval: Optional[int] = None
+    n_bins: int = 10
+    min_values_in_bin: Optional[int] = None
+    cat_top_pct: float = 0.7
+    agg: str = "max"
+
+
+DEFAULT_TARGET_PARAMS = StabilityParams(
+    n_intervals=12,
+    min_intervals=10,
+    max_intervals=None,
+    min_values_in_interval=None,
+    n_bins=5,
+)
+
+DEFAULT_FEATURES_PARAMS = StabilityParams(
+    n_intervals=12,
+    min_intervals=10,
+    max_intervals=None,
+    min_values_in_interval=None,
+    n_bins=10,
+)
+
+
+def calculate_sparsity_psi(
+    df: pd.DataFrame,
+    cat_features: list[str],
+    date_column: str,
+    logger: logging.Logger,
+    model_task_type: ModelTaskType,
+    psi_features_params: StabilityParams = DEFAULT_FEATURES_PARAMS,
+    psi_target_params: StabilityParams = DEFAULT_TARGET_PARAMS,
+) -> Dict[str, float]:
+    sparse_features = df.columns[df.isna().sum() > 0].to_list()
+    if len(sparse_features) > 0:
+        logger.info(f"Calculating sparsity stability for {len(sparse_features)} sparse features")
+        sparse_df = df[sparse_features].notna()
+        sparse_df[date_column] = df[date_column]
+        return calculate_features_psi(
+            sparse_df,
+            cat_features,
+            date_column,
+            logger,
+            model_task_type,
+            psi_target_params,
+            psi_features_params,
+        )
+    return {}
+
+
+def calculate_features_psi(
+    df: pd.DataFrame,
+    cat_features: list[str],
+    date_column: str,
+    logger: logging.Logger,
+    model_task_type: ModelTaskType,
+    psi_features_params: StabilityParams = DEFAULT_FEATURES_PARAMS,
+    psi_target_params: StabilityParams = DEFAULT_TARGET_PARAMS,
+) -> Dict[str, float]:
+    empty_res = pd.Series(index=df.columns, data=0)
+
+    if not is_numeric_dtype(df[date_column]):
+        df[date_column] = pd.to_datetime(df[date_column]).dt.floor("D").astype(np.int64) / 10**6
+
+    n_months = pd.to_datetime(df[date_column], unit="ms").dt.month.nunique()
+
+    if TARGET in df.columns:
+        psi_target_params.n_intervals = min(
+            psi_target_params.max_intervals or np.inf, max(psi_target_params.min_intervals, n_months)
+        )
+        logger.info(f"Setting {psi_target_params.n_intervals} intervals for target PSI check")
+
+        logger.info(f"Calculating target PSI for {psi_target_params.n_intervals} intervals")
+        reference_mask, current_masks = _split_intervals(df, date_column, psi_target_params.n_intervals, logger)
+
+        if psi_target_params.min_values_in_interval is not None and any(
+            len(mask) < psi_target_params.min_values_in_interval
+            for mask in itertools.chain(current_masks, [reference_mask])
+        ):
+            logger.info(
+                f"Some intervals have less than {psi_target_params.min_values_in_interval} values. Skip PSI check"
+            )
+            return empty_res
+
+        target_agg_func = _get_agg_func(psi_target_params.agg)
+        logger.info(f"Calculating target PSI with agg function {target_agg_func}")
+        target_psi = _stability_agg(
+            [df[TARGET][cur] for cur in current_masks],
+            reference_data=df[TARGET][reference_mask],
+            is_numerical=model_task_type == ModelTaskType.REGRESSION,
+            min_values_in_bin=psi_target_params.min_values_in_bin,
+            n_bins=psi_target_params.n_bins,
+            cat_top_pct=psi_target_params.cat_top_pct,
+            agg_func=target_agg_func,
+        )
+        if target_psi is None:
+            logger.info("Cannot determine target PSI. Skip feature PSI check")
+            return pd.Series(index=df.columns, data=0)
+
+        if target_psi > psi_target_params.threshold:
+            logger.info(
+                f"Target PSI {target_psi} is more than threshold {psi_target_params.threshold}. Skip feature PSI check"
+            )
+            return empty_res
+
+    psi_features_params.n_intervals = min(
+        psi_features_params.max_intervals or np.inf, max(psi_features_params.min_intervals, n_months)
+    )
+    logger.info(f"Setting {psi_features_params.n_intervals} intervals for features PSI check")
+
+    logger.info(f"Calculating PSI for {len(df.columns)} features")
+    reference_mask, current_masks = _split_intervals(df, date_column, psi_features_params.n_intervals, logger)
+    features_agg_func = _get_agg_func(psi_features_params.agg)
+    logger.info(f"Calculating features PSI with agg function {features_agg_func}")
+    psi_values = [
+        _stability_agg(
+            [df[feature][cur] for cur in current_masks],
+            reference_data=df[feature][reference_mask],
+            is_numerical=feature not in cat_features,
+            min_values_in_bin=psi_features_params.min_values_in_bin,
+            n_bins=psi_features_params.n_bins,
+            cat_top_pct=psi_features_params.cat_top_pct,
+            agg_func=features_agg_func,
+        )
+        for feature in df.columns
+        if feature not in [TARGET, date_column]
+    ]
+    return {feature: psi for feature, psi in zip(df.columns, psi_values)}
+
+
+def _split_intervals(
+    df: pd.DataFrame, date_column: str, n_intervals: int, logger: logging.Logger
+) -> tuple[pd.Series, list[pd.Series]]:
+    date_series = df[date_column]
+
+    # Check if we have enough unique values for the requested number of intervals
+    unique_values = date_series.nunique()
+
+    # If we have fewer unique values than requested intervals, adjust n_intervals
+    if unique_values < n_intervals:
+        logger.warning(f"Date column '{date_column}' has only {unique_values} unique values")
+
+    time_intervals = pd.qcut(date_series, q=n_intervals, duplicates="drop")
+    interval_labels = time_intervals.unique()
+    reference_mask = time_intervals == interval_labels[0]
+    current_masks = [time_intervals == label for label in interval_labels[1:]]
+    return reference_mask, current_masks
+
+
+def _get_agg_func(agg: str):
+    np_agg = getattr(np, agg, None)
+    if np_agg is None and agg.startswith("q"):
+        q = int(agg[1:])
+        return lambda x: np.quantile(list(x), q / 100, method="higher")
+    return np_agg
+
+
+def _psi(reference_percent: np.ndarray, current_percent: np.ndarray) -> float:
+    return np.sum((reference_percent - current_percent) * np.log(reference_percent / current_percent))
+
+
+def _stability_agg(
+    current_data: list[pd.Series],
+    reference_data: pd.Series,
+    is_numerical: bool = True,
+    min_values_in_bin: int | None = None,
+    n_bins: int = 10,
+    cat_top_pct: float = 0.7,
+    agg_func: Callable = max,
+) -> float | None:
+    """Calculate the PSI
+    Args:
+        current_data: current data
+        reference_data: reference data
+        is_numerical: whether the feature is numerical
+        reference_ratio: ratio of current data to use as reference if reference_data is not provided
+        min_values_in_bin: minimum number of values in a bin to calculate PSI
+        n_bins: number of bins to use for numerical features
+    Returns:
+        psi_value: calculated PSI
+    """
+    reference, current = _get_binned_data(reference_data, current_data, is_numerical, n_bins, cat_top_pct)
+
+    if len(reference) == 0 or len(current) == 0:
+        return None
+
+    nonempty_current = [i for i, c in enumerate(current) if len(c) > 0]
+    current = [current[i] for i in nonempty_current]
+    current_data = [current_data[i] for i in nonempty_current]
+
+    if len(current) == 0:
+        return None
+
+    if min_values_in_bin is not None and (
+        np.array(reference).min() < min_values_in_bin or any(np.array(c).min() < min_values_in_bin for c in current)
+    ):
+        return None
+
+    reference = _fill_zeroes(reference / len(reference_data))
+    current = [_fill_zeroes(c / len(d)) for c, d in zip(current, current_data)]
+
+    psi_value = agg_func([_psi(reference, c) for c in current])
+
+    return psi_value
+
+
+def _get_binned_data(
+    reference_data: pd.Series,
+    current_data: list[pd.Series],
+    is_numerical: bool,
+    n_bins: int,
+    cat_top_pct: float,
+):
+    """Split variable into n buckets based on reference quantiles
+    Args:
+        reference_data: reference data
+        current_data: current data
+        feature_type: feature type
+        n: number of quantiles
+    Returns:
+        reference_counts: number of records in each bucket for reference
+        current_counts: number of records in each bucket for current
+    """
+    n_vals = reference_data.nunique()
+
+    if is_numerical and n_vals > 20:
+        bins = _get_bin_edges(reference_data, n_bins)
+        reference_counts = np.histogram(reference_data, bins)[0]
+        current_counts = [np.histogram(d, bins)[0] for d in current_data]
+
+    else:
+        keys = _get_unique_not_nan_values_list_from_series([reference_data] + current_data)
+        ref_feature_dict = {**dict.fromkeys(keys, 0), **dict(reference_data.value_counts())}
+        current_feature_dict = [{**dict.fromkeys(keys, 0), **dict(d.value_counts())} for d in current_data]
+        key_dict = more_itertools.map_reduce(
+            itertools.chain(ref_feature_dict.items(), *(d.items() for d in current_feature_dict)),
+            keyfunc=operator.itemgetter(0),
+            valuefunc=operator.itemgetter(1),
+            reducefunc=sum,
+        )
+        key_dict = pd.Series(key_dict)
+        keys = key_dict.index[key_dict.rank(pct=True) >= cat_top_pct]
+        reference_counts = np.array([ref_feature_dict[key] for key in keys])
+        current_counts = [np.array([current_feature_dict[i][key] for key in keys]) for i in range(len(current_data))]
+
+    reference_counts = np.append(reference_counts, reference_data.isna().sum())
+    current_counts = [np.append(d, current_data[i].isna().sum()) for i, d in enumerate(current_counts)]
+
+    return reference_counts, current_counts
+
+
+def _fill_zeroes(percents: np.ndarray) -> np.ndarray:
+    eps = 0.0001
+    if (percents == 0).all():
+        np.place(percents, percents == 0, eps)
+    else:
+        min_value = min(percents[percents != 0])
+        if min_value <= eps:
+            np.place(percents, percents == 0, eps)
+        else:
+            np.place(percents, percents == 0, min_value / 10**6)
+    return percents
+
+
+def _get_bin_edges(data: pd.Series, n_bins: int) -> np.ndarray:
+    bins = np.nanquantile(data, np.linspace(0, 1, n_bins + 1))
+    bins[0] = -np.inf
+    bins[-1] = np.inf
+    return bins
+
+
+def _get_unique_not_nan_values_list_from_series(series: list[pd.Series]) -> list:
+    """Get unique values from current and reference series, drop NaNs"""
+    return list(reduce(set.union, (set(s.dropna().unique()) for s in series)))
upgini/utils/sample_utils.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Callable, List, Optional
 import numpy as np
 import pandas as pd
 
-from upgini.metadata import SYSTEM_RECORD_ID, CVType, ModelTaskType
+from upgini.metadata import EVAL_SET_INDEX, SYSTEM_RECORD_ID, TARGET, CVType, ModelTaskType
 from upgini.resource_bundle import ResourceBundle, get_custom_bundle
 from upgini.utils.target_utils import balance_undersample
 from upgini.utils.ts_utils import get_most_frequent_time_unit, trunc_datetime
@@ -117,6 +117,22 @@ def sample(
             **kwargs,
         )
 
+    # separate OOT
+    oot_dfs = []
+    other_dfs = []
+    if EVAL_SET_INDEX in df.columns:
+        for eval_set_index in df[EVAL_SET_INDEX].unique():
+            eval_df = df[df[EVAL_SET_INDEX] == eval_set_index]
+            if TARGET in eval_df.columns and eval_df[TARGET].isna().all():
+                oot_dfs.append(eval_df)
+            else:
+                other_dfs.append(eval_df)
+    if len(oot_dfs) > 0:
+        oot_df = pd.concat(oot_dfs, ignore_index=False)
+        df = pd.concat(other_dfs, ignore_index=False)
+    else:
+        oot_df = None
+
     num_samples = _num_samples(df)
     if num_samples > fit_sample_threshold:
         logger.info(
@@ -126,6 +142,18 @@ def sample(
         df = df.sample(n=fit_sample_rows, random_state=random_state)
         logger.info(f"Shape after threshold resampling: {df.shape}")
 
+    if oot_df is not None:
+        num_samples_oot = _num_samples(oot_df)
+        if num_samples_oot > fit_sample_threshold:
+            logger.info(
+                f"OOT has size {num_samples_oot} more than threshold {fit_sample_threshold} "
+                f"and will be downsampled to {fit_sample_rows}"
+            )
+            oot_df = oot_df.sample(n=fit_sample_rows, random_state=random_state)
+        df = pd.concat([df, oot_df], ignore_index=False)
+
+    logger.info(f"Dataset size after downsampling: {len(df)}")
+
     return df
 
 
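The sampling change keeps the OOT slice out of the main threshold check and caps it separately before recombining. A toy sketch of that flow with invented thresholds:

    import pandas as pd

    fit_sample_threshold, fit_sample_rows, random_state = 100, 80, 42

    df = pd.DataFrame({"x": range(150)})      # train + labelled eval rows
    oot_df = pd.DataFrame({"x": range(300)})  # OOT rows (target entirely NaN)

    # Each part is downsampled against the threshold independently.
    if len(df) > fit_sample_threshold:
        df = df.sample(n=fit_sample_rows, random_state=random_state)
    if len(oot_df) > fit_sample_threshold:
        oot_df = oot_df.sample(n=fit_sample_rows, random_state=random_state)

    combined = pd.concat([df, oot_df], ignore_index=False)
    print(len(combined))  # 160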
@@ -175,7 +203,7 @@ def sample_time_series_train_eval(
         )
         if logger is not None:
             logger.info(f"Eval set size: {len(eval_df)}")
-        df = pd.concat([train_df, eval_df])
+        df = pd.concat([train_df, eval_df], ignore_index=False)
 
     elif len(train_df) > max_rows:
         df = sample_time_series_trunc(
upgini/utils/target_utils.py
CHANGED
@@ -6,7 +6,7 @@ import pandas as pd
 from pandas.api.types import is_bool_dtype, is_datetime64_any_dtype, is_numeric_dtype
 
 from upgini.errors import ValidationError
-from upgini.metadata import SYSTEM_RECORD_ID, ModelTaskType
+from upgini.metadata import EVAL_SET_INDEX, SYSTEM_RECORD_ID, ModelTaskType
 from upgini.resource_bundle import ResourceBundle, get_custom_bundle, bundle
 from upgini.sampler.random_under_sampler import RandomUnderSampler
 
@@ -132,6 +132,11 @@ def balance_undersample(
     if SYSTEM_RECORD_ID not in df.columns:
         raise Exception("System record id must be presented for undersampling")
 
+    # Rebalance and send to server only train data
+    # because eval set data will be sent separately in transform for metrics
+    if EVAL_SET_INDEX in df.columns:
+        df = df[df[EVAL_SET_INDEX] == 0]
+
     target = df[target_column].copy()
 
     vc = target.value_counts()
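The guard added above makes undersampling operate on train rows only: when an eval_set_index column is present, everything except index 0 is dropped before class counts are computed. A minimal illustration; the column name mirrors upgini's EVAL_SET_INDEX constant:

    import pandas as pd

    df = pd.DataFrame({
        "eval_set_index": [0, 0, 0, 1, 1],
        "target": [0, 0, 1, 1, 0],
    })

    # Keep only train rows before computing class balance
    if "eval_set_index" in df.columns:
        df = df[df["eval_set_index"] == 0]

    print(df["target"].value_counts())  # train rows only: 0 -> 2, 1 -> 1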