upgini 1.2.113a3974.dev1__py3-none-any.whl → 1.2.114__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/autofe/date.py +8 -4
- upgini/dataset.py +48 -78
- upgini/features_enricher.py +726 -516
- upgini/http.py +15 -19
- upgini/metadata.py +1 -10
- upgini/metrics.py +6 -2
- upgini/resource_bundle/strings.properties +8 -6
- upgini/sampler/base.py +3 -1
- upgini/sampler/random_under_sampler.py +18 -8
- upgini/search_task.py +6 -0
- upgini/utils/config.py +43 -0
- upgini/utils/deduplicate_utils.py +57 -9
- upgini/utils/display_utils.py +1 -1
- upgini/utils/feature_info.py +5 -0
- upgini/utils/hash_utils.py +159 -0
- upgini/utils/psi.py +300 -0
- upgini/utils/sample_utils.py +45 -42
- upgini/utils/target_utils.py +53 -2
- {upgini-1.2.113a3974.dev1.dist-info → upgini-1.2.114.dist-info}/METADATA +62 -32
- {upgini-1.2.113a3974.dev1.dist-info → upgini-1.2.114.dist-info}/RECORD +23 -20
- {upgini-1.2.113a3974.dev1.dist-info → upgini-1.2.114.dist-info}/WHEEL +1 -1
- {upgini-1.2.113a3974.dev1.dist-info → upgini-1.2.114.dist-info}/licenses/LICENSE +0 -0
upgini/utils/psi.py
ADDED
@@ -0,0 +1,300 @@
+import itertools
+import logging
+import operator
+from functools import reduce
+from typing import Callable, Dict, Optional
+
+import more_itertools
+import numpy as np
+import pandas as pd
+from pandas.api.types import is_numeric_dtype
+from pydantic import BaseModel
+
+from upgini.metadata import TARGET, ModelTaskType
+
+
+class StabilityParams(BaseModel):
+    threshold: float = 999
+    n_intervals: int = 12
+    min_intervals: int = 10
+    max_intervals: Optional[int] = None
+    min_values_in_interval: Optional[int] = None
+    n_bins: int = 10
+    min_values_in_bin: Optional[int] = None
+    cat_top_pct: float = 0.7
+    agg: str = "max"
+
+
+DEFAULT_TARGET_PARAMS = StabilityParams(
+    n_intervals=12,
+    min_intervals=10,
+    max_intervals=None,
+    min_values_in_interval=None,
+    n_bins=5,
+)
+
+DEFAULT_FEATURES_PARAMS = StabilityParams(
+    n_intervals=12,
+    min_intervals=10,
+    max_intervals=None,
+    min_values_in_interval=None,
+    n_bins=10,
+)
+
+
+def calculate_sparsity_psi(
+    df: pd.DataFrame,
+    cat_features: list[str],
+    date_column: str,
+    logger: logging.Logger,
+    model_task_type: ModelTaskType,
+    stability_agg_func: str | None = None,
+    psi_features_params: StabilityParams = DEFAULT_FEATURES_PARAMS,
+    psi_target_params: StabilityParams = DEFAULT_TARGET_PARAMS,
+) -> Dict[str, float]:
+    sparse_features = df.columns[df.isna().sum() > 0].to_list()
+    if len(sparse_features) > 0:
+        logger.info(f"Calculating sparsity stability for {len(sparse_features)} sparse features")
+        sparse_df = df[sparse_features].notna()
+        sparse_df[date_column] = df[date_column]
+        return calculate_features_psi(
+            sparse_df,
+            cat_features,
+            date_column,
+            logger,
+            model_task_type,
+            stability_agg_func,
+            psi_target_params,
+            psi_features_params,
+        )
+    return {}
+
+
+def calculate_features_psi(
+    df: pd.DataFrame,
+    cat_features: list[str],
+    date_column: str,
+    logger: logging.Logger,
+    model_task_type: ModelTaskType,
+    stability_agg_func: str | None = None,
+    psi_features_params: StabilityParams = DEFAULT_FEATURES_PARAMS,
+    psi_target_params: StabilityParams = DEFAULT_TARGET_PARAMS,
+) -> dict[str, float]:
+    empty_res = {col: 0.0 for col in df.columns if col not in [TARGET, date_column]}
+
+    if not is_numeric_dtype(df[date_column]):
+        df[date_column] = pd.to_datetime(df[date_column]).dt.floor("D").astype(np.int64) / 10**6
+
+    # Filter out rows with missing dates
+    df = df[df[date_column].notna()].copy()
+
+    n_months = pd.to_datetime(df[date_column], unit="ms").dt.month.nunique()
+
+    if TARGET in df.columns:
+        psi_target_params.n_intervals = min(
+            psi_target_params.max_intervals or np.inf, max(psi_target_params.min_intervals, n_months)
+        )
+        logger.info(f"Setting {psi_target_params.n_intervals} intervals for target PSI check")
+
+        logger.info(f"Calculating target PSI for {psi_target_params.n_intervals} intervals")
+        reference_mask, current_masks = _split_intervals(df, date_column, psi_target_params.n_intervals, logger)
+
+        if psi_target_params.min_values_in_interval is not None and any(
+            len(mask) < psi_target_params.min_values_in_interval
+            for mask in itertools.chain(current_masks, [reference_mask])
+        ):
+            logger.info(
+                f"Some intervals have less than {psi_target_params.min_values_in_interval} values. Skip PSI check"
+            )
+            return empty_res
+
+        target_agg_func = _get_agg_func(stability_agg_func or psi_target_params.agg)
+        logger.info(f"Calculating target PSI with agg function {target_agg_func}")
+        target_psi = _stability_agg(
+            [df[TARGET][cur] for cur in current_masks],
+            reference_data=df[TARGET][reference_mask],
+            is_numerical=model_task_type == ModelTaskType.REGRESSION,
+            min_values_in_bin=psi_target_params.min_values_in_bin,
+            n_bins=psi_target_params.n_bins,
+            cat_top_pct=psi_target_params.cat_top_pct,
+            agg_func=target_agg_func,
+        )
+        if target_psi is None or np.isnan(target_psi):
+            logger.info("Cannot determine target PSI. Skip feature PSI check")
+            return empty_res
+
+        if target_psi > psi_target_params.threshold:
+            logger.info(
+                f"Target PSI {target_psi} is more than threshold {psi_target_params.threshold}. Skip feature PSI check"
+            )
+            return empty_res
+
+    psi_features_params.n_intervals = min(
+        psi_features_params.max_intervals or np.inf, max(psi_features_params.min_intervals, n_months)
+    )
+    logger.info(f"Setting {psi_features_params.n_intervals} intervals for features PSI check")
+
+    logger.info(f"Calculating PSI for {len(df.columns)} features")
+    reference_mask, current_masks = _split_intervals(df, date_column, psi_features_params.n_intervals, logger)
+    features_agg_func = _get_agg_func(stability_agg_func or psi_features_params.agg)
+    logger.info(f"Calculating features PSI with agg function {features_agg_func}")
+    psi_values = [
+        _stability_agg(
+            [df[feature][cur] for cur in current_masks],
+            reference_data=df[feature][reference_mask],
+            is_numerical=feature not in cat_features,
+            min_values_in_bin=psi_features_params.min_values_in_bin,
+            n_bins=psi_features_params.n_bins,
+            cat_top_pct=psi_features_params.cat_top_pct,
+            agg_func=features_agg_func,
+        )
+        for feature in df.columns
+        if feature not in [TARGET, date_column]
+    ]
+    return {feature: psi for feature, psi in zip(df.columns, psi_values)}
+
+
+def _split_intervals(
+    df: pd.DataFrame, date_column: str, n_intervals: int, logger: logging.Logger
+) -> tuple[pd.Series, list[pd.Series]]:
+    date_series = df[date_column]
+
+    # Check if we have enough unique values for the requested number of intervals
+    unique_values = date_series.nunique()
+
+    # If we have fewer unique values than requested intervals, adjust n_intervals
+    if unique_values < n_intervals:
+        logger.warning(f"Date column '{date_column}' has only {unique_values} unique values")
+
+    time_intervals = pd.qcut(date_series, q=n_intervals, duplicates="drop")
+    interval_labels = time_intervals.unique()
+    reference_mask = time_intervals == interval_labels[0]
+    current_masks = [time_intervals == label for label in interval_labels[1:]]
+    return reference_mask, current_masks
+
+
+def _get_agg_func(agg: str):
+    np_agg = getattr(np, agg, None)
+    if np_agg is None and agg.startswith("q"):
+        q = int(agg[1:])
+        return lambda x: np.quantile(list(x), q / 100, method="higher")
+    return np_agg
+
+
+def _psi(reference_percent: np.ndarray, current_percent: np.ndarray) -> float:
+    return np.sum((reference_percent - current_percent) * np.log(reference_percent / current_percent))
+
+
+def _stability_agg(
+    current_data: list[pd.Series],
+    reference_data: pd.Series,
+    is_numerical: bool = True,
+    min_values_in_bin: int | None = None,
+    n_bins: int = 10,
+    cat_top_pct: float = 0.7,
+    agg_func: Callable = max,
+) -> float | None:
+    """Calculate the PSI
+    Args:
+        current_data: current data
+        reference_data: reference data
+        is_numerical: whether the feature is numerical
+        reference_ratio: ratio of current data to use as reference if reference_data is not provided
+        min_values_in_bin: minimum number of values in a bin to calculate PSI
+        n_bins: number of bins to use for numerical features
+    Returns:
+        psi_value: calculated PSI
+    """
+    reference, current = _get_binned_data(reference_data, current_data, is_numerical, n_bins, cat_top_pct)
+
+    if len(reference) == 0 or len(current) == 0:
+        return None
+
+    nonempty_current = [i for i, c in enumerate(current) if len(c) > 0]
+    current = [current[i] for i in nonempty_current]
+    current_data = [current_data[i] for i in nonempty_current]
+
+    if len(current) == 0:
+        return None
+
+    if min_values_in_bin is not None and (
+        np.array(reference).min() < min_values_in_bin or any(np.array(c).min() < min_values_in_bin for c in current)
+    ):
+        return None
+
+    reference = _fill_zeroes(reference / len(reference_data))
+    current = [_fill_zeroes(c / len(d)) for c, d in zip(current, current_data)]
+
+    psi_value = agg_func([_psi(reference, c) for c in current])
+
+    return float(psi_value)
+
+
+def _get_binned_data(
+    reference_data: pd.Series,
+    current_data: list[pd.Series],
+    is_numerical: bool,
+    n_bins: int,
+    cat_top_pct: float,
+):
+    """Split variable into n buckets based on reference quantiles
+    Args:
+        reference_data: reference data
+        current_data: current data
+        feature_type: feature type
+        n: number of quantiles
+    Returns:
+        reference_counts: number of records in each bucket for reference
+        current_counts: number of records in each bucket for current
+    """
+    n_vals = reference_data.nunique()
+
+    if is_numerical and n_vals > 20:
+        bins = _get_bin_edges(reference_data, n_bins)
+        reference_counts = np.histogram(reference_data, bins)[0]
+        current_counts = [np.histogram(d, bins)[0] for d in current_data]
+
+    else:
+        keys = _get_unique_not_nan_values_list_from_series([reference_data] + current_data)
+        ref_feature_dict = {**dict.fromkeys(keys, 0), **dict(reference_data.value_counts())}
+        current_feature_dict = [{**dict.fromkeys(keys, 0), **dict(d.value_counts())} for d in current_data]
+        key_dict = more_itertools.map_reduce(
+            itertools.chain(ref_feature_dict.items(), *(d.items() for d in current_feature_dict)),
+            keyfunc=operator.itemgetter(0),
+            valuefunc=operator.itemgetter(1),
+            reducefunc=sum,
+        )
+        key_dict = pd.Series(key_dict)
+        keys = key_dict.index[key_dict.rank(pct=True) >= cat_top_pct]
+        reference_counts = np.array([ref_feature_dict[key] for key in keys])
+        current_counts = [np.array([current_feature_dict[i][key] for key in keys]) for i in range(len(current_data))]
+
+    reference_counts = np.append(reference_counts, reference_data.isna().sum())
+    current_counts = [np.append(d, current_data[i].isna().sum()) for i, d in enumerate(current_counts)]
+
+    return reference_counts, current_counts
+
+
+def _fill_zeroes(percents: np.ndarray) -> np.ndarray:
+    eps = 0.0001
+    if (percents == 0).all():
+        np.place(percents, percents == 0, eps)
+    else:
+        min_value = min(percents[percents != 0])
+        if min_value <= eps:
+            np.place(percents, percents == 0, eps)
+        else:
+            np.place(percents, percents == 0, min_value / 10**6)
+    return percents
+
+
+def _get_bin_edges(data: pd.Series, n_bins: int) -> np.ndarray:
+    bins = np.nanquantile(data, np.linspace(0, 1, n_bins + 1))
+    bins[0] = -np.inf
+    bins[-1] = np.inf
+    return bins
+
+
+def _get_unique_not_nan_values_list_from_series(series: list[pd.Series]) -> list:
+    """Get unique values from current and reference series, drop NaNs"""
+    return list(reduce(set.union, (set(s.dropna().unique()) for s in series)))
upgini/utils/sample_utils.py
CHANGED
@@ -1,55 +1,30 @@
-from dataclasses import dataclass, field
 import logging
 import numbers
+from dataclasses import dataclass
 from typing import Callable, List, Optional
+
 import numpy as np
 import pandas as pd
 
-from upgini.metadata import
+from upgini.metadata import (
+    EVAL_SET_INDEX,
+    SYSTEM_RECORD_ID,
+    TARGET,
+    CVType,
+    ModelTaskType,
+)
 from upgini.resource_bundle import ResourceBundle, get_custom_bundle
+from upgini.utils.config import (
+    TS_DEFAULT_HIGH_FREQ_TRUNC_LENGTHS,
+    TS_DEFAULT_LOW_FREQ_TRUNC_LENGTHS,
+    TS_DEFAULT_TIME_UNIT_THRESHOLD,
+    TS_MIN_DIFFERENT_IDS_RATIO,
+    SampleConfig,
+)
 from upgini.utils.target_utils import balance_undersample
 from upgini.utils.ts_utils import get_most_frequent_time_unit, trunc_datetime
 
 
-TS_MIN_DIFFERENT_IDS_RATIO = 0.2
-TS_DEFAULT_HIGH_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=2, months=6), pd.DateOffset(years=2, days=7)]
-TS_DEFAULT_LOW_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=7), pd.DateOffset(years=5)]
-TS_DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)
-FIT_SAMPLE_ROWS_TS = 100_000
-
-BINARY_MIN_SAMPLE_THRESHOLD = 5_000
-MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
-BINARY_BOOTSTRAP_LOOPS = 5
-MULTICLASS_BOOTSTRAP_LOOPS = 2
-
-FIT_SAMPLE_THRESHOLD = 100_000
-FIT_SAMPLE_ROWS = 100_000
-FIT_SAMPLE_ROWS_WITH_EVAL_SET = 100_000
-FIT_SAMPLE_THRESHOLD_WITH_EVAL_SET = 100_000
-
-
-@dataclass
-class SampleConfig:
-    force_sample_size: int = 7000
-    ts_min_different_ids_ratio: float = TS_MIN_DIFFERENT_IDS_RATIO
-    ts_default_high_freq_trunc_lengths: List[pd.DateOffset] = field(
-        default_factory=TS_DEFAULT_HIGH_FREQ_TRUNC_LENGTHS.copy
-    )
-    ts_default_low_freq_trunc_lengths: List[pd.DateOffset] = field(
-        default_factory=TS_DEFAULT_LOW_FREQ_TRUNC_LENGTHS.copy
-    )
-    ts_default_time_unit_threshold: pd.Timedelta = TS_DEFAULT_TIME_UNIT_THRESHOLD
-    binary_min_sample_threshold: int = BINARY_MIN_SAMPLE_THRESHOLD
-    multiclass_min_sample_threshold: int = MULTICLASS_MIN_SAMPLE_THRESHOLD
-    binary_bootstrap_loops: int = BINARY_BOOTSTRAP_LOOPS
-    multiclass_bootstrap_loops: int = MULTICLASS_BOOTSTRAP_LOOPS
-    fit_sample_threshold: int = FIT_SAMPLE_THRESHOLD
-    fit_sample_rows: int = FIT_SAMPLE_ROWS
-    fit_sample_rows_with_eval_set: int = FIT_SAMPLE_ROWS_WITH_EVAL_SET
-    fit_sample_threshold_with_eval_set: int = FIT_SAMPLE_THRESHOLD_WITH_EVAL_SET
-    fit_sample_rows_ts: int = FIT_SAMPLE_ROWS_TS
-
-
 @dataclass
 class SampleColumns:
     date: str
@@ -117,6 +92,22 @@ def sample(
         **kwargs,
     )
 
+    # separate OOT
+    oot_dfs = []
+    other_dfs = []
+    if EVAL_SET_INDEX in df.columns:
+        for eval_set_index in df[EVAL_SET_INDEX].unique():
+            eval_df = df[df[EVAL_SET_INDEX] == eval_set_index]
+            if TARGET in eval_df.columns and eval_df[TARGET].isna().all():
+                oot_dfs.append(eval_df)
+            else:
+                other_dfs.append(eval_df)
+    if len(oot_dfs) > 0:
+        oot_df = pd.concat(oot_dfs, ignore_index=False)
+        df = pd.concat(other_dfs, ignore_index=False)
+    else:
+        oot_df = None
+
     num_samples = _num_samples(df)
     if num_samples > fit_sample_threshold:
         logger.info(
@@ -126,6 +117,18 @@ def sample(
         df = df.sample(n=fit_sample_rows, random_state=random_state)
         logger.info(f"Shape after threshold resampling: {df.shape}")
 
+    if oot_df is not None:
+        num_samples_oot = _num_samples(oot_df)
+        if num_samples_oot > fit_sample_threshold:
+            logger.info(
+                f"OOT has size {num_samples_oot} more than threshold {fit_sample_threshold} "
+                f"and will be downsampled to {fit_sample_rows}"
+            )
+            oot_df = oot_df.sample(n=fit_sample_rows, random_state=random_state)
+        df = pd.concat([df, oot_df], ignore_index=False)
+
+    logger.info(f"Dataset size after downsampling: {len(df)}")
+
     return df
 
 
@@ -175,7 +178,7 @@ def sample_time_series_train_eval(
         )
         if logger is not None:
             logger.info(f"Eval set size: {len(eval_df)}")
-        df = pd.concat([train_df, eval_df])
+        df = pd.concat([train_df, eval_df], ignore_index=False)
 
     elif len(train_df) > max_rows:
         df = sample_time_series_trunc(
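
A rough illustration of the OOT separation added to `sample()` above; column names and data here are hypothetical stand-ins for upgini's `EVAL_SET_INDEX` and `TARGET` column constants:

```python
import numpy as np
import pandas as pd

# An eval_set part whose target is entirely NaN is routed to the OOT bucket;
# parts with at least one label stay in the regular train/eval data.
df = pd.DataFrame({
    "eval_set_index": [0, 0, 1, 1, 2, 2],
    "target": [1.0, 0.0, 1.0, 0.0, np.nan, np.nan],
})

oot_parts, other_parts = [], []
for idx in df["eval_set_index"].unique():
    part = df[df["eval_set_index"] == idx]
    (oot_parts if part["target"].isna().all() else other_parts).append(part)

print(len(pd.concat(oot_parts)), len(pd.concat(other_parts)))  # 2 4
```

Keeping the unlabeled slice apart lets it be downsampled against the same threshold without shrinking the labeled train/eval sample, after which it is re-attached, matching the two blocks added above.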
upgini/utils/target_utils.py
CHANGED
@@ -6,9 +6,14 @@ import pandas as pd
 from pandas.api.types import is_bool_dtype, is_datetime64_any_dtype, is_numeric_dtype
 
 from upgini.errors import ValidationError
-from upgini.metadata import SYSTEM_RECORD_ID, ModelTaskType
-from upgini.resource_bundle import ResourceBundle,
+from upgini.metadata import EVAL_SET_INDEX, SYSTEM_RECORD_ID, TARGET, ModelTaskType
+from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
 from upgini.sampler.random_under_sampler import RandomUnderSampler
+from upgini.utils.config import SampleConfig
+
+MAX_MULTICLASS_CLASS_COUNT = 100
+MIN_TARGET_CLASS_ROWS = 100
+IMBALANCE_THESHOLD = 0.6
 
 
 def prepare_target(y: Union[pd.Series, np.ndarray], target_type: ModelTaskType) -> Union[pd.Series, np.ndarray]:
@@ -106,6 +111,47 @@ def define_task(
     return task
 
 
+def is_imbalanced(
+    data: pd.DataFrame,
+    task_type: ModelTaskType,
+    sample_config: SampleConfig,
+    bundle: ResourceBundle,
+) -> bool:
+    if task_type is None or not task_type.is_classification():
+        return False
+
+    data = data.drop_duplicates(keep="first")
+    columns_without_target = [col for col in data.columns if col != TARGET]
+    data = data.drop_duplicates(subset=columns_without_target, keep=False)
+
+    if task_type == ModelTaskType.BINARY and len(data) <= sample_config.binary_min_sample_threshold:
+        return False
+
+    count = len(data)
+    target = data[TARGET]
+    target_classes_count = target.nunique()
+
+    if target_classes_count > MAX_MULTICLASS_CLASS_COUNT:
+        msg = bundle.get("dataset_to_many_multiclass_targets").format(target_classes_count, MAX_MULTICLASS_CLASS_COUNT)
+        raise ValidationError(msg)
+
+    vc = target.value_counts()
+    min_class_value = vc.index[len(vc) - 1]
+    min_class_count = vc[min_class_value]
+
+    if min_class_count < MIN_TARGET_CLASS_ROWS:
+        msg = bundle.get("dataset_rarest_class_less_min").format(
+            min_class_value, min_class_count, MIN_TARGET_CLASS_ROWS
+        )
+        raise ValidationError(msg)
+
+    min_class_percent = IMBALANCE_THESHOLD / target_classes_count
+    min_class_threshold = min_class_percent * count
+
+    # If min class count less than 30% for binary or (60 / classes_count)% for multiclass
+    return bool(min_class_count < min_class_threshold)
+
+
 def is_int_encoding(unique_values):
     return set(unique_values) == set(range(len(unique_values))) or set(unique_values) == set(
         range(1, len(unique_values) + 1)
@@ -132,6 +178,11 @@ def balance_undersample(
     if SYSTEM_RECORD_ID not in df.columns:
         raise Exception("System record id must be presented for undersampling")
 
+    # Rebalance and send to server only train data
+    # because eval set data will be sent separately in transform for metrics
+    if EVAL_SET_INDEX in df.columns:
+        df = df[df[EVAL_SET_INDEX] == 0]
+
     target = df[target_column].copy()
 
     vc = target.value_counts()
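
The imbalance rule in `is_imbalanced` reduces to simple arithmetic; here is a standalone sketch with hypothetical class counts (the helper below is illustrative, not package API):

```python
IMBALANCE_THESHOLD = 0.6  # constant name as spelled in the module above

def rarest_class_below_threshold(class_counts: dict[int, int]) -> bool:
    # Imbalanced when the rarest class holds less than
    # IMBALANCE_THESHOLD / n_classes of all rows:
    # 30% for binary, (60 / n_classes)% for multiclass.
    total = sum(class_counts.values())
    n_classes = len(class_counts)
    min_count = min(class_counts.values())
    return min_count < (IMBALANCE_THESHOLD / n_classes) * total

print(rarest_class_below_threshold({0: 900, 1: 100}))  # True: 10% < 30%
print(rarest_class_below_threshold({0: 600, 1: 400}))  # False: 40% >= 30%
```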
{upgini-1.2.113a3974.dev1.dist-info → upgini-1.2.114.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.113a3974.dev1
+Version: 1.2.114
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
@@ -38,7 +38,7 @@ Requires-Dist: python-bidi==0.4.2
 Requires-Dist: python-dateutil>=2.8.0
 Requires-Dist: python-json-logger>=3.3.0
 Requires-Dist: requests>=2.8.0
-Requires-Dist: scikit-learn
+Requires-Dist: scikit-learn<1.8.0,>=1.3.0
 Requires-Dist: scipy>=1.10.0
 Requires-Dist: shap>=0.44.0
 Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
@@ -270,9 +270,9 @@ from upgini.metadata import SearchKey
 enricher = FeaturesEnricher(
     search_keys={
         "subscription_activation_date": SearchKey.DATE,
-
-
-
+        "country": SearchKey.COUNTRY,
+        "zip_code": SearchKey.POSTAL_CODE,
+        "hashed_email": SearchKey.HEM,
         "last_visit_ip_address": SearchKey.IP,
         "registered_with_phone": SearchKey.PHONE
     })
@@ -358,9 +358,9 @@ from upgini.metadata import SearchKey
 enricher = FeaturesEnricher(
     search_keys={
         "subscription_activation_date": SearchKey.DATE,
-
-
-
+        "country": SearchKey.COUNTRY,
+        "zip_code": SearchKey.POSTAL_CODE,
+        "hashed_email": SearchKey.HEM,
         "last_visit_ip_address": SearchKey.IP,
         "registered_with_phone": SearchKey.PHONE
     },
@@ -381,7 +381,7 @@ from upgini.metadata import SearchKey
 enricher = FeaturesEnricher(
     search_keys={
         "subscription_activation_date": SearchKey.DATE,
-
+        "zip_code": SearchKey.POSTAL_CODE,
     },
     country_code = "US",
     date_format = "%Y-%d-%m"
@@ -409,8 +409,8 @@ y = train_df["churn_flag"]
 enricher = FeaturesEnricher(
     search_keys={
         "subscription_activation_date": SearchKey.DATE,
-
-
+        "country": SearchKey.COUNTRY,
+        "zip_code": SearchKey.POSTAL_CODE
     })
 
 # everything is ready to fit! For 200k records fitting should take around 10 minutes,
@@ -464,8 +464,8 @@ And then, for `transform` in a production ML pipeline, you'll get enrichment with
 enricher = FeaturesEnricher(
     search_keys={
         "subscription_activation_date": SearchKey.DATE,
-
-
+        "country": SearchKey.COUNTRY,
+        "zip_code": SearchKey.POSTAL_CODE,
     },
 )
 ```
@@ -516,8 +516,8 @@ enricher = FeaturesEnricher(
 If you're working with multivariate time series, you should specify id columns of individual univariate series in `FeaturesEnricher`. For example, if you have a dataset predicting sales for different stores and products, you should specify store and product id columns as follows:
 ```python
 enricher = FeaturesEnricher(
-
-
+    search_keys={
+        "sales_date": SearchKey.DATE,
     },
     id_columns=["store_id", "product_id"],
     cv=CVType.time_series
@@ -733,9 +733,52 @@ enricher.fit(
 )
 ```
 #### ⚠️ Requirements for out-of-time dataset
-- Same data schema as for search initialization dataset
+- Same data schema as for search initialization X dataset
 - Pandas dataframe representation
 
+There are 3 options to pass an out-of-time dataset without labels:
+```python
+enricher.fit(
+    train_ids_and_features,
+    train_label,
+    eval_set = [
+        (eval_ids_and_features_1,), # Just a tuple of 1 element
+        (eval_ids_and_features_2, None), # None as labels
+        (eval_ids_and_features_3, [np.nan] * len(eval_ids_and_features_3)), # List or Series of the same size as eval X
+    ]
+)
+```
+
+### Control feature stability with PSI parameters
+
+`FeaturesEnricher` supports Population Stability Index (PSI) calculation on eval_set to evaluate feature stability over time. You can control this behavior using the stability parameters of the `fit` and `fit_transform` methods:
+
+```python
+enricher = FeaturesEnricher(
+    search_keys={"registration_date": SearchKey.DATE}
+)
+
+# Control feature stability during fit
+enricher.fit(
+    X, y,
+    stability_threshold=0.2, # PSI threshold: features with PSI above this value will be dropped
+    stability_agg_func="max" # Aggregation function for stability values: "max", "min", "mean"
+)
+
+# Same parameters work for fit_transform
+enriched_df = enricher.fit_transform(
+    X, y,
+    stability_threshold=0.1, # Stricter threshold for more stable features
+    stability_agg_func="mean" # Use mean aggregation instead of max
+)
+```
+
+**Stability parameters:**
+- `stability_threshold` (float, default=0.2): PSI threshold value. Features with PSI at or above this threshold will be excluded from the final feature set. Lower values mean stricter stability requirements.
+- `stability_agg_func` (str, default="max"): Function to aggregate PSI values across time intervals. Options: "max" (most conservative), "min" (least conservative), "mean" (balanced approach).
+
+**PSI (Population Stability Index)** measures how much a feature's distribution changes over time. Lower PSI values indicate more stable features, which are generally more reliable for production ML models.
+
 ### Use custom loss function in feature selection & metrics calculation
 
 `FeaturesEnricher` can be initialized with additional string parameter `loss`.
@@ -756,20 +799,6 @@ enricher = FeaturesEnricher(
 enriched_dataframe.fit(X, y)
 ```
 
-### Return initial dataframe enriched with TOP external features by importance
-
-`transform` and `fit_transform` methods of `FeaturesEnricher` can be used with two additional parameters:
-- `importance_threshold`: float = 0 - only features with *importance >= threshold* will be added to the output dataframe
-- `max_features`: int - only first TOP N features by importance will be returned, where *N = max_features*
-
-And `keep_input=True` will keep all initial columns from search dataset X:
-```python
-enricher = FeaturesEnricher(
-    search_keys={"subscription_activation_date": SearchKey.DATE}
-)
-enriched_dataframe.fit_transform(X, y, keep_input=True, max_features=2)
-```
-
 ### Exclude premium data sources from fit, transform and metrics calculation
 
 `fit`, `fit_transform`, `transform` and `calculate_metrics` methods of `FeaturesEnricher` accept a parameter `exclude_features_sources` that allows excluding Trial or Paid features from Premium data sources:
@@ -797,7 +826,7 @@ enricher = FeaturesEnricher(
 enricher.fit(X, y)
 ```
 
-
+### Turn off removing of target outliers
 Upgini detects rows with target outliers for regression tasks. By default such rows are dropped on metrics calculation. To turn off removing of target outlier rows, use the parameter `remove_outliers_calc_metrics=False` in the fit, fit_transform or calculate_metrics methods:
 
 ```python
@@ -808,7 +837,7 @@ enricher = FeaturesEnricher(
 enricher.fit(X, y, remove_outliers_calc_metrics=False)
 ```
 
-
+### Turn off generating features on search keys
 Upgini tries to generate features on email, date and datetime search keys. By default this generation is enabled. To disable it, use the parameter `generate_search_key_features` of the FeaturesEnricher constructor:
 
 ```python
@@ -816,6 +845,7 @@ enricher = FeaturesEnricher(
     search_keys={"date": SearchKey.DATE},
     generate_search_key_features=False,
 )
+```
 
 ## 🔑 Open up all capabilities of Upgini
 