upgini 1.2.90__py3-none-any.whl → 1.2.91a3884.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/metadata.py CHANGED
@@ -159,6 +159,9 @@ class ModelTaskType(Enum):
     REGRESSION = "REGRESSION"
     TIMESERIES = "TIMESERIES"

+    def is_classification(self) -> bool:
+        return self in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]
+

 class ModelLabelType(Enum):
     GINI = "gini"
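Note: the new `is_classification()` helper centralizes the BINARY/MULTICLASS check that the sampling code added further below relies on. A minimal usage sketch (the surrounding print statements are illustrative, not part of this diff):

```python
from upgini.metadata import ModelTaskType

task_type = ModelTaskType.MULTICLASS
if task_type.is_classification():
    # BINARY and MULTICLASS tasks take the class-balancing path
    print("classification task")
else:
    # REGRESSION and TIMESERIES tasks skip re-balancing
    print("non-classification task")
```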
upgini/metrics.py CHANGED
@@ -332,7 +332,7 @@ class EstimatorWrapper:
         self.groups = groups
         self.text_features = text_features
         self.logger = logger or logging.getLogger()
-        self.droped_features = []
+        self.dropped_features = []
         self.converted_to_int = []
         self.converted_to_str = []
         self.converted_to_numeric = []
@@ -381,10 +381,11 @@ class EstimatorWrapper:
         x, y, groups = self._prepare_data(x, y, groups=self.groups)

         self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
-        self.droped_features = []
+        self.dropped_features = []
         self.converted_to_int = []
         self.converted_to_str = []
         self.converted_to_numeric = []
+
         for c in x.columns:

             if _get_unique_count(x[c]) < 2:
@@ -392,7 +393,7 @@ class EstimatorWrapper:
                 if c in self.cat_features:
                     self.cat_features.remove(c)
                 x.drop(columns=[c], inplace=True)
-                self.droped_features.append(c)
+                self.dropped_features.append(c)
             elif self.text_features is not None and c in self.text_features:
                 x[c] = x[c].astype(str)
                 self.converted_to_str.append(c)
@@ -427,16 +428,16 @@ class EstimatorWrapper:
                 except (ValueError, TypeError):
                     self.logger.warning(f"Remove feature {c} because it is not numeric and not in cat_features")
                     x.drop(columns=[c], inplace=True)
-                    self.droped_features.append(c)
+                    self.dropped_features.append(c)

         return x, y, groups, {}

     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
         x, y, _ = self._prepare_data(x, y)

-        if self.droped_features:
-            self.logger.info(f"Drop features on calculate metrics: {self.droped_features}")
-            x = x.drop(columns=self.droped_features)
+        if self.dropped_features:
+            self.logger.info(f"Drop features on calculate metrics: {self.dropped_features}")
+            x = x.drop(columns=self.dropped_features)

         if self.converted_to_int:
             self.logger.info(f"Convert to int features on calculate metrics: {self.converted_to_int}")
@@ -797,7 +798,7 @@ class CatBoostWrapper(EstimatorWrapper):
             )
             for f in high_cardinality_features:
                 self.text_features.remove(f)
-                self.droped_features.append(f)
+                self.dropped_features.append(f)
                 x = x.drop(columns=f, errors="ignore")
             return super().cross_val_predict(x, y, baseline_score_column)
         else:
@@ -897,7 +898,7 @@ class LightGBMWrapper(EstimatorWrapper):
         for c in x.columns:
             if x[c].dtype not in ["category", "int64", "float64", "bool"]:
                 self.logger.warning(f"Feature {c} is not numeric and will be dropped")
-                self.droped_features.append(c)
+                self.dropped_features.append(c)
                 x = x.drop(columns=c, errors="ignore")
         return x, y_numpy, groups, params

@@ -988,7 +989,7 @@ class OtherEstimatorWrapper(EstimatorWrapper):
         for c in x.columns:
             if x[c].dtype not in ["category", "int64", "float64", "bool"]:
                 self.logger.warning(f"Feature {c} is not numeric and will be dropped")
-                self.droped_features.append(c)
+                self.dropped_features.append(c)
                 x = x.drop(columns=c, errors="ignore")
         return x, y_numpy, groups, params

@@ -144,6 +144,7 @@ baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and an
 missing_features_for_transform=Missing some features for transform that were presented on fit: {}
 missing_target_for_transform=Search contains features on target. Please add y to the call and try again
 missing_id_column=Id column {} not found in X: {}
+unknown_id_column_value_in_eval_set=Unknown values in id columns: {}
 # target validation
 empty_target=Target is empty in all rows
 # non_numeric_target=Binary target should be numerical type
@@ -0,0 +1,348 @@
+from dataclasses import dataclass, field
+import logging
+import numbers
+from typing import Callable, List, Optional
+import numpy as np
+import pandas as pd
+
+from upgini.metadata import SYSTEM_RECORD_ID, CVType, ModelTaskType
+from upgini.resource_bundle import ResourceBundle, get_custom_bundle
+from upgini.utils.target_utils import balance_undersample
+from upgini.utils.ts_utils import get_most_frequent_time_unit, trunc_datetime
+
+
+TS_MIN_DIFFERENT_IDS_RATIO = 0.2
+TS_DEFAULT_HIGH_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=2, months=6), pd.DateOffset(years=2, days=7)]
+TS_DEFAULT_LOW_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=7), pd.DateOffset(years=5)]
+TS_DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)
+FIT_SAMPLE_ROWS_TS = 54_000
+
+BINARY_MIN_SAMPLE_THRESHOLD = 5_000
+MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
+BINARY_BOOTSTRAP_LOOPS = 5
+MULTICLASS_BOOTSTRAP_LOOPS = 2
+
+FIT_SAMPLE_THRESHOLD = 200_000
+FIT_SAMPLE_ROWS = 200_000
+FIT_SAMPLE_ROWS_WITH_EVAL_SET = 200_000
+FIT_SAMPLE_THRESHOLD_WITH_EVAL_SET = 200_000
+
+
+@dataclass
+class SampleConfig:
+    force_sample_size: int = 7000
+    ts_min_different_ids_ratio: float = TS_MIN_DIFFERENT_IDS_RATIO
+    ts_default_high_freq_trunc_lengths: List[pd.DateOffset] = field(
+        default_factory=TS_DEFAULT_HIGH_FREQ_TRUNC_LENGTHS.copy
+    )
+    ts_default_low_freq_trunc_lengths: List[pd.DateOffset] = field(
+        default_factory=TS_DEFAULT_LOW_FREQ_TRUNC_LENGTHS.copy
+    )
+    ts_default_time_unit_threshold: pd.Timedelta = TS_DEFAULT_TIME_UNIT_THRESHOLD
+    binary_min_sample_threshold: int = BINARY_MIN_SAMPLE_THRESHOLD
+    multiclass_min_sample_threshold: int = MULTICLASS_MIN_SAMPLE_THRESHOLD
+    binary_bootstrap_loops: int = BINARY_BOOTSTRAP_LOOPS
+    multiclass_bootstrap_loops: int = MULTICLASS_BOOTSTRAP_LOOPS
+    fit_sample_threshold: int = FIT_SAMPLE_THRESHOLD
+    fit_sample_rows: int = FIT_SAMPLE_ROWS
+    fit_sample_rows_with_eval_set: int = FIT_SAMPLE_ROWS_WITH_EVAL_SET
+    fit_sample_threshold_with_eval_set: int = FIT_SAMPLE_THRESHOLD_WITH_EVAL_SET
+    fit_sample_rows_ts: int = FIT_SAMPLE_ROWS_TS
+
+
+@dataclass
+class SampleColumns:
+    date: str
+    target: str
+    ids: Optional[List[str]] = None
+    eval_set_index: Optional[str] = None
+
+
+def sample(
+    df: pd.DataFrame,
+    task_type: Optional[ModelTaskType],
+    cv_type: Optional[CVType],
+    sample_config: SampleConfig,
+    sample_columns: SampleColumns,
+    random_state: int = 42,
+    balance: bool = True,
+    force_downsampling: bool = False,
+    logger: Optional[logging.Logger] = None,
+    **kwargs,
+) -> pd.DataFrame:
+    if force_downsampling:
+        return balance_undersample_forced(
+            df,
+            sample_columns.target,
+            sample_columns.ids,
+            sample_columns.date,
+            task_type,
+            cv_type,
+            random_state,
+            sample_config.force_sample_size,
+            logger=logger,
+            **kwargs,
+        )
+
+    if sample_columns.eval_set_index in df.columns:
+        fit_sample_threshold = sample_config.fit_sample_threshold_with_eval_set
+        fit_sample_rows = sample_config.fit_sample_rows_with_eval_set
+    else:
+        fit_sample_threshold = sample_config.fit_sample_threshold
+        fit_sample_rows = sample_config.fit_sample_rows
+
+    if cv_type is not None and cv_type.is_time_series():
+        return sample_time_series_trunc(
+            df,
+            sample_columns.ids,
+            sample_columns.date,
+            sample_config.fit_sample_rows_ts,
+            random_state,
+            logger=logger,
+            **kwargs,
+        )
+
+    if task_type is not None and task_type.is_classification() and balance:
+        df = balance_undersample(
+            df=df,
+            target_column=sample_columns.target,
+            task_type=task_type,
+            random_state=random_state,
+            binary_min_sample_threshold=sample_config.binary_min_sample_threshold,
+            multiclass_min_sample_threshold=sample_config.multiclass_min_sample_threshold,
+            binary_bootstrap_loops=sample_config.binary_bootstrap_loops,
+            multiclass_bootstrap_loops=sample_config.multiclass_bootstrap_loops,
+            logger=logger,
+            **kwargs,
+        )
+
+    num_samples = _num_samples(df)
+    if num_samples > fit_sample_threshold:
+        logger.info(
+            f"Etalon has size {num_samples} more than threshold {fit_sample_threshold} "
+            f"and will be downsampled to {fit_sample_rows}"
+        )
+        df = df.sample(n=fit_sample_rows, random_state=random_state)
+        logger.info(f"Shape after threshold resampling: {df.shape}")
+
+    return df
+
+
+def sample_time_series_trunc(
+    df: pd.DataFrame,
+    id_columns: Optional[List[str]],
+    date_column: str,
+    sample_size: int,
+    random_state: int = 42,
+    logger: Optional[logging.Logger] = None,
+    highfreq_trunc_lengths: List[pd.DateOffset] = TS_DEFAULT_HIGH_FREQ_TRUNC_LENGTHS,
+    lowfreq_trunc_lengths: List[pd.DateOffset] = TS_DEFAULT_LOW_FREQ_TRUNC_LENGTHS,
+    time_unit_threshold: pd.Timedelta = TS_DEFAULT_TIME_UNIT_THRESHOLD,
+    **kwargs,
+):
+    if id_columns is None:
+        id_columns = []
+    # Convert date column to datetime
+    dates_df = df[id_columns + [date_column]].copy().reset_index(drop=True)
+    if pd.api.types.is_numeric_dtype(dates_df[date_column]):
+        dates_df[date_column] = pd.to_datetime(dates_df[date_column], unit="ms")
+    else:
+        dates_df[date_column] = pd.to_datetime(dates_df[date_column])
+
+    time_unit = get_most_frequent_time_unit(dates_df, id_columns, date_column)
+    if logger is not None:
+        logger.info(f"Time unit: {time_unit}")
+
+    if time_unit is None:
+        if logger is not None:
+            logger.info("Cannot detect time unit, returning original dataset")
+        return df
+
+    if time_unit < time_unit_threshold:
+        for trunc_length in highfreq_trunc_lengths:
+            sampled_df = trunc_datetime(dates_df, id_columns, date_column, trunc_length, logger=logger)
+            if len(sampled_df) <= sample_size:
+                break
+        if len(sampled_df) > sample_size:
+            sampled_df = sample_time_series(
+                sampled_df, id_columns, date_column, sample_size, random_state, logger=logger, **kwargs
+            )
+    else:
+        for trunc_length in lowfreq_trunc_lengths:
+            sampled_df = trunc_datetime(dates_df, id_columns, date_column, trunc_length, logger=logger)
+            if len(sampled_df) <= sample_size:
+                break
+        if len(sampled_df) > sample_size:
+            sampled_df = sample_time_series(
+                sampled_df, id_columns, date_column, sample_size, random_state, logger=logger, **kwargs
+            )
+
+    return df.iloc[sampled_df.index]
+
+
+def sample_time_series(
+    df: pd.DataFrame,
+    id_columns: List[str],
+    date_column: str,
+    sample_size: int,
+    random_state: int = 42,
+    min_different_ids_ratio: float = TS_MIN_DIFFERENT_IDS_RATIO,
+    prefer_recent_dates: bool = True,
+    logger: Optional[logging.Logger] = None,
+):
+    def ensure_tuple(x):
+        return tuple([x]) if not isinstance(x, tuple) else x
+
+    random_state = np.random.RandomState(random_state)
+
+    if not id_columns:
+        id_columns = [date_column]
+    ids_sort = df.groupby(id_columns)[date_column].aggregate(["max", "count"]).T.to_dict()
+    ids_sort = {
+        ensure_tuple(k): (
+            (v["max"], v["count"], random_state.rand()) if prefer_recent_dates else (v["count"], random_state.rand())
+        )
+        for k, v in ids_sort.items()
+    }
+    id_counts = df[id_columns].value_counts()
+    id_counts.index = [ensure_tuple(i) for i in id_counts.index]
+    id_counts = id_counts.sort_index(key=lambda x: [ids_sort[y] for y in x], ascending=False).cumsum()
+    id_counts = id_counts[id_counts <= sample_size]
+    min_different_ids = max(int(len(df[id_columns].drop_duplicates()) * min_different_ids_ratio), 1)
+
+    def id_mask(sample_index: pd.Index) -> pd.Index:
+        if isinstance(sample_index, pd.MultiIndex):
+            return pd.MultiIndex.from_frame(df[id_columns]).isin(sample_index)
+        else:
+            return df[id_columns[0]].isin(sample_index)
+
+    if len(id_counts) < min_different_ids:
+        if logger is not None:
+            logger.info(
+                f"Different ids count {len(id_counts)} for sample size {sample_size}"
+                f" is less than min different ids {min_different_ids}, sampling time window"
+            )
+        date_counts = df.groupby(id_columns)[date_column].nunique().sort_values(ascending=False)
+        ids_to_sample = date_counts.index[:min_different_ids] if len(id_counts) > 0 else date_counts.index
+        mask = id_mask(ids_to_sample)
+        df = df[mask]
+        sample_date_counts = df[date_column].value_counts().sort_index(ascending=False).cumsum()
+        sample_date_counts = sample_date_counts[sample_date_counts <= sample_size]
+        df = df[df[date_column].isin(sample_date_counts.index)]
+    else:
+        if len(id_columns) > 1:
+            id_counts.index = pd.MultiIndex.from_tuples(id_counts.index)
+        else:
+            id_counts.index = [i[0] for i in id_counts.index]
+        mask = id_mask(id_counts.index)
+        df = df[mask]
+
+    return df
+
+
+def balance_undersample_forced(
+    df: pd.DataFrame,
+    target_column: str,
+    id_columns: Optional[List[str]],
+    date_column: str,
+    task_type: ModelTaskType,
+    cv_type: Optional[CVType],
+    random_state: int,
+    sample_size: int = 7000,
+    logger: Optional[logging.Logger] = None,
+    bundle: Optional[ResourceBundle] = None,
+    warning_callback: Optional[Callable] = None,
+):
+    if len(df) <= sample_size:
+        return df
+
+    if logger is None:
+        logger = logging.getLogger("muted_logger")
+        logger.setLevel("FATAL")
+    bundle = bundle or get_custom_bundle()
+    if SYSTEM_RECORD_ID not in df.columns:
+        raise Exception("System record id must be presented for undersampling")
+
+    msg = bundle.get("forced_balance_undersample")
+    logger.info(msg)
+    if warning_callback is not None:
+        warning_callback(msg)
+
+    target = df[target_column].copy()
+
+    vc = target.value_counts()
+    max_class_value = vc.index[0]
+    min_class_value = vc.index[len(vc) - 1]
+    max_class_count = vc[max_class_value]
+    min_class_count = vc[min_class_value]
+
+    resampled_data = df
+    df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
+    if cv_type is not None and cv_type.is_time_series():
+        logger.warning(f"Sampling time series dataset from {len(df)} to {sample_size}")
+        resampled_data = sample_time_series_trunc(
+            df,
+            id_columns=id_columns,
+            date_column=date_column,
+            sample_size=sample_size,
+            random_state=random_state,
+            logger=logger,
+        )
+    elif task_type in [ModelTaskType.MULTICLASS, ModelTaskType.REGRESSION]:
+        logger.warning(f"Sampling dataset from {len(df)} to {sample_size}")
+        resampled_data = df.sample(n=sample_size, random_state=random_state)
+    else:
+        msg = bundle.get("imbalanced_target").format(min_class_value, min_class_count)
+        logger.warning(msg)
+
+        # fill up to min_sample_threshold by majority class
+        minority_class = df[df[target_column] == min_class_value]
+        majority_class = df[df[target_column] != min_class_value]
+        logger.info(
+            f"Min class count: {min_class_count}. Max class count: {max_class_count}."
+            f" Rebalance sample size: {sample_size}"
+        )
+        if len(minority_class) > (sample_size / 2):
+            sampled_minority_class = minority_class.sample(n=int(sample_size / 2), random_state=random_state)
+        else:
+            sampled_minority_class = minority_class
+
+        if len(majority_class) > (sample_size) / 2:
+            sampled_majority_class = majority_class.sample(n=int(sample_size / 2), random_state=random_state)
+
+        resampled_data = df[
+            (df[SYSTEM_RECORD_ID].isin(sampled_minority_class[SYSTEM_RECORD_ID]))
+            | (df[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
+        ]
+
+    logger.info(f"Shape after forced rebalance resampling: {resampled_data}")
+    return resampled_data
+
+
+def _num_samples(x):
+    """Return number of samples in array-like x."""
+    if x is None:
+        return 0
+    message = "Expected sequence or array-like, got %s" % type(x)
+    if hasattr(x, "fit") and callable(x.fit):
+        # Don't get num_samples from an ensembles length!
+        raise TypeError(message)
+
+    if not hasattr(x, "__len__") and not hasattr(x, "shape"):
+        if hasattr(x, "__array__"):
+            x = np.asarray(x)
+        else:
+            raise TypeError(message)
+
+    if hasattr(x, "shape") and x.shape is not None:
+        if len(x.shape) == 0:
+            raise TypeError("Singleton array %r cannot be considered a valid collection." % x)
+        # Check that shape is returning an integer or default to len
+        # Dask dataframes may not return numeric shape[0] value
+        if isinstance(x.shape[0], numbers.Integral):
+            return x.shape[0]
+
+    try:
+        return len(x)
+    except TypeError as type_error:
+        raise TypeError(message) from type_error
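Note: the hunk above adds a new sampling module (the diff view does not show its file path). It pulls the forced-undersampling and time-series truncation helpers out of what appears to be `upgini/utils/target_utils.py` (see the removals below) and exposes them behind a single `sample()` entry point configured via `SampleConfig` and `SampleColumns`. A hedged usage sketch; the import path `upgini.utils.sample_utils` is an assumption, and a logger is passed explicitly because `sample()` logs when it downsamples:

```python
import logging

import numpy as np
import pandas as pd

from upgini.metadata import ModelTaskType
from upgini.utils.sample_utils import SampleColumns, SampleConfig, sample  # assumed module path

# 300_000-row toy frame: above the default 200_000-row fit threshold
df = pd.DataFrame(
    {
        "date": pd.date_range("2020-01-01", periods=1000, freq="D").repeat(300),
        "target": np.random.default_rng(0).integers(0, 2, 300_000),
    }
)

sampled = sample(
    df,
    task_type=ModelTaskType.REGRESSION,  # non-classification: no class re-balancing
    cv_type=None,                        # not time-series CV, so no truncation path
    sample_config=SampleConfig(),        # default thresholds (fit_sample_rows=200_000)
    sample_columns=SampleColumns(date="date", target="target"),
    logger=logging.getLogger("upgini"),
)
print(len(sampled))  # capped at fit_sample_rows (200_000)
```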
@@ -1,17 +1,14 @@
 import logging
-from typing import Callable, List, Optional, Union
+from typing import Callable, Optional, Union

 import numpy as np
 import pandas as pd
 from pandas.api.types import is_bool_dtype, is_datetime64_any_dtype, is_numeric_dtype

 from upgini.errors import ValidationError
-from upgini.metadata import SYSTEM_RECORD_ID, CVType, ModelTaskType
-from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
+from upgini.metadata import SYSTEM_RECORD_ID, ModelTaskType
+from upgini.resource_bundle import ResourceBundle, get_custom_bundle, bundle
 from upgini.sampler.random_under_sampler import RandomUnderSampler
-from upgini.utils.ts_utils import get_most_frequent_time_unit, trunc_datetime
-
-TS_MIN_DIFFERENT_IDS_RATIO = 0.2


 def prepare_target(y: Union[pd.Series, np.ndarray], target_type: ModelTaskType) -> Union[pd.Series, np.ndarray]:
@@ -204,199 +201,6 @@ def balance_undersample(
     return resampled_data


-def balance_undersample_forced(
-    df: pd.DataFrame,
-    target_column: str,
-    id_columns: Optional[List[str]],
-    date_column: str,
-    task_type: ModelTaskType,
-    cv_type: Optional[CVType],
-    random_state: int,
-    sample_size: int = 7000,
-    logger: Optional[logging.Logger] = None,
-    bundle: Optional[ResourceBundle] = None,
-    warning_callback: Optional[Callable] = None,
-):
-    if len(df) <= sample_size:
-        return df
-
-    if logger is None:
-        logger = logging.getLogger("muted_logger")
-        logger.setLevel("FATAL")
-    bundle = bundle or get_custom_bundle()
-    if SYSTEM_RECORD_ID not in df.columns:
-        raise Exception("System record id must be presented for undersampling")
-
-    msg = bundle.get("forced_balance_undersample")
-    logger.info(msg)
-    if warning_callback is not None:
-        warning_callback(msg)
-
-    target = df[target_column].copy()
-
-    vc = target.value_counts()
-    max_class_value = vc.index[0]
-    min_class_value = vc.index[len(vc) - 1]
-    max_class_count = vc[max_class_value]
-    min_class_count = vc[min_class_value]
-
-    resampled_data = df
-    df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
-    if cv_type is not None and cv_type.is_time_series():
-        logger.warning(f"Sampling time series dataset from {len(df)} to {sample_size}")
-        resampled_data = balance_undersample_time_series_trunc(
-            df,
-            id_columns=id_columns,
-            date_column=date_column,
-            sample_size=sample_size,
-            random_state=random_state,
-            logger=logger,
-        )
-    elif task_type in [ModelTaskType.MULTICLASS, ModelTaskType.REGRESSION]:
-        logger.warning(f"Sampling dataset from {len(df)} to {sample_size}")
-        resampled_data = df.sample(n=sample_size, random_state=random_state)
-    else:
-        msg = bundle.get("imbalanced_target").format(min_class_value, min_class_count)
-        logger.warning(msg)
-
-        # fill up to min_sample_threshold by majority class
-        minority_class = df[df[target_column] == min_class_value]
-        majority_class = df[df[target_column] != min_class_value]
-        logger.info(
-            f"Min class count: {min_class_count}. Max class count: {max_class_count}."
-            f" Rebalance sample size: {sample_size}"
-        )
-        if len(minority_class) > (sample_size / 2):
-            sampled_minority_class = minority_class.sample(n=int(sample_size / 2), random_state=random_state)
-        else:
-            sampled_minority_class = minority_class
-
-        if len(majority_class) > (sample_size) / 2:
-            sampled_majority_class = majority_class.sample(n=int(sample_size / 2), random_state=random_state)
-
-        resampled_data = df[
-            (df[SYSTEM_RECORD_ID].isin(sampled_minority_class[SYSTEM_RECORD_ID]))
-            | (df[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
-        ]
-
-    logger.info(f"Shape after forced rebalance resampling: {resampled_data}")
-    return resampled_data
-
-
-DEFAULT_HIGH_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=2, months=6), pd.DateOffset(years=2, days=7)]
-DEFAULT_LOW_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=7), pd.DateOffset(years=5)]
-DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)
-
-
-def balance_undersample_time_series_trunc(
-    df: pd.DataFrame,
-    id_columns: Optional[List[str]],
-    date_column: str,
-    sample_size: int,
-    random_state: int = 42,
-    logger: Optional[logging.Logger] = None,
-    highfreq_trunc_lengths: List[pd.DateOffset] = DEFAULT_HIGH_FREQ_TRUNC_LENGTHS,
-    lowfreq_trunc_lengths: List[pd.DateOffset] = DEFAULT_LOW_FREQ_TRUNC_LENGTHS,
-    time_unit_threshold: pd.Timedelta = DEFAULT_TIME_UNIT_THRESHOLD,
-    **kwargs,
-):
-    if id_columns is None:
-        id_columns = []
-    # Convert date column to datetime
-    dates_df = df[id_columns + [date_column]].copy()
-    dates_df[date_column] = pd.to_datetime(dates_df[date_column], unit="ms")
-
-    time_unit = get_most_frequent_time_unit(dates_df, id_columns, date_column)
-    if logger is not None:
-        logger.info(f"Time unit: {time_unit}")
-
-    if time_unit is None:
-        if logger is not None:
-            logger.info("Cannot detect time unit, returning original dataset")
-        return df
-
-    if time_unit < time_unit_threshold:
-        for trunc_length in highfreq_trunc_lengths:
-            sampled_df = trunc_datetime(dates_df, id_columns, date_column, trunc_length, logger=logger)
-            if len(sampled_df) <= sample_size:
-                break
-        if len(sampled_df) > sample_size:
-            sampled_df = balance_undersample_time_series(
-                sampled_df, id_columns, date_column, sample_size, random_state, logger=logger, **kwargs
-            )
-    else:
-        for trunc_length in lowfreq_trunc_lengths:
-            sampled_df = trunc_datetime(dates_df, id_columns, date_column, trunc_length, logger=logger)
-            if len(sampled_df) <= sample_size:
-                break
-        if len(sampled_df) > sample_size:
-            sampled_df = balance_undersample_time_series(
-                sampled_df, id_columns, date_column, sample_size, random_state, logger=logger, **kwargs
-            )
-
-    return df.loc[sampled_df.index]
-
-
-def balance_undersample_time_series(
-    df: pd.DataFrame,
-    id_columns: List[str],
-    date_column: str,
-    sample_size: int,
-    random_state: int = 42,
-    min_different_ids_ratio: float = TS_MIN_DIFFERENT_IDS_RATIO,
-    prefer_recent_dates: bool = True,
-    logger: Optional[logging.Logger] = None,
-):
-    def ensure_tuple(x):
-        return tuple([x]) if not isinstance(x, tuple) else x
-
-    random_state = np.random.RandomState(random_state)
-
-    if not id_columns:
-        id_columns = [date_column]
-    ids_sort = df.groupby(id_columns)[date_column].aggregate(["max", "count"]).T.to_dict()
-    ids_sort = {
-        ensure_tuple(k): (
-            (v["max"], v["count"], random_state.rand()) if prefer_recent_dates else (v["count"], random_state.rand())
-        )
-        for k, v in ids_sort.items()
-    }
-    id_counts = df[id_columns].value_counts()
-    id_counts.index = [ensure_tuple(i) for i in id_counts.index]
-    id_counts = id_counts.sort_index(key=lambda x: [ids_sort[y] for y in x], ascending=False).cumsum()
-    id_counts = id_counts[id_counts <= sample_size]
-    min_different_ids = max(int(len(df[id_columns].drop_duplicates()) * min_different_ids_ratio), 1)
-
-    def id_mask(sample_index: pd.Index) -> pd.Index:
-        if isinstance(sample_index, pd.MultiIndex):
-            return pd.MultiIndex.from_frame(df[id_columns]).isin(sample_index)
-        else:
-            return df[id_columns[0]].isin(sample_index)
-
-    if len(id_counts) < min_different_ids:
-        if logger is not None:
-            logger.info(
-                f"Different ids count {len(id_counts)} for sample size {sample_size}"
-                f" is less than min different ids {min_different_ids}, sampling time window"
-            )
-        date_counts = df.groupby(id_columns)[date_column].nunique().sort_values(ascending=False)
-        ids_to_sample = date_counts.index[:min_different_ids] if len(id_counts) > 0 else date_counts.index
-        mask = id_mask(ids_to_sample)
-        df = df[mask]
-        sample_date_counts = df[date_column].value_counts().sort_index(ascending=False).cumsum()
-        sample_date_counts = sample_date_counts[sample_date_counts <= sample_size]
-        df = df[df[date_column].isin(sample_date_counts.index)]
-    else:
-        if len(id_columns) > 1:
-            id_counts.index = pd.MultiIndex.from_tuples(id_counts.index)
-        else:
-            id_counts.index = [i[0] for i in id_counts.index]
-        mask = id_mask(id_counts.index)
-        df = df[mask]
-
-    return df
-
-
 def calculate_psi(expected: pd.Series, actual: pd.Series) -> Union[float, Exception]:
     try:
         df = pd.concat([expected, actual])
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.90
+Version: 1.2.91a3884.dev1
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
@@ -808,6 +808,15 @@ enricher = FeaturesEnricher(
 enricher.fit(X, y, remove_outliers_calc_metrics=False)
 ```

+## Turn off generating features on search keys
+Upgini tries to generate features on email, date and datetime search keys. By default this generation is enabled. To disable it use parameter `generate_search_key_features` of FeaturesEnricher constructor:
+
+```python
+enricher = FeaturesEnricher(
+    search_keys={"date": SearchKey.DATE},
+    generate_search_key_features=False,
+)
+
 ## 🔑 Open up all capabilities of Upgini

 [Register](https://profile.upgini.com) and get a free API key for exclusive data sources and features: 600 mln+ phone numbers, 350 mln+ emails, 2^32 IP addresses