upgini 1.2.91a3884.dev4__py3-none-any.whl → 1.2.91a3906.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,414 +0,0 @@
- import logging
- import numbers
- from dataclasses import dataclass, field
- from typing import Callable, List, Optional
-
- import numpy as np
- import pandas as pd
-
- from upgini.metadata import SYSTEM_RECORD_ID, CVType, ModelTaskType
- from upgini.resource_bundle import ResourceBundle, get_custom_bundle
- from upgini.utils.target_utils import balance_undersample
- from upgini.utils.ts_utils import get_most_frequent_time_unit, trunc_datetime
-
-
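- # Default sampling thresholds: time-series data is truncated by date window,
- # classification data is class-balanced, everything else is row-sampled.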
- TS_MIN_DIFFERENT_IDS_RATIO = 0.2
- TS_DEFAULT_HIGH_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=2, months=6), pd.DateOffset(years=2, days=7)]
- TS_DEFAULT_LOW_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=7), pd.DateOffset(years=5)]
- TS_DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)
- FIT_SAMPLE_ROWS_TS = 54_000
-
- BINARY_MIN_SAMPLE_THRESHOLD = 5_000
- MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
- BINARY_BOOTSTRAP_LOOPS = 5
- MULTICLASS_BOOTSTRAP_LOOPS = 2
-
- FIT_SAMPLE_THRESHOLD = 200_000
- FIT_SAMPLE_ROWS = 200_000
- FIT_SAMPLE_ROWS_WITH_EVAL_SET = 200_000
- FIT_SAMPLE_THRESHOLD_WITH_EVAL_SET = 200_000
-
-
- @dataclass
- class SampleConfig:
-     force_sample_size: int = 7000
-     ts_min_different_ids_ratio: float = TS_MIN_DIFFERENT_IDS_RATIO
-     ts_default_high_freq_trunc_lengths: List[pd.DateOffset] = field(
-         default_factory=TS_DEFAULT_HIGH_FREQ_TRUNC_LENGTHS.copy
-     )
-     ts_default_low_freq_trunc_lengths: List[pd.DateOffset] = field(
-         default_factory=TS_DEFAULT_LOW_FREQ_TRUNC_LENGTHS.copy
-     )
-     ts_default_time_unit_threshold: pd.Timedelta = TS_DEFAULT_TIME_UNIT_THRESHOLD
-     binary_min_sample_threshold: int = BINARY_MIN_SAMPLE_THRESHOLD
-     multiclass_min_sample_threshold: int = MULTICLASS_MIN_SAMPLE_THRESHOLD
-     binary_bootstrap_loops: int = BINARY_BOOTSTRAP_LOOPS
-     multiclass_bootstrap_loops: int = MULTICLASS_BOOTSTRAP_LOOPS
-     fit_sample_threshold: int = FIT_SAMPLE_THRESHOLD
-     fit_sample_rows: int = FIT_SAMPLE_ROWS
-     fit_sample_rows_with_eval_set: int = FIT_SAMPLE_ROWS_WITH_EVAL_SET
-     fit_sample_threshold_with_eval_set: int = FIT_SAMPLE_THRESHOLD_WITH_EVAL_SET
-     fit_sample_rows_ts: int = FIT_SAMPLE_ROWS_TS
-
-
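- # Names of the service columns the sampling routines operate on.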
- @dataclass
- class SampleColumns:
-     date: str
-     target: str
-     ids: Optional[List[str]] = None
-     eval_set_index: Optional[str] = None
-
-
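- # Entry point: picks a sampling strategy based on CV type, task type and
- # whether an eval set is present in the dataframe.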
- def sample(
-     df: pd.DataFrame,
-     task_type: Optional[ModelTaskType],
-     cv_type: Optional[CVType],
-     sample_config: SampleConfig,
-     sample_columns: SampleColumns,
-     random_state: int = 42,
-     balance: bool = True,
-     force_downsampling: bool = False,
-     logger: Optional[logging.Logger] = None,
-     **kwargs,
- ) -> pd.DataFrame:
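-     # Forced downsampling overrides every other strategy.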
-     if force_downsampling:
-         return balance_undersample_forced(
-             df,
-             sample_columns,
-             task_type,
-             cv_type,
-             random_state,
-             sample_config.force_sample_size,
-             logger=logger,
-             **kwargs,
-         )
-
-     if sample_columns.eval_set_index in df.columns:
-         fit_sample_threshold = sample_config.fit_sample_threshold_with_eval_set
-         fit_sample_rows = sample_config.fit_sample_rows_with_eval_set
-     else:
-         fit_sample_threshold = sample_config.fit_sample_threshold
-         fit_sample_rows = sample_config.fit_sample_rows
-
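-     # Time-series CV is sampled by truncating the date range rather than
-     # dropping random rows, which would break temporal continuity.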
-     if cv_type is not None and cv_type.is_time_series():
-         return sample_time_series_train_eval(
-             df,
-             sample_columns,
-             sample_config.fit_sample_rows_ts,
-             trim_threshold=fit_sample_threshold,
-             max_rows=fit_sample_rows,
-             random_state=random_state,
-             logger=logger,
-             **kwargs,
-         )
-
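-     # Classification targets are class-balanced before the row-count check.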
-     if task_type is not None and task_type.is_classification() and balance:
-         df = balance_undersample(
-             df=df,
-             target_column=sample_columns.target,
-             task_type=task_type,
-             random_state=random_state,
-             binary_min_sample_threshold=sample_config.binary_min_sample_threshold,
-             multiclass_min_sample_threshold=sample_config.multiclass_min_sample_threshold,
-             binary_bootstrap_loops=sample_config.binary_bootstrap_loops,
-             multiclass_bootstrap_loops=sample_config.multiclass_bootstrap_loops,
-             logger=logger,
-             **kwargs,
-         )
-
-     num_samples = _num_samples(df)
-     if num_samples > fit_sample_threshold:
-         if logger is not None:
-             logger.info(
-                 f"Etalon has size {num_samples} more than threshold {fit_sample_threshold} "
-                 f"and will be downsampled to {fit_sample_rows}"
-             )
-         df = df.sample(n=fit_sample_rows, random_state=random_state)
-         if logger is not None:
-             logger.info(f"Shape after threshold resampling: {df.shape}")
-
-     return df
-
-
- def sample_time_series_train_eval(
-     df: pd.DataFrame,
-     sample_columns: SampleColumns,
-     sample_size: int,
-     trim_threshold: int,
-     max_rows: int,
-     random_state: int = 42,
-     logger: Optional[logging.Logger] = None,
-     bundle: Optional[ResourceBundle] = None,
-     **kwargs,
- ):
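-     # Sample the train part first; the eval part only gets whatever row
-     # budget remains, so it never crowds out the training window.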
-     if sample_columns.eval_set_index in df.columns:
-         train_df = df[df[sample_columns.eval_set_index] == 0]
-         eval_df = df[df[sample_columns.eval_set_index] > 0]
-     else:
-         train_df = df
-         eval_df = None
-
-     train_df = sample_time_series_trunc(
-         train_df, sample_columns.ids, sample_columns.date, sample_size, random_state, logger=logger, **kwargs
-     )
-     if sample_columns.ids and eval_df is not None:
-         missing_ids = (
-             eval_df[~eval_df[sample_columns.ids].isin(np.unique(train_df[sample_columns.ids]))][sample_columns.ids]
-             .dropna()
-             .drop_duplicates()
-             .values.tolist()
-         )
-         if missing_ids:
-             bundle = bundle or get_custom_bundle()
-             print(bundle.get("missing_ids_in_eval_set").format(missing_ids))
-             eval_df = eval_df.merge(train_df[sample_columns.ids].drop_duplicates())
-
-     if eval_df is not None:
-         if len(eval_df) > trim_threshold - len(train_df):
-             eval_df = sample_time_series_trunc(
-                 eval_df,
-                 sample_columns.ids,
-                 sample_columns.date,
-                 max_rows - len(train_df),
-                 random_state,
-                 logger=logger,
-                 **kwargs,
-             )
-         if logger is not None:
-             logger.info(f"Eval set size: {len(eval_df)}")
-         df = pd.concat([train_df, eval_df])
-     elif len(train_df) > max_rows:
-         df = sample_time_series_trunc(
-             train_df,
-             sample_columns.ids,
-             sample_columns.date,
-             max_rows,
-             random_state,
-             logger=logger,
-             **kwargs,
-         )
-     else:
-         df = train_df
-
-     if logger is not None:
-         logger.info(f"Train set size: {len(df)}")
-
-     return df
-
-
- def sample_time_series_trunc(
-     df: pd.DataFrame,
-     id_columns: Optional[List[str]],
-     date_column: str,
-     sample_size: int,
-     random_state: int = 42,
-     logger: Optional[logging.Logger] = None,
-     highfreq_trunc_lengths: List[pd.DateOffset] = TS_DEFAULT_HIGH_FREQ_TRUNC_LENGTHS,
-     lowfreq_trunc_lengths: List[pd.DateOffset] = TS_DEFAULT_LOW_FREQ_TRUNC_LENGTHS,
-     time_unit_threshold: pd.Timedelta = TS_DEFAULT_TIME_UNIT_THRESHOLD,
-     **kwargs,
- ):
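-     # Strategy: cut the date range progressively until the sample fits;
-     # if no window is small enough, fall back to id-based sampling.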
-     if id_columns is None:
-         id_columns = []
-     # Convert date column to datetime
-     dates_df = df[id_columns + [date_column]].copy().reset_index(drop=True)
-     if pd.api.types.is_numeric_dtype(dates_df[date_column]):
-         dates_df[date_column] = pd.to_datetime(dates_df[date_column], unit="ms")
-     else:
-         dates_df[date_column] = pd.to_datetime(dates_df[date_column])
-
-     time_unit = get_most_frequent_time_unit(dates_df, id_columns, date_column)
-     if logger is not None:
-         logger.info(f"Time unit: {time_unit}")
-
-     if time_unit is None:
-         if logger is not None:
-             logger.info("Cannot detect time unit, returning original dataset")
-         return df
-
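-     # High-frequency series get short truncation windows, low-frequency
-     # series get long ones.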
-     if time_unit < time_unit_threshold:
-         for trunc_length in highfreq_trunc_lengths:
-             sampled_df = trunc_datetime(dates_df, id_columns, date_column, trunc_length, logger=logger)
-             if len(sampled_df) <= sample_size:
-                 break
-         if len(sampled_df) > sample_size:
-             sampled_df = sample_time_series(
-                 sampled_df, id_columns, date_column, sample_size, random_state, logger=logger, **kwargs
-             )
-     else:
-         for trunc_length in lowfreq_trunc_lengths:
-             sampled_df = trunc_datetime(dates_df, id_columns, date_column, trunc_length, logger=logger)
-             if len(sampled_df) <= sample_size:
-                 break
-         if len(sampled_df) > sample_size:
-             sampled_df = sample_time_series(
-                 sampled_df, id_columns, date_column, sample_size, random_state, logger=logger, **kwargs
-             )
-
-     return df.iloc[sampled_df.index]
-
-
- def sample_time_series(
-     df: pd.DataFrame,
-     id_columns: List[str],
-     date_column: str,
-     sample_size: int,
-     random_state: int = 42,
-     min_different_ids_ratio: float = TS_MIN_DIFFERENT_IDS_RATIO,
-     prefer_recent_dates: bool = True,
-     logger: Optional[logging.Logger] = None,
-     **kwargs,
- ):
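-     # Keep whole series intact: rank ids by recency and length, then take
-     # as many complete ids as fit into sample_size.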
-     def ensure_tuple(x):
-         return x if isinstance(x, tuple) else (x,)
-
-     random_state = np.random.RandomState(random_state)
-
-     if not id_columns:
-         id_columns = [date_column]
-     ids_sort = df.groupby(id_columns)[date_column].aggregate(["max", "count"]).T.to_dict()
-     ids_sort = {
-         ensure_tuple(k): (
-             (v["max"], v["count"], random_state.rand()) if prefer_recent_dates else (v["count"], random_state.rand())
-         )
-         for k, v in ids_sort.items()
-     }
-     id_counts = df[id_columns].value_counts()
-     id_counts.index = [ensure_tuple(i) for i in id_counts.index]
-     id_counts = id_counts.sort_index(key=lambda x: [ids_sort[y] for y in x], ascending=False).cumsum()
-     id_counts = id_counts[id_counts <= sample_size]
-     min_different_ids = max(int(len(df[id_columns].drop_duplicates()) * min_different_ids_ratio), 1)
-
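-     # Build a boolean row mask for the selected ids; handles both
-     # single-column and multi-column id indexes.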
-     def id_mask(sample_index: pd.Index) -> pd.Index:
-         if isinstance(sample_index, pd.MultiIndex):
-             return pd.MultiIndex.from_frame(df[id_columns]).isin(sample_index)
-         else:
-             return df[id_columns[0]].isin(sample_index)
-
-     if len(id_counts) < min_different_ids:
-         if logger is not None:
-             logger.info(
-                 f"Different ids count {len(id_counts)} for sample size {sample_size}"
-                 f" is less than min different ids {min_different_ids}, sampling time window"
-             )
-         date_counts = df.groupby(id_columns)[date_column].nunique().sort_values(ascending=False)
-         ids_to_sample = date_counts.index[:min_different_ids] if len(id_counts) > 0 else date_counts.index
-         mask = id_mask(ids_to_sample)
-         df = df[mask]
-         sample_date_counts = df[date_column].value_counts().sort_index(ascending=False).cumsum()
-         sample_date_counts = sample_date_counts[sample_date_counts <= sample_size]
-         df = df[df[date_column].isin(sample_date_counts.index)]
-     else:
-         if len(id_columns) > 1:
-             id_counts.index = pd.MultiIndex.from_tuples(id_counts.index)
-         else:
-             id_counts.index = [i[0] for i in id_counts.index]
-         mask = id_mask(id_counts.index)
-         df = df[mask]
-
-     return df
-
-
- def balance_undersample_forced(
-     df: pd.DataFrame,
-     sample_columns: SampleColumns,
-     task_type: ModelTaskType,
-     cv_type: Optional[CVType],
-     random_state: int,
-     sample_size: int = 7000,
-     logger: Optional[logging.Logger] = None,
-     bundle: Optional[ResourceBundle] = None,
-     warning_callback: Optional[Callable] = None,
- ):
-     if len(df) <= sample_size:
-         return df
-
-     if logger is None:
-         logger = logging.getLogger("muted_logger")
-         logger.setLevel("FATAL")
-     bundle = bundle or get_custom_bundle()
-     if SYSTEM_RECORD_ID not in df.columns:
-         raise Exception("System record id must be present for undersampling")
-
-     msg = bundle.get("forced_balance_undersample")
-     logger.info(msg)
-     if warning_callback is not None:
-         warning_callback(msg)
-
-     target = df[sample_columns.target].copy()
-
-     vc = target.value_counts()
-     max_class_value = vc.index[0]
-     min_class_value = vc.index[-1]
-     max_class_count = vc[max_class_value]
-     min_class_count = vc[min_class_value]
-
-     resampled_data = df
-     df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
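-     # Pick a strategy: date-aware sampling for time series, plain random
-     # sampling for multiclass/regression, 50/50 class rebalance for binary.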
-     if cv_type is not None and cv_type.is_time_series():
-         logger.warning(f"Sampling time series dataset from {len(df)} to {sample_size}")
-         resampled_data = sample_time_series_train_eval(
-             df,
-             sample_columns=sample_columns,
-             sample_size=sample_size,
-             trim_threshold=sample_size,
-             max_rows=sample_size,
-             random_state=random_state,
-             logger=logger,
-         )
-     elif task_type in [ModelTaskType.MULTICLASS, ModelTaskType.REGRESSION]:
-         logger.warning(f"Sampling dataset from {len(df)} to {sample_size}")
-         resampled_data = df.sample(n=sample_size, random_state=random_state)
-     else:
-         msg = bundle.get("imbalanced_target").format(min_class_value, min_class_count)
-         logger.warning(msg)
-
-         # fill up to min_sample_threshold by majority class
-         minority_class = df[df[sample_columns.target] == min_class_value]
-         majority_class = df[df[sample_columns.target] != min_class_value]
-         logger.info(
-             f"Min class count: {min_class_count}. Max class count: {max_class_count}."
-             f" Rebalance sample size: {sample_size}"
-         )
-         if len(minority_class) > (sample_size / 2):
-             sampled_minority_class = minority_class.sample(n=int(sample_size / 2), random_state=random_state)
-         else:
-             sampled_minority_class = minority_class
-
-         if len(majority_class) > (sample_size / 2):
-             sampled_majority_class = majority_class.sample(n=int(sample_size / 2), random_state=random_state)
-         else:
-             sampled_majority_class = majority_class
-
-         resampled_data = df[
-             (df[SYSTEM_RECORD_ID].isin(sampled_minority_class[SYSTEM_RECORD_ID]))
-             | (df[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
-         ]
-
-     logger.info(f"Shape after forced rebalance resampling: {resampled_data.shape}")
-     return resampled_data
-
-
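- # Mirrors scikit-learn's sklearn.utils.validation._num_samples helper.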
- def _num_samples(x):
-     """Return number of samples in array-like x."""
-     if x is None:
-         return 0
-     message = "Expected sequence or array-like, got %s" % type(x)
-     if hasattr(x, "fit") and callable(x.fit):
-         # Don't get num_samples from an ensemble's length!
-         raise TypeError(message)
-
-     if not hasattr(x, "__len__") and not hasattr(x, "shape"):
-         if hasattr(x, "__array__"):
-             x = np.asarray(x)
-         else:
-             raise TypeError(message)
-
-     if hasattr(x, "shape") and x.shape is not None:
-         if len(x.shape) == 0:
-             raise TypeError("Singleton array %r cannot be considered a valid collection." % x)
-         # Check that shape is returning an integer or default to len
-         # Dask dataframes may not return numeric shape[0] value
-         if isinstance(x.shape[0], numbers.Integral):
-             return x.shape[0]
-
-     try:
-         return len(x)
-     except TypeError as type_error:
-         raise TypeError(message) from type_error