upgini 1.2.91a3906.dev1__py3-none-any.whl → 1.2.93__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

upgini/__about__.py CHANGED
@@ -1 +1 @@
- __version__ = "1.2.91a3906.dev1"
+ __version__ = "1.2.93"

upgini/autofe/unary.py CHANGED
@@ -190,11 +190,3 @@ class Bin(PandasOperator):
          if isinstance(value, str):
              return json.loads(value)
          return value
-
-
- class Cluster(PandasOperator):
-     name: str = "cluster"
-     is_unary: bool = True
-     input_type: Optional[str] = "vector"
-     output_type: Optional[str] = "category"
-     is_categorical: bool = True

upgini/dataset.py CHANGED
@@ -38,11 +38,7 @@ from upgini.metadata import (
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
  from upgini.search_task import SearchTask
  from upgini.utils.email_utils import EmailSearchKeyConverter
- from upgini.utils.target_utils import (
-     balance_undersample,
-     balance_undersample_forced,
-     balance_undersample_time_series_trunc,
- )
+ from upgini.utils.sample_utils import SampleColumns, SampleConfig, sample

  try:
      from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
@@ -52,20 +48,10 @@ except Exception:
      )


- class Dataset:  # (pd.DataFrame):
+ class Dataset:
      MIN_ROWS_COUNT = 100
      MAX_ROWS = 200_000
-     FIT_SAMPLE_ROWS = 200_000
-     FIT_SAMPLE_THRESHOLD = 200_000
-     FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
-     FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
-     FIT_SAMPLE_THRESHOLD_TS = 54_000
-     FIT_SAMPLE_ROWS_TS = 54_000
-     BINARY_MIN_SAMPLE_THRESHOLD = 5_000
-     MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
      IMBALANCE_THESHOLD = 0.6
-     BINARY_BOOTSTRAP_LOOPS = 5
-     MULTICLASS_BOOTSTRAP_LOOPS = 2
      MIN_TARGET_CLASS_ROWS = 100
      MAX_MULTICLASS_CLASS_COUNT = 100
      MIN_SUPPORTED_DATE_TS = 946684800000  # 2000-01-01
@@ -88,6 +74,7 @@ class Dataset:  # (pd.DataFrame):
          date_column: Optional[str] = None,
          id_columns: Optional[List[str]] = None,
          random_state: Optional[int] = None,
+         sample_config: Optional[SampleConfig] = None,
          rest_client: Optional[_RestClient] = None,
          logger: Optional[logging.Logger] = None,
          bundle: Optional[ResourceBundle] = None,
@@ -95,6 +82,7 @@ class Dataset:  # (pd.DataFrame):
          **kwargs,
      ):
          self.bundle = bundle or get_custom_bundle()
+         self.sample_config = sample_config or SampleConfig(force_sample_size=self.FORCE_SAMPLE_SIZE)
          if df is not None:
              data = df.copy()
          elif path is not None:
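
Note on the constructor change above: the sampling constants deleted from the class body are superseded by the new per-instance SampleConfig. Only three of its fields are visible in this diff (force_sample_size, binary_min_sample_threshold, fit_sample_rows_ts), so the sketch below is a hedged reconstruction rather than the actual upgini/utils/sample_utils.py definition; the defaults shown are assumptions carried over from the removed class constants.

# Hedged sketch of SampleConfig; only fields referenced in this diff are listed.
from dataclasses import dataclass

@dataclass
class SampleConfig:
    force_sample_size: int                    # Dataset falls back to SampleConfig(force_sample_size=Dataset.FORCE_SAMPLE_SIZE)
    binary_min_sample_threshold: int = 5_000  # assumed default, was Dataset.BINARY_MIN_SAMPLE_THRESHOLD
    fit_sample_rows_ts: int = 54_000          # assumed default, was Dataset.FIT_SAMPLE_ROWS_TS

Passing no sample_config keeps the previous behaviour, since the constructor falls back to SampleConfig(force_sample_size=self.FORCE_SAMPLE_SIZE).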
@@ -233,109 +221,70 @@ class Dataset:  # (pd.DataFrame):
              raise ValidationError(self.bundle.get("dataset_invalid_timeseries_target").format(target.dtype))

      def __resample(self, force_downsampling=False):
-         # self.logger.info("Resampling etalon")
-         # Resample imbalanced target. Only train segment (without eval_set)
-         if force_downsampling:
-             target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
-             self.data = balance_undersample_forced(
-                 df=self.data,
-                 target_column=target_column,
-                 task_type=self.task_type,
-                 cv_type=self.cv_type,
-                 date_column=self.date_column,
-                 id_columns=self.id_columns,
-                 random_state=self.random_state,
-                 sample_size=self.FORCE_SAMPLE_SIZE,
-                 logger=self.logger,
-                 bundle=self.bundle,
-                 warning_callback=self.warning_callback,
-             )
-             return

-         if EVAL_SET_INDEX in self.data.columns:
+         if EVAL_SET_INDEX in self.data.columns and not force_downsampling:
              train_segment = self.data[self.data[EVAL_SET_INDEX] == 0]
          else:
              train_segment = self.data

-         if self.task_type == ModelTaskType.MULTICLASS or (
-             self.task_type == ModelTaskType.BINARY and len(train_segment) > self.BINARY_MIN_SAMPLE_THRESHOLD
-         ):
-             count = len(train_segment)
-             target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
-             target = train_segment[target_column]
-             target_classes_count = target.nunique()
+         self.imbalanced = self.__is_imbalanced(train_segment)

-             if target_classes_count > self.MAX_MULTICLASS_CLASS_COUNT:
-                 msg = self.bundle.get("dataset_to_many_multiclass_targets").format(
-                     target_classes_count, self.MAX_MULTICLASS_CLASS_COUNT
-                 )
-                 self.logger.warning(msg)
-                 raise ValidationError(msg)
+         sample_columns = SampleColumns(
+             ids=self.id_columns,
+             date=self.date_column,
+             target=self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET),
+             eval_set_index=EVAL_SET_INDEX,
+         )

-             vc = target.value_counts()
-             min_class_value = vc.index[len(vc) - 1]
-             min_class_count = vc[min_class_value]
+         self.data = sample(
+             train_segment if self.imbalanced else self.data,  # for imbalanced data we will be doing transform anyway
+             self.task_type,
+             self.cv_type,
+             self.sample_config,
+             sample_columns,
+             self.random_state,
+             balance=self.imbalanced,
+             force_downsampling=force_downsampling,
+             logger=self.logger,
+             bundle=self.bundle,
+             warning_callback=self.warning_callback,
+         )

-             if min_class_count < self.MIN_TARGET_CLASS_ROWS:
-                 msg = self.bundle.get("dataset_rarest_class_less_min").format(
-                     min_class_value, min_class_count, self.MIN_TARGET_CLASS_ROWS
-                 )
-                 self.logger.warning(msg)
-                 raise ValidationError(msg)
+     def __is_imbalanced(self, data: pd.DataFrame) -> bool:
+         if self.task_type is None or not self.task_type.is_classification():
+             return False

-             min_class_percent = self.IMBALANCE_THESHOLD / target_classes_count
-             min_class_threshold = min_class_percent * count
-
-             # If min class count less than 30% for binary or (60 / classes_count)% for multiclass
-             if min_class_count < min_class_threshold:
-                 self.imbalanced = True
-                 self.data = balance_undersample(
-                     df=train_segment,
-                     target_column=target_column,
-                     task_type=self.task_type,
-                     random_state=self.random_state,
-                     binary_min_sample_threshold=self.BINARY_MIN_SAMPLE_THRESHOLD,
-                     multiclass_min_sample_threshold=self.MULTICLASS_MIN_SAMPLE_THRESHOLD,
-                     binary_bootstrap_loops=self.BINARY_BOOTSTRAP_LOOPS,
-                     multiclass_bootstrap_loops=self.MULTICLASS_BOOTSTRAP_LOOPS,
-                     logger=self.logger,
-                     bundle=self.bundle,
-                     warning_callback=self.warning_callback,
-                 )
+         if self.task_type == ModelTaskType.BINARY and len(data) <= self.sample_config.binary_min_sample_threshold:
+             return False

-         # Resample over fit threshold
-         if self.cv_type is not None and self.cv_type.is_time_series():
-             sample_threshold = self.FIT_SAMPLE_THRESHOLD_TS
-             sample_rows = self.FIT_SAMPLE_ROWS_TS
-         elif not self.imbalanced and EVAL_SET_INDEX in self.data.columns:
-             sample_threshold = self.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD
-             sample_rows = self.FIT_SAMPLE_WITH_EVAL_SET_ROWS
-         else:
-             sample_threshold = self.FIT_SAMPLE_THRESHOLD
-             sample_rows = self.FIT_SAMPLE_ROWS
+         count = len(data)
+         target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
+         target = data[target_column]
+         target_classes_count = target.nunique()

-         if len(self.data) > sample_threshold:
-             self.logger.info(
-                 f"Etalon has size {len(self.data)} more than threshold {sample_threshold} "
-                 f"and will be downsampled to {sample_rows}"
+         if target_classes_count > self.MAX_MULTICLASS_CLASS_COUNT:
+             msg = self.bundle.get("dataset_to_many_multiclass_targets").format(
+                 target_classes_count, self.MAX_MULTICLASS_CLASS_COUNT
              )
-             if self.cv_type is not None and self.cv_type.is_time_series():
-                 resampled_data = balance_undersample_time_series_trunc(
-                     df=self.data,
-                     id_columns=self.id_columns,
-                     date_column=next(
-                         k
-                         for k, v in self.meaning_types.items()
-                         if v in [FileColumnMeaningType.DATE, FileColumnMeaningType.DATETIME]
-                     ),
-                     sample_size=sample_rows,
-                     random_state=self.random_state,
-                     logger=self.logger,
-                 )
-             else:
-                 resampled_data = self.data.sample(n=sample_rows, random_state=self.random_state)
-             self.data = resampled_data
-             self.logger.info(f"Shape after threshold resampling: {self.data.shape}")
+             self.logger.warning(msg)
+             raise ValidationError(msg)
+
+         vc = target.value_counts()
+         min_class_value = vc.index[len(vc) - 1]
+         min_class_count = vc[min_class_value]
+
+         if min_class_count < self.MIN_TARGET_CLASS_ROWS:
+             msg = self.bundle.get("dataset_rarest_class_less_min").format(
+                 min_class_value, min_class_count, self.MIN_TARGET_CLASS_ROWS
+             )
+             self.logger.warning(msg)
+             raise ValidationError(msg)
+
+         min_class_percent = self.IMBALANCE_THESHOLD / target_classes_count
+         min_class_threshold = min_class_percent * count
+
+         # If min class count less than 30% for binary or (60 / classes_count)% for multiclass
+         return bool(min_class_count < min_class_threshold)

      def __validate_dataset(self, validate_target: bool, silent_mode: bool):
          """Validate DataSet"""
@@ -617,8 +566,8 @@ class Dataset:  # (pd.DataFrame):
      def _set_sample_size(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
          if runtime_parameters is not None and runtime_parameters.properties is not None:
              if self.cv_type is not None and self.cv_type.is_time_series():
-                 runtime_parameters.properties["sample_size"] = self.FIT_SAMPLE_ROWS_TS
-                 runtime_parameters.properties["iter0_sample_size"] = self.FIT_SAMPLE_ROWS_TS
+                 runtime_parameters.properties["sample_size"] = self.sample_config.fit_sample_rows_ts
+                 runtime_parameters.properties["iter0_sample_size"] = self.sample_config.fit_sample_rows_ts
          return runtime_parameters

      def _clean_generate_features(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
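
Practical effect of this last hunk: the time-series sample size sent through runtime_parameters now follows the instance's SampleConfig rather than the removed FIT_SAMPLE_ROWS_TS class constant. A hedged illustration, assuming fit_sample_rows_ts can be set on the config (this diff only shows it being read):

# Illustration only; the assignment below assumes SampleConfig behaves like a plain dataclass.
config = SampleConfig(force_sample_size=200_000)  # placeholder force size
config.fit_sample_rows_ts = 30_000                # shrink the time-series fit sample

# A Dataset constructed with sample_config=config would then send
# runtime_parameters.properties["sample_size"] == 30_000
# (and the same for "iter0_sample_size") when cv_type is a time series.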