upgini 1.2.114a4__py3-none-any.whl → 1.2.115a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.114a4"
1
+ __version__ = "1.2.115a1"
upgini/dataset.py CHANGED
@@ -25,7 +25,6 @@ from upgini.metadata import (
25
25
  AutoFEParameters,
26
26
  CVType,
27
27
  DataType,
28
- FeaturesFilter,
29
28
  FileColumnMeaningType,
30
29
  FileColumnMetadata,
31
30
  FileMetadata,
@@ -37,8 +36,9 @@ from upgini.metadata import (
37
36
  )
38
37
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
39
38
  from upgini.search_task import SearchTask
39
+ from upgini.utils.config import SampleConfig
40
40
  from upgini.utils.email_utils import EmailSearchKeyConverter
41
- from upgini.utils.sample_utils import SampleColumns, SampleConfig, sample
41
+ from upgini.utils.sample_utils import SampleColumns, sample
42
42
 
43
43
  try:
44
44
  from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
@@ -51,9 +51,6 @@ except Exception:
51
51
  class Dataset:
52
52
  MIN_ROWS_COUNT = 100
53
53
  MAX_ROWS = 200_000
54
- IMBALANCE_THESHOLD = 0.6
55
- MIN_TARGET_CLASS_ROWS = 100
56
- MAX_MULTICLASS_CLASS_COUNT = 100
57
54
  MIN_SUPPORTED_DATE_TS = 946684800000 # 2000-01-01
58
55
  MAX_FEATURES_COUNT = 3500
59
56
  MAX_UPLOADING_FILE_SIZE = 268435456 # 256 Mb
@@ -73,6 +70,7 @@ class Dataset:
73
70
  cv_type: Optional[CVType] = None,
74
71
  date_column: Optional[str] = None,
75
72
  id_columns: Optional[List[str]] = None,
73
+ is_imbalanced: bool = False,
76
74
  random_state: Optional[int] = None,
77
75
  sample_config: Optional[SampleConfig] = None,
78
76
  rest_client: Optional[_RestClient] = None,
@@ -117,8 +115,9 @@ class Dataset:
117
115
  self.rest_client = rest_client
118
116
  self.random_state = random_state
119
117
  self.columns_renaming: Dict[str, str] = {}
120
- self.imbalanced: bool = False
118
+ self.is_imbalanced: bool = False
121
119
  self.id_columns = id_columns
120
+ self.is_imbalanced = is_imbalanced
122
121
  self.date_column = date_column
123
122
  if logger is not None:
124
123
  self.logger = logger
@@ -239,8 +238,6 @@ class Dataset:
239
238
  else:
240
239
  train_segment = self.data
241
240
 
242
- self.imbalanced = self.__is_imbalanced(train_segment)
243
-
244
241
  sample_columns = SampleColumns(
245
242
  ids=self.id_columns,
246
243
  date=self.date_column,
@@ -249,55 +246,19 @@ class Dataset:
249
246
  )
250
247
 
251
248
  self.data = sample(
252
- train_segment if self.imbalanced else self.data, # for imbalanced data we will be doing transform anyway
249
+ train_segment if self.is_imbalanced else self.data, # for imbalanced data we will be doing transform anyway
253
250
  self.task_type,
254
251
  self.cv_type,
255
252
  self.sample_config,
256
253
  sample_columns,
257
254
  self.random_state,
258
- balance=self.imbalanced,
255
+ balance=self.is_imbalanced,
259
256
  force_downsampling=force_downsampling,
260
257
  logger=self.logger,
261
258
  bundle=self.bundle,
262
259
  warning_callback=self.warning_callback,
263
260
  )
264
261
 
265
- def __is_imbalanced(self, data: pd.DataFrame) -> bool:
266
- if self.task_type is None or not self.task_type.is_classification():
267
- return False
268
-
269
- if self.task_type == ModelTaskType.BINARY and len(data) <= self.sample_config.binary_min_sample_threshold:
270
- return False
271
-
272
- count = len(data)
273
- target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
274
- target = data[target_column]
275
- target_classes_count = target.nunique()
276
-
277
- if target_classes_count > self.MAX_MULTICLASS_CLASS_COUNT:
278
- msg = self.bundle.get("dataset_to_many_multiclass_targets").format(
279
- target_classes_count, self.MAX_MULTICLASS_CLASS_COUNT
280
- )
281
- self.logger.warning(msg)
282
- raise ValidationError(msg)
283
-
284
- vc = target.value_counts()
285
- min_class_value = vc.index[len(vc) - 1]
286
- min_class_count = vc[min_class_value]
287
-
288
- if min_class_count < self.MIN_TARGET_CLASS_ROWS:
289
- msg = self.bundle.get("dataset_rarest_class_less_min").format(
290
- min_class_value, min_class_count, self.MIN_TARGET_CLASS_ROWS
291
- )
292
- self.logger.warning(msg)
293
- raise ValidationError(msg)
294
-
295
- min_class_percent = self.IMBALANCE_THESHOLD / target_classes_count
296
- min_class_threshold = min_class_percent * count
297
-
298
- # If min class count less than 30% for binary or (60 / classes_count)% for multiclass
299
- return bool(min_class_count < min_class_threshold)
300
-
301
262
  def __validate_dataset(self, validate_target: bool, silent_mode: bool):
302
263
  """Validate DataSet"""
303
264
  # self.logger.info("validating etalon")
@@ -537,9 +498,6 @@ class Dataset:
537
498
  return_scores: bool,
538
499
  extract_features: bool,
539
500
  accurate_model: Optional[bool] = None,
540
- importance_threshold: Optional[float] = None,
541
- max_features: Optional[int] = None,
542
- filter_features: Optional[dict] = None,
543
501
  runtime_parameters: Optional[RuntimeParameters] = None,
544
502
  metrics_calculation: Optional[bool] = False,
545
503
  auto_fe_parameters: Optional[AutoFEParameters] = None,
@@ -548,28 +506,12 @@ class Dataset:
548
506
  search_customization = SearchCustomization(
549
507
  extractFeatures=extract_features,
550
508
  accurateModel=accurate_model,
551
- importanceThreshold=importance_threshold,
552
- maxFeatures=max_features,
553
509
  returnScores=return_scores,
554
510
  runtimeParameters=runtime_parameters,
555
511
  metricsCalculation=metrics_calculation,
556
512
  )
557
- if filter_features:
558
- if [
559
- key
560
- for key in filter_features
561
- if key not in {"min_importance", "max_psi", "max_count", "selected_features"}
562
- ]:
563
- raise ValidationError(self.bundle.get("dataset_invalid_filter"))
564
- feature_filter = FeaturesFilter(
565
- minImportance=filter_features.get("min_importance"),
566
- maxPSI=filter_features.get("max_psi"),
567
- maxCount=filter_features.get("max_count"),
568
- selectedFeatures=filter_features.get("selected_features"),
569
- )
570
- search_customization.featuresFilter = feature_filter
571
513
 
572
- search_customization.runtimeParameters.properties["etalon_imbalanced"] = self.imbalanced
514
+ search_customization.runtimeParameters.properties["etalon_imbalanced"] = self.is_imbalanced
573
515
  if auto_fe_parameters is not None:
574
516
  search_customization.runtimeParameters.properties["feature_generation_params.ts.gap_days"] = (
575
517
  auto_fe_parameters.ts_gap_days
@@ -624,9 +566,6 @@ class Dataset:
624
566
  extract_features: bool = False,
625
567
  accurate_model: bool = False,
626
568
  exclude_features_sources: Optional[List[str]] = None,
627
- importance_threshold: Optional[float] = None, # deprecated
628
- max_features: Optional[int] = None, # deprecated
629
- filter_features: Optional[dict] = None, # deprecated
630
569
  runtime_parameters: Optional[RuntimeParameters] = None,
631
570
  auto_fe_parameters: Optional[AutoFEParameters] = None,
632
571
  force_downsampling: bool = False,
@@ -643,9 +582,6 @@ class Dataset:
643
582
  return_scores=return_scores,
644
583
  extract_features=extract_features,
645
584
  accurate_model=accurate_model,
646
- importance_threshold=importance_threshold,
647
- max_features=max_features,
648
- filter_features=filter_features,
649
585
  runtime_parameters=runtime_parameters,
650
586
  auto_fe_parameters=auto_fe_parameters,
651
587
  )