upgini 1.1.261a3250.post2__py3-none-any.whl → 1.1.262a3250.post4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/autofe/date.py CHANGED
@@ -1,42 +1,51 @@
+ from typing import Optional, Union
  import numpy as np
  import pandas as pd

- from upgini.autofe.operand import PandasOperand, VectorizableMixin
+ from upgini.autofe.operand import PandasOperand


- class DateDiff(PandasOperand, VectorizableMixin):
+ class DateDiffMixin:
+     diff_unit: str = "D"
+     left_unit: Optional[str] = None
+     right_unit: Optional[str] = None
+
+     def _convert_to_date(
+         self, x: Union[pd.DataFrame, pd.Series], unit: Optional[str]
+     ) -> Union[pd.DataFrame, pd.Series]:
+         if isinstance(x, pd.DataFrame):
+             return x.apply(lambda y: self._convert_to_date(y, unit), axis=1)
+
+         return pd.to_datetime(x, unit=unit)
+
+
+ class DateDiff(PandasOperand, DateDiffMixin):
      name = "date_diff"
      is_binary = True
      has_symmetry_importance = True
-     is_vectorizable = True
-     unit: str = "D"

      def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
-         return self.__replace_negative((left - right) / np.timedelta64(1, self.unit))
-
-     def calculate_group(self, data: pd.DataFrame, **kwargs) -> pd.DataFrame:
-         group_column, value_columns = self.validate_calculation(data.columns, **kwargs)
-         d1 = data[value_columns]
-         d2 = data[group_column]
-
-         return self.__replace_negative(d1.sub(d2, axis=0) / np.timedelta64(1, self.unit))
+         left = self._convert_to_date(left, self.left_unit)
+         right = self._convert_to_date(right, self.right_unit)
+         return self.__replace_negative((left - right) / np.timedelta64(1, self.diff_unit))

-     def __replace_negative(self, df):
-         df[df < 0] = None
-         return df
+     def __replace_negative(self, x: Union[pd.DataFrame, pd.Series]):
+         x[x < 0] = None
+         return x


- class DateDiffFuture(PandasOperand):
+ class DateDiffFuture(PandasOperand, DateDiffMixin):
      name = "date_diff_future"
      is_binary = True
      has_symmetry_importance = True
      is_vectorizable = False
-     unit: str = "D"

      def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
+         left = self._convert_to_date(left, self.left_unit)
+         right = self._convert_to_date(right, self.right_unit)
          future = pd.to_datetime(dict(day=right.dt.day, month=right.dt.month, year=left.dt.year))
          before = future[future < left]
          future[future < left] = pd.to_datetime(dict(day=before.dt.day, month=before.dt.month, year=before.dt.year + 1))
-         diff = (future - left) / np.timedelta64(1, self.unit)
+         diff = (future - left) / np.timedelta64(1, self.diff_unit)

          return diff
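
Editor's note (not part of the package diff): a minimal sketch of using the reworked DateDiff operand after this change. The inputs and column values below are made up, and it assumes the operand can be instantiated with its defaults (left_unit/right_unit of None, diff_unit of "D"):

    import pandas as pd
    from upgini.autofe.date import DateDiff

    # Hypothetical inputs: two already-parsed datetime columns, so no unit conversion is needed.
    left = pd.to_datetime(pd.Series(["2023-01-01", "2020-01-01"]))
    right = pd.to_datetime(pd.Series(["2022-01-01", "2021-06-01"]))

    op = DateDiff()
    days = op.calculate_binary(left, right)  # first value is 365.0; the negative diff becomes None
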
upgini/autofe/feature.py CHANGED
@@ -305,7 +305,7 @@ class FeatureGroup:
          grouped_features = []

          def groupby_func(f: Feature) -> Tuple[Operand, Union[Column, Feature]]:
-             return (f.op, f.children[0] if f.op.is_unary or f.op.is_vector else f.children[1])
+             return (f.op, f.children[0 if not f.op.is_vectorizable else f.op.group_index])

          for op_child, features in itertools.groupby(candidates, groupby_func):
              op, main_child = op_child
upgini/autofe/operand.py CHANGED
@@ -73,6 +73,8 @@ class PandasOperand(Operand, abc.ABC):


  class VectorizableMixin(Operand):
+     group_index: int = 1
+
      def validate_calculation(self, input_columns: List[str], **kwargs) -> Tuple[str, List[str]]:
          if not kwargs.get(MAIN_COLUMN):
              raise ValueError(f"Expected argument {MAIN_COLUMN} for grouping operator {self.name} not found")
upgini/autofe/unary.py CHANGED
@@ -1,12 +1,13 @@
- from upgini.autofe.operand import PandasOperand
+ from upgini.autofe.operand import PandasOperand, VectorizableMixin
  import numpy as np
  import pandas as pd


- class Abs(PandasOperand):
+ class Abs(PandasOperand, VectorizableMixin):
      name = "abs"
      is_unary = True
      is_vectorizable = True
+     group_index = 0

      def calculate_unary(self, data: pd.Series) -> pd.Series:
          return data.abs()
@@ -15,11 +16,12 @@ class Abs(PandasOperand):
          return data.abs()


- class Log(PandasOperand):
+ class Log(PandasOperand, VectorizableMixin):
      name = "log"
      is_unary = True
      is_vectorizable = True
      output_type = "float"
+     group_index = 0

      def calculate_unary(self, data: pd.Series) -> pd.Series:
          return self._round_value(np.log(np.abs(data.replace(0, np.nan))), 10)
@@ -28,11 +30,12 @@ class Log(PandasOperand):
          return self._round_value(np.log(data.replace(0, np.nan).abs()), 10)


- class Sqrt(PandasOperand):
+ class Sqrt(PandasOperand, VectorizableMixin):
      name = "sqrt"
      is_unary = True
      is_vectorizable = True
      output_type = "float"
+     group_index = 0

      def calculate_unary(self, data: pd.Series) -> pd.Series:
          return self._round_value(np.sqrt(np.abs(data)))
@@ -41,10 +44,11 @@ class Sqrt(PandasOperand):
          return self._round_value(np.sqrt(data.abs()))


- class Square(PandasOperand):
+ class Square(PandasOperand, VectorizableMixin):
      name = "square"
      is_unary = True
      is_vectorizable = True
+     group_index = 0

      def calculate_unary(self, data: pd.Series) -> pd.Series:
          return np.square(data)
@@ -53,11 +57,12 @@ class Square(PandasOperand):
          return np.square(data)


- class Sigmoid(PandasOperand):
+ class Sigmoid(PandasOperand, VectorizableMixin):
      name = "sigmoid"
      is_unary = True
      is_vectorizable = True
      output_type = "float"
+     group_index = 0

      def calculate_unary(self, data: pd.Series) -> pd.Series:
          return self._round_value(1 / (1 + np.exp(-data)))
@@ -66,12 +71,13 @@ class Sigmoid(PandasOperand):
          return self._round_value(1 / (1 + np.exp(-data)))


- class Floor(PandasOperand):
+ class Floor(PandasOperand, VectorizableMixin):
      name = "floor"
      is_unary = True
      is_vectorizable = True
      output_type = "int"
      input_type = "continuous"
+     group_index = 0

      def calculate_unary(self, data: pd.Series) -> pd.Series:
          return np.floor(data)
@@ -80,11 +86,12 @@ class Floor(PandasOperand):
          return np.floor(data)


- class Residual(PandasOperand):
+ class Residual(PandasOperand, VectorizableMixin):
      name = "residual"
      is_unary = True
      is_vectorizable = True
      input_type = "continuous"
+     group_index = 0

      def calculate_unary(self, data: pd.Series) -> pd.Series:
          return data - np.floor(data)
upgini/autofe/vector.py CHANGED
@@ -1,20 +1,22 @@
  from typing import List
  import pandas as pd
- from upgini.autofe.operand import PandasOperand
+ from upgini.autofe.operand import PandasOperand, VectorizableMixin


- class Mean(PandasOperand):
+ class Mean(PandasOperand, VectorizableMixin):
      name = "mean"
      output_type = "float"
      is_vector = True
+     group_index = 0

      def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
          return pd.DataFrame(data).T.fillna(0).mean(axis=1)


- class Sum(PandasOperand):
+ class Sum(PandasOperand, VectorizableMixin):
      name = "sum"
      is_vector = True
+     group_index = 0

      def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
          return pd.DataFrame(data).T.fillna(0).sum(axis=1)
upgini/data_source/data_source_publisher.py CHANGED
@@ -40,7 +40,7 @@ class DataSourcePublisher:
          if logs_enabled:
              self.logger = LoggerFactory().get_logger(endpoint, api_key)
          else:
-             self.logger = logging.getLogger()
+             self.logger = logging.getLogger("muted_logger")
              self.logger.setLevel("FATAL")

      def place(
@@ -170,6 +170,7 @@ class DataSourcePublisher:
              print(msg)
              self.logger.info(msg)
              self._rest_client.stop_ads_management_task(task_id, trace_id)
+             raise
          except Exception:
              self.logger.exception("Failed to register data table")
              raise
@@ -289,6 +290,7 @@ class DataSourcePublisher:
              raise ValidationError("One of arguments: bq_table_id or search_keys should be presented")
          if bq_table_id is not None and search_keys is not None:
              raise ValidationError("Only one argument could be presented: bq_table_id or search_keys")
+         task_id = None
          try:
              search_keys = [k.value.value for k in search_keys] if search_keys else None
              request = {"bqTableId": bq_table_id, "searchKeys": search_keys}
@@ -303,6 +305,13 @@ class DataSourcePublisher:
                  raise Exception("Failed to register ADS: " + status_response["errorMessage"])

              print("Uploading successfully finished")
+         except KeyboardInterrupt:
+             if task_id is not None:
+                 msg = f"Stopping AdsManagementTask {task_id}"
+                 print(msg)
+                 self.logger.info(msg)
+                 self._rest_client.stop_ads_management_task(task_id, trace_id)
+             raise
          except Exception:
              self.logger.exception(f"Failed to upload table {bq_table_id}")
              raise
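
Editor's note: the hunks above add Ctrl-C handling so a server-side ADS management task is stopped before the interrupt propagates. A simplified, self-contained sketch of the pattern (the client object below is a hypothetical stand-in, not the publisher's actual API):

    class _StubClient:
        """Hypothetical stand-in for the publisher's REST client."""
        def start_task(self) -> str:
            return "task-123"
        def poll_until_done(self, task_id: str) -> None:
            pass
        def stop_task(self, task_id: str) -> None:
            print(f"stopped {task_id}")

    client = _StubClient()
    task_id = None
    try:
        task_id = client.start_task()   # may raise before task_id is assigned
        client.poll_until_done(task_id)
    except KeyboardInterrupt:
        if task_id is not None:
            client.stop_task(task_id)   # cancel the remote task first
        raise                           # re-raise so the caller still sees the interrupt
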
upgini/dataset.py CHANGED
@@ -39,10 +39,10 @@ from upgini.metadata import (
  )
  from upgini.normalizer.phone_normalizer import PhoneNormalizer
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
- from upgini.sampler.random_under_sampler import RandomUnderSampler
  from upgini.search_task import SearchTask
  from upgini.utils import combine_search_keys, find_numbers_with_decimal_comma
  from upgini.utils.email_utils import EmailSearchKeyConverter
+ from upgini.utils.target_utils import balance_undersample

  try:
      from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
@@ -61,6 +61,8 @@ class Dataset:  # (pd.DataFrame):
      FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
      MIN_SAMPLE_THRESHOLD = 5_000
      IMBALANCE_THESHOLD = 0.4
+     BINARY_BOOTSTRAP_LOOPS = 5
+     MULTICLASS_BOOTSTRAP_LOOPS = 2
      MIN_TARGET_CLASS_ROWS = 100
      MAX_MULTICLASS_CLASS_COUNT = 100
      MIN_SUPPORTED_DATE_TS = 946684800000  # 2000-01-01
@@ -460,10 +462,8 @@ class Dataset:  # (pd.DataFrame):
              self.task_type == ModelTaskType.BINARY and len(train_segment) > self.MIN_SAMPLE_THRESHOLD
          ):
              count = len(train_segment)
-             min_class_count = count
-             min_class_value = None
-             target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, "")
-             target = train_segment[target_column].copy()
+             target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
+             target = train_segment[target_column]
              target_classes_count = target.nunique()

              if target_classes_count > self.MAX_MULTICLASS_CLASS_COUNT:
@@ -473,12 +473,9 @@ class Dataset:  # (pd.DataFrame):
                  self.logger.warning(msg)
                  raise ValidationError(msg)

-             unique_target = target.unique()
-             for v in list(unique_target):  # type: ignore
-                 current_class_count = len(train_segment.loc[target == v])
-                 if current_class_count < min_class_count:
-                     min_class_count = current_class_count
-                     min_class_value = v
+             vc = target.value_counts()
+             min_class_value = vc.index[len(vc) - 1]
+             min_class_count = vc[min_class_value]

              if min_class_count < self.MIN_TARGET_CLASS_ROWS:
                  msg = self.bundle.get("dataset_rarest_class_less_min").format(
@@ -491,53 +488,19 @@ class Dataset:  # (pd.DataFrame):
              min_class_threshold = min_class_percent * count

              if min_class_count < min_class_threshold:
-                 msg = self.bundle.get("dataset_rarest_class_less_threshold").format(
-                     min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
-                 )
-                 self.logger.warning(msg)
-                 print(msg)
-                 self.warning_counter.increment()
-
-                 train_segment = train_segment.copy().sort_values(by=SYSTEM_RECORD_ID)
-                 if self.task_type == ModelTaskType.MULTICLASS:
-                     # Sort classes by rows count and find 25% quantile class
-                     classes = target.value_counts().index
-                     quantile25_idx = int(0.75 * len(classes))
-                     quantile25_class = classes[quantile25_idx]
-                     count_of_quantile25_class = len(target[target == quantile25_class])
-                     msg = self.bundle.get("imbalance_multiclass").format(quantile25_class, count_of_quantile25_class)
-                     self.logger.warning(msg)
-                     print(msg)
-                     # 25% and lower classes will stay as is. Higher classes will be downsampled
-                     parts = []
-                     for class_idx in range(quantile25_idx):
-                         sampled = train_segment[train_segment[target_column] == classes[class_idx]].sample(
-                             n=count_of_quantile25_class, random_state=self.random_state
-                         )
-                         parts.append(sampled)
-                     for class_idx in range(quantile25_idx, len(classes)):
-                         parts.append(train_segment[train_segment[target_column] == classes[class_idx]])
-                     resampled_data = pd.concat(parts)
-                 elif self.task_type == ModelTaskType.BINARY and min_class_count < self.MIN_SAMPLE_THRESHOLD / 2:
-                     minority_class = train_segment[train_segment[target_column] == min_class_value]
-                     majority_class = train_segment[train_segment[target_column] != min_class_value]
-                     sampled_majority_class = majority_class.sample(
-                         n=self.MIN_SAMPLE_THRESHOLD - min_class_count, random_state=self.random_state
-                     )
-                     resampled_data = train_segment[
-                         (train_segment[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
-                         | (train_segment[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
-                     ]
-                 else:
-                     sampler = RandomUnderSampler(random_state=self.random_state)
-                     X = train_segment[SYSTEM_RECORD_ID]
-                     X = X.to_frame(SYSTEM_RECORD_ID)
-                     new_x, _ = sampler.fit_resample(X, target)  # type: ignore
-                     resampled_data = train_segment[train_segment[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
-
-                 self.data = resampled_data
-                 self.logger.info(f"Shape after rebalance resampling: {self.data.shape}")
                  self.imbalanced = True
+                 self.data = balance_undersample(
+                     df=train_segment,
+                     target_column=target_column,
+                     task_type=self.task_type,
+                     random_state=self.random_state,
+                     imbalance_threshold=self.IMBALANCE_THESHOLD,
+                     binary_bootstrap_loops=self.BINARY_BOOTSTRAP_LOOPS,
+                     multiclass_bootstrap_loops=self.MULTICLASS_BOOTSTRAP_LOOPS,
+                     logger=self.logger,
+                     bundle=self.bundle,
+                     warning_counter=self.warning_counter,
+                 )

          # Resample over fit threshold
          if not self.imbalanced and EVAL_SET_INDEX in self.data.columns:
upgini/features_enricher.py CHANGED
@@ -220,7 +220,7 @@ class FeaturesEnricher(TransformerMixin):
          if logs_enabled:
              self.logger = LoggerFactory().get_logger(endpoint, self._api_key, client_ip, client_visitorid)
          else:
-             self.logger = logging.getLogger()
+             self.logger = logging.getLogger("muted_logger")
              self.logger.setLevel("FATAL")

          if len(kwargs) > 0:
upgini/search_task.py CHANGED
@@ -57,7 +57,7 @@ class SearchTask:
          if logger is not None:
              self.logger = logger
          else:
-             self.logger = logging.getLogger()
+             self.logger = logging.getLogger("muted_logger")
              self.logger.setLevel("FATAL")
          self.provider_metadata_v2: Optional[List[ProviderTaskMetadataV2]] = None
          self.unused_features_for_generation: Optional[List[str]] = None
upgini/utils/datetime_utils.py CHANGED
@@ -44,7 +44,7 @@ class DateTimeSearchKeyConverter:
          if logger is not None:
              self.logger = logger
          else:
-             self.logger = logging.getLogger()
+             self.logger = logging.getLogger("muted_logger")
              self.logger.setLevel("FATAL")
          self.generated_features: List[str] = []
          self.bundle = bundle or get_custom_bundle()
upgini/utils/target_utils.py CHANGED
@@ -6,8 +6,10 @@ import pandas as pd
  from pandas.api.types import is_numeric_dtype

  from upgini.errors import ValidationError
- from upgini.metadata import ModelTaskType
- from upgini.resource_bundle import bundle
+ from upgini.metadata import SYSTEM_RECORD_ID, ModelTaskType
+ from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
+ from upgini.sampler.random_under_sampler import RandomUnderSampler
+ from upgini.utils.warning_counter import WarningCounter


  def correct_string_target(y: Union[pd.Series, np.ndarray]) -> Union[pd.Series, np.ndarray]:
@@ -72,3 +74,110 @@ def is_int_encoding(unique_values):
      return set(unique_values) == set(range(len(unique_values))) or set(unique_values) == set(
          range(1, len(unique_values) + 1)
      )
+
+
+ def balance_undersample(
+     df: pd.DataFrame,
+     target_column: str,
+     task_type: ModelTaskType,
+     random_state: int,
+     imbalance_threshold: int = 0.2,
+     min_sample_threshold: int = 5000,
+     binary_bootstrap_loops: int = 5,
+     multiclass_bootstrap_loops: int = 2,
+     logger: Optional[logging.Logger] = None,
+     bundle: Optional[ResourceBundle] = None,
+     warning_counter: Optional[WarningCounter] = None,
+ ) -> pd.DataFrame:
+     if logger is None:
+         logger = logging.getLogger("muted_logger")
+         logger.setLevel("FATAL")
+     bundle = bundle or get_custom_bundle()
+     if SYSTEM_RECORD_ID not in df.columns:
+         raise Exception("System record id must be presented for undersampling")
+
+     count = len(df)
+     target = df[target_column].copy()
+     target_classes_count = target.nunique()
+
+     vc = target.value_counts()
+     max_class_value = vc.index[0]
+     min_class_value = vc.index[len(vc) - 1]
+     max_class_count = vc[max_class_value]
+     min_class_count = vc[min_class_value]
+
+     min_class_percent = imbalance_threshold / target_classes_count
+     min_class_threshold = min_class_percent * count
+
+     resampled_data = df
+     df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
+     if task_type == ModelTaskType.MULTICLASS:
+         # Sort classes by rows count and find 25% quantile class
+         classes = vc.index
+         quantile25_idx = int(0.75 * len(classes)) - 1
+         quantile25_class = classes[quantile25_idx]
+         quantile25_class_cnt = vc[quantile25_class]
+
+         if max_class_count > (quantile25_class_cnt * multiclass_bootstrap_loops):
+             msg = bundle.get("imbalance_multiclass").format(quantile25_class, quantile25_class_cnt)
+             logger.warning(msg)
+             print(msg)
+             if warning_counter:
+                 warning_counter.increment()
+
+             # 25% and lower classes will stay as is. Higher classes will be downsampled
+             sample_strategy = dict()
+             for class_idx in range(quantile25_idx):
+                 # compare class count with count_of_quantile25_class * 2
+                 class_value = classes[class_idx]
+                 class_count = vc[class_value]
+                 sample_strategy[class_value] = min(class_count, quantile25_class_cnt * multiclass_bootstrap_loops)
+             sampler = RandomUnderSampler(
+                 sampling_strategy=sample_strategy, random_state=random_state
+             )
+             X = df[SYSTEM_RECORD_ID]
+             X = X.to_frame(SYSTEM_RECORD_ID)
+             new_x, _ = sampler.fit_resample(X, target)  # type: ignore
+
+             resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
+     elif len(df) > min_sample_threshold and min_class_count < min_sample_threshold / 2:
+         msg = bundle.get("dataset_rarest_class_less_threshold").format(
+             min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
+         )
+         logger.warning(msg)
+         print(msg)
+         if warning_counter:
+             warning_counter.increment()
+
+         # fill up to min_sample_threshold by majority class
+         minority_class = df[df[target_column] == min_class_value]
+         majority_class = df[df[target_column] != min_class_value]
+         sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
+         sampled_majority_class = majority_class.sample(
+             n=sample_size, random_state=random_state
+         )
+         resampled_data = df[
+             (df[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
+             | (df[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
+         ]
+
+     elif max_class_count > min_class_count * binary_bootstrap_loops:
+         msg = bundle.get("dataset_rarest_class_less_threshold").format(
+             min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
+         )
+         logger.warning(msg)
+         print(msg)
+         if warning_counter:
+             warning_counter.increment()
+
+         sampler = RandomUnderSampler(
+             sampling_strategy={max_class_value: binary_bootstrap_loops * min_class_count}, random_state=random_state
+         )
+         X = df[SYSTEM_RECORD_ID]
+         X = X.to_frame(SYSTEM_RECORD_ID)
+         new_x, _ = sampler.fit_resample(X, target)  # type: ignore
+
+         resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
+
+     logger.info(f"Shape after rebalance resampling: {resampled_data}")
+     return resampled_data
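
Editor's note: a minimal usage sketch of the new balance_undersample helper. The DataFrame below is synthetic; the only requirement taken from the diff is that a SYSTEM_RECORD_ID column must be present.

    import pandas as pd
    from upgini.metadata import SYSTEM_RECORD_ID, ModelTaskType
    from upgini.utils.target_utils import balance_undersample

    # Toy imbalanced binary target: 950 negatives vs. 50 positives.
    df = pd.DataFrame({SYSTEM_RECORD_ID: range(1000), "target": [0] * 950 + [1] * 50})

    balanced = balance_undersample(
        df=df,
        target_column="target",
        task_type=ModelTaskType.BINARY,
        random_state=42,
    )
    # With the default binary_bootstrap_loops=5, the majority class is capped at
    # 5 * min_class_count = 250 rows, so `balanced` keeps roughly 300 of the 1000 rows.
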
upgini-1.1.261a3250.post2.dist-info/METADATA → upgini-1.1.262a3250.post4.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: upgini
- Version: 1.1.261a3250.post2
+ Version: 1.1.262a3250.post4
  Summary: Intelligent data search & enrichment for Machine Learning
  Home-page: https://upgini.com/
  Author: Upgini Developers
upgini-1.1.261a3250.post2.dist-info/RECORD → upgini-1.1.262a3250.post4.dist-info/RECORD CHANGED
@@ -1,12 +1,12 @@
  upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
  upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
- upgini/dataset.py,sha256=ywBwf93d0IH39ZGfmNDlAwe1ILQtt1WzJ87WfIOMI2g,48149
+ upgini/dataset.py,sha256=4LfrUwxhd__ZVqZkjPVxbC4SW3YLsk1sMMqnYPUaVpw,45529
  upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
- upgini/features_enricher.py,sha256=fFSLW6aAzVq5YYaVcl-xbjSd3qYt8dW9hYAIestylSk,172118
+ upgini/features_enricher.py,sha256=WbwnLvPVqn4m995b6jSamWkXyRVy18fnG7faBeuJbWI,172132
  upgini/http.py,sha256=zaO86LBBLmkieGbgYifk29eVoPCxXimZQ8YkQtKcM0I,42244
  upgini/metadata.py,sha256=fwVxtkR6Mn4iRoOqV6BfMJvJrx65I3YwZUMbZjhPyOI,9673
  upgini/metrics.py,sha256=3VvSZW1cCOIPHImXuqcnWzD3fWcpPzVa9k8eulLbUmY,27426
- upgini/search_task.py,sha256=5n4qGJmtu48s0-FHAtF3L5qVLMd1JVW3FJlM8dFbh-s,17063
+ upgini/search_task.py,sha256=tmJ17WUxv3J5NWrYUJB_NKdZ792Ifz8Z8UnDXeQnpss,17077
  upgini/spinner.py,sha256=Dm1dQ5F_z_Ua2odLxZX7OypcOX9tSx_vE5MGaKtUmfw,1118
  upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
  upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
@@ -14,14 +14,14 @@ upgini/ads_management/ads_manager.py,sha256=fP4Yqx3h2Snw5X335TbXEwFoupq1RYsE7y0P
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  upgini/autofe/all_operands.py,sha256=KWAdcYv6cToc6NZPcCmz6P3N8Nwjp8UqojKuz-f2BZY,1589
  upgini/autofe/binary.py,sha256=f8LQqZi9zyaMUAv-jASMmWNA_vT05ncYCjZq0qx3USs,3972
- upgini/autofe/date.py,sha256=tZFwxkRlxlleBso3NwfhGrI0YjABlfL7LP5w_Vlv_jU,1450
- upgini/autofe/feature.py,sha256=xeqTq35-BX4KCt0xAkk3UZAGzV5VyjorV5AdNdA5yLs,11851
+ upgini/autofe/date.py,sha256=AC7Gabc7x2n4-_EmO1Q-7ncfCI_5-kPMQ3r3vFgQ1g4,1788
+ upgini/autofe/feature.py,sha256=2FQRGtIumNz60hFAjfLReaY18SI7HxzYZOoC5avzSjQ,11847
  upgini/autofe/groupby.py,sha256=iXRfOmOc84ooSzRhsh9GmmG7rTafX0-ekXko8s9Qs68,3089
- upgini/autofe/operand.py,sha256=Rhy7Ky3we-I1Su1--dS4xdsO3K8neV4rqM_Q4xYE4ug,2779
- upgini/autofe/unary.py,sha256=gyMkrx9bfa3o19zS-4JaRlScHrfeZGBsYe7d_6ePT-0,2853
- upgini/autofe/vector.py,sha256=Qk7VmdwURNwVw7fIMEspWEo7HTiyUWCYIqu3hcWQQio,507
+ upgini/autofe/operand.py,sha256=dhtToPDGWtP_0u_RjayUpezJJZAgq_TzNbPH0bI9OXI,2805
+ upgini/autofe/unary.py,sha256=YRTzQLttbDdOnkogWBPnBexpu7uHWSLSFAxSCu3iFdY,3145
+ upgini/autofe/vector.py,sha256=5qhI_bdwaWM1l7fgCkx1tMt9R9gxWzoYCl-7WO4KiOs,604
  upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- upgini/data_source/data_source_publisher.py,sha256=yCMyYwFTfv0e7h-kAdtiQCF42J1DbqmJ1Wi0xt_ZzeM,15578
+ upgini/data_source/data_source_publisher.py,sha256=QASEDhJ9SxJKcWxoN2vUPxrM_HTlwKQOPa92L7EQneA,15962
  upgini/mdc/__init__.py,sha256=ETDh3JKbrDdPMOECiYLAa8lvKYe68mv4IY6fZa9FimA,1126
  upgini/mdc/context.py,sha256=Sl1S_InKlzzRxYqwJ2k24lawJdCKWgGJ-RIRfvzWJrk,1468
  upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -40,7 +40,7 @@ upgini/utils/blocked_time_series.py,sha256=dMz5ewk3PsoeOrc3lDzInCVPS9u_2XQkV0W6P
  upgini/utils/country_utils.py,sha256=1KXhLSNqkNYVL3on8-zK0Arc_SspUH7AMZvGZICysOU,6462
  upgini/utils/custom_loss_utils.py,sha256=DBslpjWGPt7xTeypt78baR59012SYphbPsO_YLKdilo,3972
  upgini/utils/cv_utils.py,sha256=Tn01RJvpZGZh0PUQUimlBkV-AXwe7s6yjCNFtw352Uc,3525
- upgini/utils/datetime_utils.py,sha256=5wvEz9DWL_RS4EST5FFIidfD36MSL-wij4P9AAJpMl0,8822
+ upgini/utils/datetime_utils.py,sha256=ol5Bgh98wU6KBY9z4QskNO0ja-L7HJL70HmTAjl7iRU,8836
  upgini/utils/deduplicate_utils.py,sha256=ckJrpU8Ruc_vcwIPTopbUjyJuNiseLHNAbQlLfhUCxo,5888
  upgini/utils/display_utils.py,sha256=LKoSwjrE0xgS5_cqVhc2og2CQ1UCZ1nTI2VKboIhoQA,10858
  upgini/utils/email_utils.py,sha256=3CvHXTSzlgLyGsQOXfRYVfFhfPy6OXG4uXOBWRaLfHg,3479
@@ -52,11 +52,11 @@ upgini/utils/phone_utils.py,sha256=JNSkF8G6mgsN8Czy11pamaJdsY6rBINEMpi7jbVt_RA,4
  upgini/utils/postal_code_utils.py,sha256=_8CR9tBqsPptQsmMUvnrCAmBaMIQSWH3JfJ4ly3x_zs,409
  upgini/utils/progress_bar.py,sha256=iNXyqT3vKCeHpfiG5HHwr7Lk2cTtKViM93Fl8iZnjGc,1564
  upgini/utils/sklearn_ext.py,sha256=fvuTWJ5AnT3ED9KSaQu_yIgW2JR19hFlaGDoVP3k60g,44027
- upgini/utils/target_utils.py,sha256=DH812qcZ7Pvf9WVVb33fbwQjb1W9h1hXRNCCiG7Y6tI,2563
+ upgini/utils/target_utils.py,sha256=WVhhxpQVvnhsDV7ctlds51VFg7hz59S_MFUSoRZFszw,7204
  upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
- upgini-1.1.261a3250.post2.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
- upgini-1.1.261a3250.post2.dist-info/METADATA,sha256=51NlABKzSIZ6kYyKaLkHvB9Sl-vzfVCgi_zCyzsCGQU,48167
- upgini-1.1.261a3250.post2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
- upgini-1.1.261a3250.post2.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
- upgini-1.1.261a3250.post2.dist-info/RECORD,,
+ upgini-1.1.262a3250.post4.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+ upgini-1.1.262a3250.post4.dist-info/METADATA,sha256=XfUGTmbya5IYq0uJYXwhUGxBy9DAnrQyWvNsyiZl6gM,48167
+ upgini-1.1.262a3250.post4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+ upgini-1.1.262a3250.post4.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
+ upgini-1.1.262a3250.post4.dist-info/RECORD,,