upgini 1.2.14a3616.dev3__py3-none-any.whl → 1.2.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.14a3616.dev3"
1
+ __version__ = "1.2.15"
upgini/dataset.py CHANGED
@@ -53,7 +53,8 @@ class Dataset: # (pd.DataFrame):
53
53
  FIT_SAMPLE_THRESHOLD = 200_000
54
54
  FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
55
55
  FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
56
- MIN_SAMPLE_THRESHOLD = 5_000
56
+ BINARY_MIN_SAMPLE_THRESHOLD = 5_000
57
+ MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
57
58
  IMBALANCE_THESHOLD = 0.6
58
59
  BINARY_BOOTSTRAP_LOOPS = 5
59
60
  MULTICLASS_BOOTSTRAP_LOOPS = 2
@@ -225,7 +226,7 @@ class Dataset: # (pd.DataFrame):
225
226
  train_segment = self.data
226
227
 
227
228
  if self.task_type == ModelTaskType.MULTICLASS or (
228
- self.task_type == ModelTaskType.BINARY and len(train_segment) > self.MIN_SAMPLE_THRESHOLD
229
+ self.task_type == ModelTaskType.BINARY and len(train_segment) > self.BINARY_MIN_SAMPLE_THRESHOLD
229
230
  ):
230
231
  count = len(train_segment)
231
232
  target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
@@ -253,6 +254,7 @@ class Dataset: # (pd.DataFrame):
253
254
  min_class_percent = self.IMBALANCE_THESHOLD / target_classes_count
254
255
  min_class_threshold = min_class_percent * count
255
256
 
257
+ # If min class count is less than 30% for binary or (60 / classes_count)% for multiclass
256
258
  if min_class_count < min_class_threshold:
257
259
  self.imbalanced = True
258
260
  self.data = balance_undersample(
@@ -260,7 +262,8 @@ class Dataset: # (pd.DataFrame):
260
262
  target_column=target_column,
261
263
  task_type=self.task_type,
262
264
  random_state=self.random_state,
263
- imbalance_threshold=self.IMBALANCE_THESHOLD,
265
+ binary_min_sample_threshold=self.BINARY_MIN_SAMPLE_THRESHOLD,
266
+ multiclass_min_sample_threshold=self.MULTICLASS_MIN_SAMPLE_THRESHOLD,
264
267
  binary_bootstrap_loops=self.BINARY_BOOTSTRAP_LOOPS,
265
268
  multiclass_bootstrap_loops=self.MULTICLASS_BOOTSTRAP_LOOPS,
266
269
  logger=self.logger,
@@ -1577,8 +1577,8 @@ class FeaturesEnricher(TransformerMixin):
1577
1577
  df = generator.generate(df)
1578
1578
  generated_features.extend(generator.generated_features)
1579
1579
 
1580
- normalizer = Normalizer(search_keys, generated_features, self.bundle, self.logger, self.warning_counter)
1581
- df = normalizer.normalize(df)
1580
+ normalizer = Normalizer(self.bundle, self.logger, self.warning_counter)
1581
+ df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
1582
1582
  columns_renaming = normalizer.columns_renaming
1583
1583
 
1584
1584
  df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
@@ -2017,10 +2017,8 @@ class FeaturesEnricher(TransformerMixin):
2017
2017
  df = generator.generate(df)
2018
2018
  generated_features.extend(generator.generated_features)
2019
2019
 
2020
- normalizer = Normalizer(
2021
- search_keys, generated_features, self.bundle, self.logger, self.warning_counter, silent_mode
2022
- )
2023
- df = normalizer.normalize(df)
2020
+ normalizer = Normalizer(self.bundle, self.logger, self.warning_counter, silent_mode)
2021
+ df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
2024
2022
  columns_renaming = normalizer.columns_renaming
2025
2023
 
2026
2024
  # Don't pass all features in backend on transform
@@ -2449,16 +2447,13 @@ class FeaturesEnricher(TransformerMixin):
2449
2447
  if is_numeric_dtype(df[self.TARGET_NAME]) and has_date:
2450
2448
  self._validate_PSI(df.sort_values(by=maybe_date_column))
2451
2449
 
2452
- normalizer = Normalizer(
2453
- self.fit_search_keys, self.fit_generated_features, self.bundle, self.logger, self.warning_counter
2450
+ normalizer = Normalizer(self.bundle, self.logger, self.warning_counter)
2451
+ df, self.fit_search_keys, self.fit_generated_features = normalizer.normalize(
2452
+ df, self.fit_search_keys, self.fit_generated_features
2454
2453
  )
2455
- df = normalizer.normalize(df)
2456
- columns_renaming = normalizer.columns_renaming
2457
- self.fit_columns_renaming = columns_renaming
2454
+ self.fit_columns_renaming = normalizer.columns_renaming
2458
2455
 
2459
- self.__adjust_cv(
2460
- df, normalizer.search_keys, self.model_task_type
2461
- )
2456
+ self.__adjust_cv(df)
2462
2457
 
2463
2458
  df = remove_fintech_duplicates(
2464
2459
  df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
@@ -2472,7 +2467,7 @@ class FeaturesEnricher(TransformerMixin):
2472
2467
  self.df_with_original_index = df.copy()
2473
2468
  # TODO check maybe need to drop _time column from df_with_original_index
2474
2469
 
2475
- df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys, columns_renaming)
2470
+ df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys, self.fit_columns_renaming)
2476
2471
 
2477
2472
  # Convert EMAIL to HEM after unnesting to do it only with one column
2478
2473
  email_column = self._get_email_column(self.fit_search_keys)
@@ -2482,7 +2477,7 @@ class FeaturesEnricher(TransformerMixin):
2482
2477
  email_column,
2483
2478
  hem_column,
2484
2479
  self.fit_search_keys,
2485
- columns_renaming,
2480
+ self.fit_columns_renaming,
2486
2481
  list(unnest_search_keys.keys()),
2487
2482
  self.logger,
2488
2483
  )
@@ -2493,7 +2488,7 @@ class FeaturesEnricher(TransformerMixin):
2493
2488
  converter = IpSearchKeyConverter(
2494
2489
  ip_column,
2495
2490
  self.fit_search_keys,
2496
- columns_renaming,
2491
+ self.fit_columns_renaming,
2497
2492
  list(unnest_search_keys.keys()),
2498
2493
  self.bundle,
2499
2494
  self.logger,
@@ -2524,7 +2519,7 @@ class FeaturesEnricher(TransformerMixin):
2524
2519
  features_columns = [c for c in df.columns if c not in non_feature_columns]
2525
2520
 
2526
2521
  features_to_drop = FeaturesValidator(self.logger).validate(
2527
- df, features_columns, self.generate_features, self.warning_counter, columns_renaming
2522
+ df, features_columns, self.generate_features, self.warning_counter, self.fit_columns_renaming
2528
2523
  )
2529
2524
  self.fit_dropped_features.update(features_to_drop)
2530
2525
  df = df.drop(columns=features_to_drop)
@@ -2565,7 +2560,7 @@ class FeaturesEnricher(TransformerMixin):
2565
2560
  rest_client=self.rest_client,
2566
2561
  logger=self.logger,
2567
2562
  )
2568
- dataset.columns_renaming = columns_renaming
2563
+ dataset.columns_renaming = self.fit_columns_renaming
2569
2564
 
2570
2565
  self.passed_features = [
2571
2566
  column for column, meaning_type in meaning_types.items() if meaning_type == FileColumnMeaningType.FEATURE
@@ -2712,22 +2707,22 @@ class FeaturesEnricher(TransformerMixin):
2712
2707
  if not self.warning_counter.has_warnings():
2713
2708
  self.__display_support_link(self.bundle.get("all_ok_community_invite"))
2714
2709
 
2715
- def __adjust_cv(self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], model_task_type: ModelTaskType):
2716
- date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2710
+ def __adjust_cv(self, df: pd.DataFrame):
2711
+ date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2717
2712
  # Check Multivariate time series
2718
2713
  if (
2719
2714
  self.cv is None
2720
2715
  and date_column
2721
- and model_task_type == ModelTaskType.REGRESSION
2722
- and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(search_keys.keys())) == 0
2723
- and is_blocked_time_series(df, date_column, list(search_keys.keys()) + [TARGET])
2716
+ and self.model_task_type == ModelTaskType.REGRESSION
2717
+ and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys())) == 0
2718
+ and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
2724
2719
  ):
2725
2720
  msg = self.bundle.get("multivariate_timeseries_detected")
2726
2721
  self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
2727
- elif self.cv is None and model_task_type != ModelTaskType.REGRESSION:
2722
+ elif self.cv is None and self.model_task_type != ModelTaskType.REGRESSION:
2728
2723
  msg = self.bundle.get("group_k_fold_in_classification")
2729
2724
  self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
2730
- group_columns = self._get_group_columns(df, search_keys)
2725
+ group_columns = self._get_group_columns(df, self.fit_search_keys)
2731
2726
  self.runtime_parameters.properties["cv_params.group_columns"] = ",".join(group_columns)
2732
2727
  self.runtime_parameters.properties["cv_params.shuffle_kfold"] = "True"
2733
2728
 
@@ -1,6 +1,6 @@
1
1
  import hashlib
2
2
  from logging import Logger, getLogger
3
- from typing import Dict, List
3
+ from typing import Dict, List, Tuple
4
4
 
5
5
  import numpy as np
6
6
  import pandas as pd
@@ -35,22 +35,25 @@ class Normalizer:
35
35
 
36
36
  def __init__(
37
37
  self,
38
- search_keys: Dict[str, SearchKey],
39
- generated_features: List[str],
40
38
  bundle: ResourceBundle = None,
41
39
  logger: Logger = None,
42
40
  warnings_counter: WarningCounter = None,
43
41
  silent_mode=False,
44
42
  ):
45
- self.search_keys = search_keys
46
- self.generated_features = generated_features
47
43
  self.bundle = bundle or get_custom_bundle()
48
44
  self.logger = logger or getLogger()
49
45
  self.warnings_counter = warnings_counter or WarningCounter()
50
46
  self.silent_mode = silent_mode
51
47
  self.columns_renaming = {}
48
+ self.search_keys = {}
49
+ self.generated_features = []
50
+
51
+ def normalize(
52
+ self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], generated_features: List[str]
53
+ ) -> Tuple[pd.DataFrame, Dict[str, SearchKey], List[str]]:
54
+ self.search_keys = search_keys.copy()
55
+ self.generated_features = generated_features.copy()
52
56
 
53
- def normalize(self, df: pd.DataFrame) -> pd.DataFrame:
54
57
  df = df.copy()
55
58
  df = self._rename_columns(df)
56
59
 
@@ -68,21 +71,25 @@ class Normalizer:
68
71
 
69
72
  df = self.__convert_features_types(df)
70
73
 
71
- return df
74
+ return df, self.search_keys, self.generated_features
72
75
 
73
76
  def _rename_columns(self, df: pd.DataFrame):
74
77
  # logger.info("Replace restricted symbols in column names")
75
78
  new_columns = []
76
79
  dup_counter = 0
77
80
  for column in df.columns:
78
- if column in [
79
- TARGET,
80
- EVAL_SET_INDEX,
81
- SYSTEM_RECORD_ID,
82
- ENTITY_SYSTEM_RECORD_ID,
83
- SEARCH_KEY_UNNEST,
84
- DateTimeSearchKeyConverter.DATETIME_COL,
85
- ] + self.generated_features:
81
+ if (
82
+ column
83
+ in [
84
+ TARGET,
85
+ EVAL_SET_INDEX,
86
+ SYSTEM_RECORD_ID,
87
+ ENTITY_SYSTEM_RECORD_ID,
88
+ SEARCH_KEY_UNNEST,
89
+ DateTimeSearchKeyConverter.DATETIME_COL,
90
+ ]
91
+ + self.generated_features
92
+ ):
86
93
  self.columns_renaming[column] = column
87
94
  new_columns.append(column)
88
95
  continue
@@ -203,11 +203,18 @@ email_detected=Emails detected in column `{}`. It will be used as a search key\n
203
203
  email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
204
204
  phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
205
205
  phone_detected_not_registered=\nWARNING: Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
206
- target_type_detected=\nDetected task type: {}\n
206
+ target_type_detected=\nDetected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
207
+ binary_target_reason=only two unique label-values observed
208
+ non_numeric_multiclass_reason=non-numeric label values observed
209
+ few_unique_label_multiclass_reason=few unique label-values observed and can be considered as categorical
210
+ date_search_key_regression_reason=date search key is present, treating as regression
211
+ many_unique_label_regression_reason=many unique label-values or non-integer floating point values observed
212
+ limited_int_multiclass_reason=integer-like values with limited unique values observed
207
213
  # all_ok_community_invite=Chat with us in Slack community:
208
214
  all_ok_community_invite=❓ Support request
209
215
  too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
210
216
  imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
217
+ imbalanced_target=\nWARNING: Target is imbalanced and will be undersampled. Frequency of the rarest class `{}` is {}
211
218
  loss_selection_info=Using loss `{}` for feature selection
212
219
  loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
213
220
 
@@ -24,49 +24,83 @@ def define_task(
24
24
  ) -> ModelTaskType:
25
25
  if logger is None:
26
26
  logger = logging.getLogger()
27
+
28
+ # Replace inf and -inf with NaN to handle extreme values correctly
29
+ y = y.replace([np.inf, -np.inf], np.nan, inplace=False)
30
+
31
+ # Drop NaN values from the target
27
32
  target = y.dropna()
33
+
34
+ # Check if target is numeric and finite
28
35
  if is_numeric_dtype(target):
29
36
  target = target.loc[np.isfinite(target)]
30
37
  else:
38
+ # If not numeric, drop empty strings as well
31
39
  target = target.loc[target != ""]
40
+
41
+ # Raise error if there are no valid values left in the target
32
42
  if len(target) == 0:
33
43
  raise ValidationError(bundle.get("empty_target"))
44
+
45
+ # Count unique values in the target
34
46
  target_items = target.nunique()
47
+
48
+ # Raise error if all target values are the same
35
49
  if target_items == 1:
36
50
  raise ValidationError(bundle.get("dataset_constant_target"))
51
+
52
+ reason = "" # Will store the reason for selecting the task type
53
+
54
+ # Binary classification case: exactly two unique values
37
55
  if target_items == 2:
38
56
  task = ModelTaskType.BINARY
57
+ reason = bundle.get("binary_target_reason")
39
58
  else:
59
+ # Attempt to convert target to numeric
40
60
  try:
41
61
  target = pd.to_numeric(target)
42
62
  is_numeric = True
43
63
  except Exception:
44
64
  is_numeric = False
45
65
 
46
- # If any value is non numeric - multiclass
66
+ # If target cannot be converted to numeric, assume multiclass classification
47
67
  if not is_numeric:
48
68
  task = ModelTaskType.MULTICLASS
69
+ reason = bundle.get("non_numeric_multiclass_reason")
49
70
  else:
71
+ # Multiclass classification: few unique values and integer encoding
50
72
  if target.nunique() <= 50 and is_int_encoding(target.unique()):
51
73
  task = ModelTaskType.MULTICLASS
74
+ reason = bundle.get("few_unique_label_multiclass_reason")
75
+ # Regression case: if there is date, assume regression
52
76
  elif has_date:
53
77
  task = ModelTaskType.REGRESSION
78
+ reason = bundle.get("date_search_key_regression_reason")
54
79
  else:
80
+ # Remove zero values and recalculate unique ratio
55
81
  non_zero_target = target[target != 0]
56
82
  target_items = non_zero_target.nunique()
57
83
  target_ratio = target_items / len(non_zero_target)
84
+
85
+ # Use target_ratio to determine whether to classify as regression or multiclass
58
86
  if (
59
- (target.dtype.kind == "f" and np.any(target != target.astype(int))) # any non integer
87
+ (target.dtype.kind == "f" and np.any(target != target.astype(int))) # Non-integer float values
60
88
  or target_items > 50
61
- or target_ratio > 0.2
89
+ or target_ratio > 0.2 # If non-zero values have high ratio of uniqueness
62
90
  ):
63
91
  task = ModelTaskType.REGRESSION
92
+ reason = bundle.get("many_unique_label_regression_reason")
64
93
  else:
65
94
  task = ModelTaskType.MULTICLASS
95
+ reason = bundle.get("limited_int_multiclass_reason")
66
96
 
67
- logger.info(f"Detected task type: {task}")
97
+ # Log or print the reason for the selected task type
98
+ logger.info(f"Detected task type: {task} (Reason: {reason})")
99
+
100
+ # Print task type and reason if silent mode is off
68
101
  if not silent:
69
- print(bundle.get("target_type_detected").format(task))
102
+ print(bundle.get("target_type_detected").format(task, reason))
103
+
70
104
  return task
71
105
 
72
106
 
@@ -81,8 +115,8 @@ def balance_undersample(
81
115
  target_column: str,
82
116
  task_type: ModelTaskType,
83
117
  random_state: int,
84
- imbalance_threshold: int = 0.2,
85
- min_sample_threshold: int = 5000,
118
+ binary_min_sample_threshold: int = 5000,
119
+ multiclass_min_sample_threshold: int = 25000,
86
120
  binary_bootstrap_loops: int = 5,
87
121
  multiclass_bootstrap_loops: int = 2,
88
122
  logger: Optional[logging.Logger] = None,
@@ -96,52 +130,60 @@ def balance_undersample(
96
130
  if SYSTEM_RECORD_ID not in df.columns:
97
131
  raise Exception("System record id must be presented for undersampling")
98
132
 
99
- count = len(df)
133
+ # count = len(df)
100
134
  target = df[target_column].copy()
101
- target_classes_count = target.nunique()
135
+ # target_classes_count = target.nunique()
102
136
 
103
137
  vc = target.value_counts()
104
138
  max_class_value = vc.index[0]
105
139
  min_class_value = vc.index[len(vc) - 1]
106
140
  max_class_count = vc[max_class_value]
107
141
  min_class_count = vc[min_class_value]
142
+ num_classes = len(vc)
108
143
 
109
- min_class_percent = imbalance_threshold / target_classes_count
110
- min_class_threshold = int(min_class_percent * count)
144
+ # min_class_percent = imbalance_threshold / target_classes_count
145
+ # min_class_threshold = int(min_class_percent * count)
111
146
 
112
147
  resampled_data = df
113
148
  df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
114
149
  if task_type == ModelTaskType.MULTICLASS:
115
- # Sort classes by rows count and find 25% quantile class
116
- classes = vc.index
117
- quantile25_idx = int(0.75 * len(classes)) - 1
118
- quantile25_class = classes[quantile25_idx]
119
- quantile25_class_cnt = vc[quantile25_class]
120
-
121
- if max_class_count > (quantile25_class_cnt * multiclass_bootstrap_loops):
122
- msg = bundle.get("imbalance_multiclass").format(quantile25_class, quantile25_class_cnt)
150
+ if len(df) > multiclass_min_sample_threshold and max_class_count > (
151
+ min_class_count * multiclass_bootstrap_loops
152
+ ):
153
+
154
+ # msg = bundle.get("imbalance_multiclass").format(min_class_value, min_class_count)
155
+ msg = bundle.get("imbalanced_target").format(min_class_value, min_class_count)
123
156
  logger.warning(msg)
124
157
  print(msg)
125
158
  if warning_counter:
126
159
  warning_counter.increment()
127
160
 
128
- # 25% and lower classes will stay as is. Higher classes will be downsampled
129
161
  sample_strategy = dict()
130
- for class_idx in range(quantile25_idx):
131
- # compare class count with count_of_quantile25_class * 2
132
- class_value = classes[class_idx]
162
+ for class_value in vc.index:
163
+ if class_value == min_class_value:
164
+ continue
133
165
  class_count = vc[class_value]
134
- sample_strategy[class_value] = min(class_count, quantile25_class_cnt * multiclass_bootstrap_loops)
166
+ sample_size = min(
167
+ class_count,
168
+ multiclass_bootstrap_loops
169
+ * (
170
+ min_class_count
171
+ + max((multiclass_min_sample_threshold - num_classes * min_class_count) / (num_classes - 1), 0)
172
+ ),
173
+ )
174
+ sample_strategy[class_value] = int(sample_size)
175
+ logger.info(f"Rebalance sample strategy: {sample_strategy}. Min class count: {min_class_count}")
135
176
  sampler = RandomUnderSampler(sampling_strategy=sample_strategy, random_state=random_state)
136
177
  X = df[SYSTEM_RECORD_ID]
137
178
  X = X.to_frame(SYSTEM_RECORD_ID)
138
179
  new_x, _ = sampler.fit_resample(X, target) # type: ignore
139
180
 
140
181
  resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
141
- elif len(df) > min_sample_threshold and min_class_count < min_sample_threshold / 2:
142
- msg = bundle.get("dataset_rarest_class_less_threshold").format(
143
- min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
144
- )
182
+ elif len(df) > binary_min_sample_threshold:
183
+ # msg = bundle.get("dataset_rarest_class_less_threshold").format(
184
+ # min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
185
+ # )
186
+ msg = bundle.get("imbalanced_target").format(min_class_value, min_class_count)
145
187
  logger.warning(msg)
146
188
  print(msg)
147
189
  if warning_counter:
@@ -150,30 +192,38 @@ def balance_undersample(
150
192
  # fill up to min_sample_threshold by majority class
151
193
  minority_class = df[df[target_column] == min_class_value]
152
194
  majority_class = df[df[target_column] != min_class_value]
153
- sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
195
+ # sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
196
+ sample_size = min(
197
+ max_class_count,
198
+ binary_bootstrap_loops * (min_class_count + max(binary_min_sample_threshold - 2 * min_class_count, 0)),
199
+ )
200
+ logger.info(
201
+ f"Min class count: {min_class_count}. Max class count: {max_class_count}."
202
+ f" Rebalance sample size: {sample_size}"
203
+ )
154
204
  sampled_majority_class = majority_class.sample(n=sample_size, random_state=random_state)
155
205
  resampled_data = df[
156
206
  (df[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
157
207
  | (df[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
158
208
  ]
159
209
 
160
- elif max_class_count > min_class_count * binary_bootstrap_loops:
161
- msg = bundle.get("dataset_rarest_class_less_threshold").format(
162
- min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
163
- )
164
- logger.warning(msg)
165
- print(msg)
166
- if warning_counter:
167
- warning_counter.increment()
168
-
169
- sampler = RandomUnderSampler(
170
- sampling_strategy={max_class_value: binary_bootstrap_loops * min_class_count}, random_state=random_state
171
- )
172
- X = df[SYSTEM_RECORD_ID]
173
- X = X.to_frame(SYSTEM_RECORD_ID)
174
- new_x, _ = sampler.fit_resample(X, target) # type: ignore
175
-
176
- resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
210
+ # elif max_class_count > min_class_count * binary_bootstrap_loops:
211
+ # msg = bundle.get("dataset_rarest_class_less_threshold").format(
212
+ # min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
213
+ # )
214
+ # logger.warning(msg)
215
+ # print(msg)
216
+ # if warning_counter:
217
+ # warning_counter.increment()
218
+
219
+ # sampler = RandomUnderSampler(
220
+ # sampling_strategy={max_class_value: binary_bootstrap_loops * min_class_count}, random_state=random_state
221
+ # )
222
+ # X = df[SYSTEM_RECORD_ID]
223
+ # X = X.to_frame(SYSTEM_RECORD_ID)
224
+ # new_x, _ = sampler.fit_resample(X, target) # type: ignore
225
+
226
+ # resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
177
227
 
178
228
  logger.info(f"Shape after rebalance resampling: {resampled_data}")
179
229
  return resampled_data
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.14a3616.dev3
3
+ Version: 1.2.15
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,9 +1,9 @@
1
- upgini/__about__.py,sha256=r3lE_a_du_MY_PJ07BeAX4zN5ZZJoiV-YXe1uJzNwTo,33
1
+ upgini/__about__.py,sha256=Q6rDLuL8XHKQggYBtRCtxzpPQJgFYWn4x0gcVlH7H4g,23
2
2
  upgini/__init__.py,sha256=M64LwQTBa-5Jz24Zm2h8rWwlKQQ1J8nP7gGgIciS0WU,589
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
- upgini/dataset.py,sha256=olZ-OHSfBNoBSCo7R5t7uCLukI2nO7afpx_A-HCiJLk,31067
4
+ upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=tGdWQdCgilWlG-sDebIBxQ_OMpnOqg8mTzxCj7Xp-yo,188320
6
+ upgini/features_enricher.py,sha256=vRC7g6n6XQxSrvzXk6NJjP0ZytDQhWR4sTAo4Hp7gmA,188319
7
7
  upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
8
8
  upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
9
  upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
@@ -27,10 +27,10 @@ upgini/data_source/data_source_publisher.py,sha256=X-8aGtVgzGmxyXkMVBoBLIGDMb4lY
27
27
  upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
28
28
  upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
29
29
  upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
- upgini/normalizer/normalize_utils.py,sha256=bHRPWCNrUvt2R9qMX6dZFCJ0i8ENVCQ2Rw3dHH9IJEg,7447
30
+ upgini/normalizer/normalize_utils.py,sha256=Lv75lq7M46z9cAIutwkdKZtPZkWblgoRzToAJ1BwY8A,7709
31
31
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
32
32
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
33
- upgini/resource_bundle/strings.properties,sha256=9kvmcUrsSFUCrzOiN0Ozf-lQ2H8Igz5gATUPoHMOaU4,26456
33
+ upgini/resource_bundle/strings.properties,sha256=eqJP6bGu12zFuQJqMY03QbMhppcdwIfL2bsJWaqmuZ4,27221
34
34
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
35
35
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
36
  upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -54,10 +54,10 @@ upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,1
54
54
  upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
55
55
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
56
56
  upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
57
- upgini/utils/target_utils.py,sha256=BVtDmrmFMKerSUWaNOIEdzsYHIFiODdpnWbE50QDPDc,7864
57
+ upgini/utils/target_utils.py,sha256=qHzZRmICFbLNCrmVqGkaBcjm91L2ERRZMppci36acV4,10085
58
58
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
59
59
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
60
- upgini-1.2.14a3616.dev3.dist-info/METADATA,sha256=t3fSIzaoSthUHfOhJmHqz45r_3UpZhF2ur9cFekdcA8,48587
61
- upgini-1.2.14a3616.dev3.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
62
- upgini-1.2.14a3616.dev3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
- upgini-1.2.14a3616.dev3.dist-info/RECORD,,
60
+ upgini-1.2.15.dist-info/METADATA,sha256=Hua2FUNftyzzpi9eR090MFJ-5F8S_KS_5SrZhwOUgco,48577
61
+ upgini-1.2.15.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
+ upgini-1.2.15.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
+ upgini-1.2.15.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.24.2
2
+ Generator: hatchling 1.25.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any