upgini 1.2.14a1__tar.gz → 1.2.14a2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. {upgini-1.2.14a1 → upgini-1.2.14a2}/PKG-INFO +1 -1
  2. upgini-1.2.14a2/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/dataset.py +5 -3
  4. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/resource_bundle/strings.properties +1 -0
  5. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/target_utils.py +31 -24
  6. upgini-1.2.14a1/src/upgini/__about__.py +0 -1
  7. {upgini-1.2.14a1 → upgini-1.2.14a2}/.gitignore +0 -0
  8. {upgini-1.2.14a1 → upgini-1.2.14a2}/LICENSE +0 -0
  9. {upgini-1.2.14a1 → upgini-1.2.14a2}/README.md +0 -0
  10. {upgini-1.2.14a1 → upgini-1.2.14a2}/pyproject.toml +0 -0
  11. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/__init__.py +0 -0
  12. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/ads.py +0 -0
  13. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/ads_management/__init__.py +0 -0
  14. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/ads_management/ads_manager.py +0 -0
  15. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/autofe/__init__.py +0 -0
  16. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/autofe/all_operands.py +0 -0
  17. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/autofe/binary.py +0 -0
  18. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/autofe/date.py +0 -0
  19. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/autofe/feature.py +0 -0
  20. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/autofe/groupby.py +0 -0
  21. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/autofe/operand.py +0 -0
  22. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/autofe/unary.py +0 -0
  23. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/autofe/vector.py +0 -0
  24. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/data_source/__init__.py +0 -0
  25. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/data_source/data_source_publisher.py +0 -0
  26. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/errors.py +0 -0
  27. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/features_enricher.py +0 -0
  28. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/http.py +0 -0
  29. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/lazy_import.py +0 -0
  30. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/mdc/__init__.py +0 -0
  31. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/mdc/context.py +0 -0
  32. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/metadata.py +0 -0
  33. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/metrics.py +0 -0
  34. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/normalizer/__init__.py +0 -0
  35. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/normalizer/normalize_utils.py +0 -0
  36. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/resource_bundle/__init__.py +0 -0
  37. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/resource_bundle/exceptions.py +0 -0
  38. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  39. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/sampler/__init__.py +0 -0
  40. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/sampler/base.py +0 -0
  41. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/sampler/random_under_sampler.py +0 -0
  42. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/sampler/utils.py +0 -0
  43. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/search_task.py +0 -0
  44. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/spinner.py +0 -0
  45. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/__init__.py +0 -0
  46. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/base_search_key_detector.py +0 -0
  47. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/blocked_time_series.py +0 -0
  48. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/country_utils.py +0 -0
  49. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/custom_loss_utils.py +0 -0
  50. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/cv_utils.py +0 -0
  51. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/datetime_utils.py +0 -0
  52. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/deduplicate_utils.py +0 -0
  53. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/display_utils.py +0 -0
  54. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/email_utils.py +0 -0
  55. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/fallback_progress_bar.py +0 -0
  56. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/features_validator.py +0 -0
  57. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/format.py +0 -0
  58. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/ip_utils.py +0 -0
  59. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/phone_utils.py +0 -0
  60. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/postal_code_utils.py +0 -0
  61. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/progress_bar.py +0 -0
  62. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/sklearn_ext.py +0 -0
  63. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/track_info.py +0 -0
  64. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/utils/warning_counter.py +0 -0
  65. {upgini-1.2.14a1 → upgini-1.2.14a2}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.14a1
3
+ Version: 1.2.14a2
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.14a2"
@@ -53,7 +53,8 @@ class Dataset: # (pd.DataFrame):
53
53
  FIT_SAMPLE_THRESHOLD = 200_000
54
54
  FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
55
55
  FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
56
- MIN_SAMPLE_THRESHOLD = 5_000
56
+ BINARY_MIN_SAMPLE_THRESHOLD = 5_000
57
+ MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
57
58
  IMBALANCE_THESHOLD = 0.6
58
59
  BINARY_BOOTSTRAP_LOOPS = 5
59
60
  MULTICLASS_BOOTSTRAP_LOOPS = 2
@@ -225,7 +226,7 @@ class Dataset: # (pd.DataFrame):
225
226
  train_segment = self.data
226
227
 
227
228
  if self.task_type == ModelTaskType.MULTICLASS or (
228
- self.task_type == ModelTaskType.BINARY and len(train_segment) > self.MIN_SAMPLE_THRESHOLD
229
+ self.task_type == ModelTaskType.BINARY and len(train_segment) > self.BINARY_MIN_SAMPLE_THRESHOLD
229
230
  ):
230
231
  count = len(train_segment)
231
232
  target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
@@ -261,7 +262,8 @@ class Dataset: # (pd.DataFrame):
261
262
  target_column=target_column,
262
263
  task_type=self.task_type,
263
264
  random_state=self.random_state,
264
- imbalance_threshold=self.IMBALANCE_THESHOLD,
265
+ binary_min_sample_threshold=self.BINARY_MIN_SAMPLE_THRESHOLD,
266
+ multiclass_min_sample_threshold=self.MULTICLASS_MIN_SAMPLE_THRESHOLD,
265
267
  binary_bootstrap_loops=self.BINARY_BOOTSTRAP_LOOPS,
266
268
  multiclass_bootstrap_loops=self.MULTICLASS_BOOTSTRAP_LOOPS,
267
269
  logger=self.logger,
@@ -208,6 +208,7 @@ target_type_detected=\nDetected task type: {}\n
208
208
  all_ok_community_invite=❓ Support request
209
209
  too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
210
210
  imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
211
+ imbalanced_target=\nWARNING: Target is imbalanced and will be undersampled. Frequency of the rarest class `{}` is {}
211
212
  loss_selection_info=Using loss `{}` for feature selection
212
213
  loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
213
214
 
@@ -81,8 +81,8 @@ def balance_undersample(
81
81
  target_column: str,
82
82
  task_type: ModelTaskType,
83
83
  random_state: int,
84
- imbalance_threshold: int = 0.2,
85
- min_sample_threshold: int = 5000,
84
+ binary_min_sample_threshold: int = 5000,
85
+ multiclass_min_sample_threshold: int = 25000,
86
86
  binary_bootstrap_loops: int = 5,
87
87
  multiclass_bootstrap_loops: int = 2,
88
88
  logger: Optional[logging.Logger] = None,
@@ -96,52 +96,59 @@ def balance_undersample(
96
96
  if SYSTEM_RECORD_ID not in df.columns:
97
97
  raise Exception("System record id must be presented for undersampling")
98
98
 
99
- count = len(df)
99
+ # count = len(df)
100
100
  target = df[target_column].copy()
101
- target_classes_count = target.nunique()
101
+ # target_classes_count = target.nunique()
102
102
 
103
103
  vc = target.value_counts()
104
104
  max_class_value = vc.index[0]
105
105
  min_class_value = vc.index[len(vc) - 1]
106
106
  max_class_count = vc[max_class_value]
107
107
  min_class_count = vc[min_class_value]
108
+ num_classes = len(vc)
108
109
 
109
- min_class_percent = imbalance_threshold / target_classes_count
110
- min_class_threshold = int(min_class_percent * count)
110
+ # min_class_percent = imbalance_threshold / target_classes_count
111
+ # min_class_threshold = int(min_class_percent * count)
111
112
 
112
113
  resampled_data = df
113
114
  df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
114
115
  if task_type == ModelTaskType.MULTICLASS:
115
- # Sort classes by rows count and find 25% quantile class
116
- classes = vc.index
117
- quantile25_idx = int(0.75 * len(classes)) - 1
118
- quantile25_class = classes[quantile25_idx]
119
- quantile25_class_cnt = vc[quantile25_class]
120
-
121
- if max_class_count > (quantile25_class_cnt * multiclass_bootstrap_loops):
122
- msg = bundle.get("imbalance_multiclass").format(quantile25_class, quantile25_class_cnt)
116
+ if len(df) > multiclass_min_sample_threshold and max_class_count > (
117
+ min_class_count * multiclass_bootstrap_loops
118
+ ):
119
+
120
+ # msg = bundle.get("imbalance_multiclass").format(min_class_value, min_class_count)
121
+ msg = bundle.get("imbalanced_target").format(min_class_value, min_class_count)
123
122
  logger.warning(msg)
124
123
  print(msg)
125
124
  if warning_counter:
126
125
  warning_counter.increment()
127
126
 
128
- # 25% and lower classes will stay as is. Higher classes will be downsampled
129
127
  sample_strategy = dict()
130
- for class_idx in range(quantile25_idx):
131
- # compare class count with count_of_quantile25_class * 2
132
- class_value = classes[class_idx]
128
+ for class_value in vc.index:
129
+ if class_value == min_class_value:
130
+ continue
133
131
  class_count = vc[class_value]
134
- sample_strategy[class_value] = min(class_count, quantile25_class_cnt * multiclass_bootstrap_loops)
132
+ sample_size = min(
133
+ class_count,
134
+ multiclass_bootstrap_loops
135
+ * (
136
+ min_class_count
137
+ + max((multiclass_min_sample_threshold - num_classes * min_class_count) / (num_classes - 1), 0)
138
+ ),
139
+ )
140
+ sample_strategy[class_value] = int(sample_size)
135
141
  sampler = RandomUnderSampler(sampling_strategy=sample_strategy, random_state=random_state)
136
142
  X = df[SYSTEM_RECORD_ID]
137
143
  X = X.to_frame(SYSTEM_RECORD_ID)
138
144
  new_x, _ = sampler.fit_resample(X, target) # type: ignore
139
145
 
140
146
  resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
141
- elif len(df) > min_sample_threshold and min_class_count < min_sample_threshold / 2:
142
- msg = bundle.get("dataset_rarest_class_less_threshold").format(
143
- min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
144
- )
147
+ elif len(df) > binary_min_sample_threshold:
148
+ # msg = bundle.get("dataset_rarest_class_less_threshold").format(
149
+ # min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
150
+ # )
151
+ msg = bundle.get("imbalanced_target").format(min_class_value, min_class_count)
145
152
  logger.warning(msg)
146
153
  print(msg)
147
154
  if warning_counter:
@@ -153,7 +160,7 @@ def balance_undersample(
153
160
  # sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
154
161
  sample_size = min(
155
162
  max_class_count,
156
- binary_bootstrap_loops * (min_class_count + max(min_sample_threshold - 2 * min_class_count, 0)),
163
+ binary_bootstrap_loops * (min_class_count + max(binary_min_sample_threshold - 2 * min_class_count, 0)),
157
164
  )
158
165
  sampled_majority_class = majority_class.sample(n=sample_size, random_state=random_state)
159
166
  resampled_data = df[
@@ -1 +0,0 @@
1
- __version__ = "1.2.14a1"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes