upgini 1.2.14a4__py3-none-any.whl → 1.2.14a3616.dev2__py3-none-any.whl

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.14a4"
1
+ __version__ = "1.2.14a3616.dev2"
upgini/dataset.py CHANGED
@@ -53,8 +53,7 @@ class Dataset: # (pd.DataFrame):
53
53
  FIT_SAMPLE_THRESHOLD = 200_000
54
54
  FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
55
55
  FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
56
- BINARY_MIN_SAMPLE_THRESHOLD = 5_000
57
- MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
56
+ MIN_SAMPLE_THRESHOLD = 5_000
58
57
  IMBALANCE_THESHOLD = 0.6
59
58
  BINARY_BOOTSTRAP_LOOPS = 5
60
59
  MULTICLASS_BOOTSTRAP_LOOPS = 2
@@ -226,7 +225,7 @@ class Dataset: # (pd.DataFrame):
226
225
  train_segment = self.data
227
226
 
228
227
  if self.task_type == ModelTaskType.MULTICLASS or (
229
- self.task_type == ModelTaskType.BINARY and len(train_segment) > self.BINARY_MIN_SAMPLE_THRESHOLD
228
+ self.task_type == ModelTaskType.BINARY and len(train_segment) > self.MIN_SAMPLE_THRESHOLD
230
229
  ):
231
230
  count = len(train_segment)
232
231
  target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
@@ -254,7 +253,6 @@ class Dataset: # (pd.DataFrame):
254
253
  min_class_percent = self.IMBALANCE_THESHOLD / target_classes_count
255
254
  min_class_threshold = min_class_percent * count
256
255
 
257
- # If min class count less than 30% for binary or (60 / classes_count)% for multiclass
258
256
  if min_class_count < min_class_threshold:
259
257
  self.imbalanced = True
260
258
  self.data = balance_undersample(
@@ -262,8 +260,7 @@ class Dataset: # (pd.DataFrame):
262
260
  target_column=target_column,
263
261
  task_type=self.task_type,
264
262
  random_state=self.random_state,
265
- binary_min_sample_threshold=self.BINARY_MIN_SAMPLE_THRESHOLD,
266
- multiclass_min_sample_threshold=self.MULTICLASS_MIN_SAMPLE_THRESHOLD,
263
+ imbalance_threshold=self.IMBALANCE_THESHOLD,
267
264
  binary_bootstrap_loops=self.BINARY_BOOTSTRAP_LOOPS,
268
265
  multiclass_bootstrap_loops=self.MULTICLASS_BOOTSTRAP_LOOPS,
269
266
  logger=self.logger,
@@ -2721,13 +2721,12 @@ class FeaturesEnricher(TransformerMixin):
2721
2721
  ):
2722
2722
  msg = self.bundle.get("multivariate_timeseries_detected")
2723
2723
  self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
2724
- elif (
2725
- self.cv is None
2726
- and model_task_type != ModelTaskType.REGRESSION
2727
- and self._get_group_columns(df, self.fit_search_keys)
2728
- ):
2724
+ elif self.cv is None and model_task_type != ModelTaskType.REGRESSION:
2729
2725
  msg = self.bundle.get("group_k_fold_in_classification")
2730
2726
  self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
2727
+ group_columns = self._get_group_columns(df, self.fit_search_keys)
2728
+ self.runtime_parameters.properties["cv_params.group_columns"] = ",".join(group_columns)
2729
+ self.runtime_parameters.properties["cv_params.shuffle_kfold"] = "True"
2731
2730
 
2732
2731
  def __override_cv(self, cv: CVType, msg: str, print_warning: bool = True):
2733
2732
  if print_warning:
@@ -203,12 +203,11 @@ email_detected=Emails detected in column `{}`. It will be used as a search key\n
203
203
  email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
204
204
  phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
205
205
  phone_detected_not_registered=\nWARNING: Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
206
- target_type_detected=\nDetected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
206
+ target_type_detected=\nDetected task type: {}\n
207
207
  # all_ok_community_invite=Chat with us in Slack community:
208
208
  all_ok_community_invite=❓ Support request
209
209
  too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
210
210
  imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
211
- imbalanced_target=\nWARNING: Target is imbalanced and will be undersampled. Frequency of the rarest class `{}` is {}
212
211
  loss_selection_info=Using loss `{}` for feature selection
213
212
  loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
214
213
 
@@ -24,87 +24,49 @@ def define_task(
24
24
  ) -> ModelTaskType:
25
25
  if logger is None:
26
26
  logger = logging.getLogger()
27
-
28
- # Replace inf and -inf with NaN to handle extreme values correctly
29
- y = y.replace([np.inf, -np.inf], np.nan, inplace=False)
30
-
31
- # Drop NaN values from the target
32
27
  target = y.dropna()
33
-
34
- # Check if target is numeric and finite
35
28
  if is_numeric_dtype(target):
36
29
  target = target.loc[np.isfinite(target)]
37
30
  else:
38
- # If not numeric, drop empty strings as well
39
31
  target = target.loc[target != ""]
40
-
41
- # Raise error if there are no valid values left in the target
42
32
  if len(target) == 0:
43
33
  raise ValidationError(bundle.get("empty_target"))
44
-
45
- # Count unique values in the target
46
34
  target_items = target.nunique()
47
-
48
- # Raise error if all target values are the same
49
35
  if target_items == 1:
50
36
  raise ValidationError(bundle.get("dataset_constant_target"))
51
-
52
- reason = "" # Will store the reason for selecting the task type
53
-
54
- # Binary classification case: exactly two unique values
55
37
  if target_items == 2:
56
38
  task = ModelTaskType.BINARY
57
- reason = "only two unique label-values observed"
58
39
  else:
59
- # Attempt to convert target to numeric
60
40
  try:
61
41
  target = pd.to_numeric(target)
62
42
  is_numeric = True
63
43
  except Exception:
64
44
  is_numeric = False
65
45
 
66
- # If target cannot be converted to numeric, assume multiclass classification
46
+ # If any value is non numeric - multiclass
67
47
  if not is_numeric:
68
48
  task = ModelTaskType.MULTICLASS
69
- reason = "non-numeric label values observed"
70
49
  else:
71
- # Calculate the ratio of unique values to total number of values
72
- unique_ratio = target.nunique() / float(len(target))
73
-
74
- # Multiclass classification: few unique values and integer encoding
75
50
  if target.nunique() <= 50 and is_int_encoding(target.unique()):
76
51
  task = ModelTaskType.MULTICLASS
77
- reason = "few unique label-values observed and can be considered as categorical"
78
- # Regression case: if there are date features, assume regression
79
52
  elif has_date:
80
53
  task = ModelTaskType.REGRESSION
81
- reason = "date features are present, treating as regression"
82
54
  else:
83
- # Remove zero values and recalculate unique ratio
84
55
  non_zero_target = target[target != 0]
85
56
  target_items = non_zero_target.nunique()
86
57
  target_ratio = target_items / len(non_zero_target)
87
-
88
- # Use unique_ratio to determine whether to classify as regression or multiclass
89
58
  if (
90
- unique_ratio > 0.1 # Use threshold to differentiate between regression and classification
91
- or (target.dtype.kind == "f" and np.any(target != target.astype(int))) # Non-integer float values
59
+ (target.dtype.kind == "f" and np.any(target != target.astype(int))) # any non integer
92
60
  or target_items > 50
93
- or target_ratio > 0.2 # If non-zero values have high ratio of uniqueness
61
+ or target_ratio > 0.2
94
62
  ):
95
63
  task = ModelTaskType.REGRESSION
96
- reason = "many unique label-values or non-integer floating point values observed"
97
64
  else:
98
65
  task = ModelTaskType.MULTICLASS
99
- reason = "integer-like values with limited unique values observed"
100
-
101
- # Log or print the reason for the selected task type
102
- logger.info(f"Detected task type: {task} (Reason: {reason})")
103
66
 
104
- # Print task type and reason if silent mode is off
67
+ logger.info(f"Detected task type: {task}")
105
68
  if not silent:
106
- print(bundle.get("target_type_detected").format(task, reason))
107
-
69
+ print(bundle.get("target_type_detected").format(task))
108
70
  return task
109
71
 
110
72
 
@@ -119,8 +81,8 @@ def balance_undersample(
119
81
  target_column: str,
120
82
  task_type: ModelTaskType,
121
83
  random_state: int,
122
- binary_min_sample_threshold: int = 5000,
123
- multiclass_min_sample_threshold: int = 25000,
84
+ imbalance_threshold: int = 0.2,
85
+ min_sample_threshold: int = 5000,
124
86
  binary_bootstrap_loops: int = 5,
125
87
  multiclass_bootstrap_loops: int = 2,
126
88
  logger: Optional[logging.Logger] = None,
@@ -134,60 +96,52 @@ def balance_undersample(
134
96
  if SYSTEM_RECORD_ID not in df.columns:
135
97
  raise Exception("System record id must be presented for undersampling")
136
98
 
137
- # count = len(df)
99
+ count = len(df)
138
100
  target = df[target_column].copy()
139
- # target_classes_count = target.nunique()
101
+ target_classes_count = target.nunique()
140
102
 
141
103
  vc = target.value_counts()
142
104
  max_class_value = vc.index[0]
143
105
  min_class_value = vc.index[len(vc) - 1]
144
106
  max_class_count = vc[max_class_value]
145
107
  min_class_count = vc[min_class_value]
146
- num_classes = len(vc)
147
108
 
148
- # min_class_percent = imbalance_threshold / target_classes_count
149
- # min_class_threshold = int(min_class_percent * count)
109
+ min_class_percent = imbalance_threshold / target_classes_count
110
+ min_class_threshold = int(min_class_percent * count)
150
111
 
151
112
  resampled_data = df
152
113
  df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
153
114
  if task_type == ModelTaskType.MULTICLASS:
154
- if len(df) > multiclass_min_sample_threshold and max_class_count > (
155
- min_class_count * multiclass_bootstrap_loops
156
- ):
157
-
158
- # msg = bundle.get("imbalance_multiclass").format(min_class_value, min_class_count)
159
- msg = bundle.get("imbalanced_target").format(min_class_value, min_class_count)
115
+ # Sort classes by rows count and find 25% quantile class
116
+ classes = vc.index
117
+ quantile25_idx = int(0.75 * len(classes)) - 1
118
+ quantile25_class = classes[quantile25_idx]
119
+ quantile25_class_cnt = vc[quantile25_class]
120
+
121
+ if max_class_count > (quantile25_class_cnt * multiclass_bootstrap_loops):
122
+ msg = bundle.get("imbalance_multiclass").format(quantile25_class, quantile25_class_cnt)
160
123
  logger.warning(msg)
161
124
  print(msg)
162
125
  if warning_counter:
163
126
  warning_counter.increment()
164
127
 
128
+ # 25% and lower classes will stay as is. Higher classes will be downsampled
165
129
  sample_strategy = dict()
166
- for class_value in vc.index:
167
- if class_value == min_class_value:
168
- continue
130
+ for class_idx in range(quantile25_idx):
131
+ # compare class count with count_of_quantile25_class * 2
132
+ class_value = classes[class_idx]
169
133
  class_count = vc[class_value]
170
- sample_size = min(
171
- class_count,
172
- multiclass_bootstrap_loops
173
- * (
174
- min_class_count
175
- + max((multiclass_min_sample_threshold - num_classes * min_class_count) / (num_classes - 1), 0)
176
- ),
177
- )
178
- sample_strategy[class_value] = int(sample_size)
179
- logger.info(f"Rebalance sample strategy: {sample_strategy}. Min class count: {min_class_count}")
134
+ sample_strategy[class_value] = min(class_count, quantile25_class_cnt * multiclass_bootstrap_loops)
180
135
  sampler = RandomUnderSampler(sampling_strategy=sample_strategy, random_state=random_state)
181
136
  X = df[SYSTEM_RECORD_ID]
182
137
  X = X.to_frame(SYSTEM_RECORD_ID)
183
138
  new_x, _ = sampler.fit_resample(X, target) # type: ignore
184
139
 
185
140
  resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
186
- elif len(df) > binary_min_sample_threshold:
187
- # msg = bundle.get("dataset_rarest_class_less_threshold").format(
188
- # min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
189
- # )
190
- msg = bundle.get("imbalanced_target").format(min_class_value, min_class_count)
141
+ elif len(df) > min_sample_threshold and min_class_count < min_sample_threshold / 2:
142
+ msg = bundle.get("dataset_rarest_class_less_threshold").format(
143
+ min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
144
+ )
191
145
  logger.warning(msg)
192
146
  print(msg)
193
147
  if warning_counter:
@@ -196,38 +150,30 @@ def balance_undersample(
196
150
  # fill up to min_sample_threshold by majority class
197
151
  minority_class = df[df[target_column] == min_class_value]
198
152
  majority_class = df[df[target_column] != min_class_value]
199
- # sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
200
- sample_size = min(
201
- max_class_count,
202
- binary_bootstrap_loops * (min_class_count + max(binary_min_sample_threshold - 2 * min_class_count, 0)),
203
- )
204
- logger.info(
205
- f"Min class count: {min_class_count}. Max class count: {max_class_count}."
206
- f" Rebalance sample size: {sample_size}"
207
- )
153
+ sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
208
154
  sampled_majority_class = majority_class.sample(n=sample_size, random_state=random_state)
209
155
  resampled_data = df[
210
156
  (df[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
211
157
  | (df[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
212
158
  ]
213
159
 
214
- # elif max_class_count > min_class_count * binary_bootstrap_loops:
215
- # msg = bundle.get("dataset_rarest_class_less_threshold").format(
216
- # min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
217
- # )
218
- # logger.warning(msg)
219
- # print(msg)
220
- # if warning_counter:
221
- # warning_counter.increment()
222
-
223
- # sampler = RandomUnderSampler(
224
- # sampling_strategy={max_class_value: binary_bootstrap_loops * min_class_count}, random_state=random_state
225
- # )
226
- # X = df[SYSTEM_RECORD_ID]
227
- # X = X.to_frame(SYSTEM_RECORD_ID)
228
- # new_x, _ = sampler.fit_resample(X, target) # type: ignore
229
-
230
- # resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
160
+ elif max_class_count > min_class_count * binary_bootstrap_loops:
161
+ msg = bundle.get("dataset_rarest_class_less_threshold").format(
162
+ min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
163
+ )
164
+ logger.warning(msg)
165
+ print(msg)
166
+ if warning_counter:
167
+ warning_counter.increment()
168
+
169
+ sampler = RandomUnderSampler(
170
+ sampling_strategy={max_class_value: binary_bootstrap_loops * min_class_count}, random_state=random_state
171
+ )
172
+ X = df[SYSTEM_RECORD_ID]
173
+ X = X.to_frame(SYSTEM_RECORD_ID)
174
+ new_x, _ = sampler.fit_resample(X, target) # type: ignore
175
+
176
+ resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
231
177
 
232
178
  logger.info(f"Shape after rebalance resampling: {resampled_data}")
233
179
  return resampled_data
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.14a4
3
+ Version: 1.2.14a3616.dev2
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,9 +1,9 @@
1
- upgini/__about__.py,sha256=tfhdEEoOzTUSKNF9hQy8PZO57ri0xEeduAwFCwtVLCg,25
1
+ upgini/__about__.py,sha256=11a3ZzwaEFhVx7oIELanEC_YCKKOT0lyRpXdoTeUml4,33
2
2
  upgini/__init__.py,sha256=M64LwQTBa-5Jz24Zm2h8rWwlKQQ1J8nP7gGgIciS0WU,589
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
- upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
4
+ upgini/dataset.py,sha256=olZ-OHSfBNoBSCo7R5t7uCLukI2nO7afpx_A-HCiJLk,31067
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=HJJZbZScVrl6ugDBQE71m7om5-ahvMyEnAqZNw-OEJ0,188058
6
+ upgini/features_enricher.py,sha256=7hibmMuykV1FwXm9yOPBrRy3c730Q_EtHwhgb-ajQlc,188217
7
7
  upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
8
8
  upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
9
  upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
@@ -30,7 +30,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
30
30
  upgini/normalizer/normalize_utils.py,sha256=bHRPWCNrUvt2R9qMX6dZFCJ0i8ENVCQ2Rw3dHH9IJEg,7447
31
31
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
32
32
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
33
- upgini/resource_bundle/strings.properties,sha256=uo9CIQMg8VFeHlL_mY2dwOumQnr0TenJNPNOfXPWlPI,26715
33
+ upgini/resource_bundle/strings.properties,sha256=9kvmcUrsSFUCrzOiN0Ozf-lQ2H8Igz5gATUPoHMOaU4,26456
34
34
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
35
35
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
36
  upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -54,10 +54,10 @@ upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,1
54
54
  upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
55
55
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
56
56
  upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
57
- upgini/utils/target_utils.py,sha256=3itoOxwEycnIdWeTL3KjuS_NdJleL6nMRqblQLmy6Kk,10413
57
+ upgini/utils/target_utils.py,sha256=BVtDmrmFMKerSUWaNOIEdzsYHIFiODdpnWbE50QDPDc,7864
58
58
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
59
59
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
60
- upgini-1.2.14a4.dist-info/METADATA,sha256=iwhYx9Mru7TEEylcJbeOaZlnKVBcpMrnYXzNEU2M4fg,48579
61
- upgini-1.2.14a4.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
- upgini-1.2.14a4.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
- upgini-1.2.14a4.dist-info/RECORD,,
60
+ upgini-1.2.14a3616.dev2.dist-info/METADATA,sha256=C5wROJCdIAYVObZ_bslXFYwQyVRUppBI_WI7NGtVM20,48587
61
+ upgini-1.2.14a3616.dev2.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
62
+ upgini-1.2.14a3616.dev2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
+ upgini-1.2.14a3616.dev2.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.25.0
2
+ Generator: hatchling 1.24.2
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any