upgini 1.2.14a1__tar.gz → 1.2.14a3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (65)
  1. {upgini-1.2.14a1 → upgini-1.2.14a3}/PKG-INFO +1 -1
  2. upgini-1.2.14a3/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/dataset.py +5 -3
  4. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/resource_bundle/strings.properties +2 -1
  5. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/utils/target_utils.py +74 -29
  6. upgini-1.2.14a1/src/upgini/__about__.py +0 -1
  7. {upgini-1.2.14a1 → upgini-1.2.14a3}/.gitignore +0 -0
  8. {upgini-1.2.14a1 → upgini-1.2.14a3}/LICENSE +0 -0
  9. {upgini-1.2.14a1 → upgini-1.2.14a3}/README.md +0 -0
  10. {upgini-1.2.14a1 → upgini-1.2.14a3}/pyproject.toml +0 -0
  11. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/__init__.py +0 -0
  12. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/ads.py +0 -0
  13. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/ads_management/__init__.py +0 -0
  14. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/ads_management/ads_manager.py +0 -0
  15. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/autofe/__init__.py +0 -0
  16. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/autofe/all_operands.py +0 -0
  17. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/autofe/binary.py +0 -0
  18. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/autofe/date.py +0 -0
  19. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/autofe/feature.py +0 -0
  20. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/autofe/groupby.py +0 -0
  21. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/autofe/operand.py +0 -0
  22. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/autofe/unary.py +0 -0
  23. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/autofe/vector.py +0 -0
  24. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/data_source/__init__.py +0 -0
  25. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/data_source/data_source_publisher.py +0 -0
  26. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/errors.py +0 -0
  27. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/features_enricher.py +0 -0
  28. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/http.py +0 -0
  29. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/lazy_import.py +0 -0
  30. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/mdc/__init__.py +0 -0
  31. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/mdc/context.py +0 -0
  32. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/metadata.py +0 -0
  33. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/metrics.py +0 -0
  34. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/normalizer/__init__.py +0 -0
  35. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/normalizer/normalize_utils.py +0 -0
  36. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/resource_bundle/__init__.py +0 -0
  37. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/resource_bundle/exceptions.py +0 -0
  38. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  39. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/sampler/__init__.py +0 -0
  40. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/sampler/base.py +0 -0
  41. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/sampler/random_under_sampler.py +0 -0
  42. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/sampler/utils.py +0 -0
  43. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/search_task.py +0 -0
  44. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/spinner.py +0 -0
  45. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/utils/__init__.py +0 -0
  46. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/utils/base_search_key_detector.py +0 -0
  47. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/utils/blocked_time_series.py +0 -0
  48. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/utils/country_utils.py +0 -0
  49. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/utils/custom_loss_utils.py +0 -0
  50. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/utils/cv_utils.py +0 -0
  51. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/utils/datetime_utils.py +0 -0
  52. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/utils/deduplicate_utils.py +0 -0
  53. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/utils/display_utils.py +0 -0
  54. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/utils/email_utils.py +0 -0
  55. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/utils/fallback_progress_bar.py +0 -0
  56. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/utils/features_validator.py +0 -0
  57. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/utils/format.py +0 -0
  58. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/utils/ip_utils.py +0 -0
  59. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/utils/phone_utils.py +0 -0
  60. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/utils/postal_code_utils.py +0 -0
  61. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/utils/progress_bar.py +0 -0
  62. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/utils/sklearn_ext.py +0 -0
  63. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/utils/track_info.py +0 -0
  64. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/utils/warning_counter.py +0 -0
  65. {upgini-1.2.14a1 → upgini-1.2.14a3}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.14a1
3
+ Version: 1.2.14a3
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.14a3"
@@ -53,7 +53,8 @@ class Dataset: # (pd.DataFrame):
53
53
  FIT_SAMPLE_THRESHOLD = 200_000
54
54
  FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
55
55
  FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
56
- MIN_SAMPLE_THRESHOLD = 5_000
56
+ BINARY_MIN_SAMPLE_THRESHOLD = 5_000
57
+ MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
57
58
  IMBALANCE_THESHOLD = 0.6
58
59
  BINARY_BOOTSTRAP_LOOPS = 5
59
60
  MULTICLASS_BOOTSTRAP_LOOPS = 2
@@ -225,7 +226,7 @@ class Dataset: # (pd.DataFrame):
225
226
  train_segment = self.data
226
227
 
227
228
  if self.task_type == ModelTaskType.MULTICLASS or (
228
- self.task_type == ModelTaskType.BINARY and len(train_segment) > self.MIN_SAMPLE_THRESHOLD
229
+ self.task_type == ModelTaskType.BINARY and len(train_segment) > self.BINARY_MIN_SAMPLE_THRESHOLD
229
230
  ):
230
231
  count = len(train_segment)
231
232
  target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
@@ -261,7 +262,8 @@ class Dataset: # (pd.DataFrame):
261
262
  target_column=target_column,
262
263
  task_type=self.task_type,
263
264
  random_state=self.random_state,
264
- imbalance_threshold=self.IMBALANCE_THESHOLD,
265
+ binary_min_sample_threshold=self.BINARY_MIN_SAMPLE_THRESHOLD,
266
+ multiclass_min_sample_threshold=self.MULTICLASS_MIN_SAMPLE_THRESHOLD,
265
267
  binary_bootstrap_loops=self.BINARY_BOOTSTRAP_LOOPS,
266
268
  multiclass_bootstrap_loops=self.MULTICLASS_BOOTSTRAP_LOOPS,
267
269
  logger=self.logger,
@@ -203,11 +203,12 @@ email_detected=Emails detected in column `{}`. It will be used as a search key\n
203
203
  email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
204
204
  phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
205
205
  phone_detected_not_registered=\nWARNING: Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
206
- target_type_detected=\nDetected task type: {}\n
206
+ target_type_detected=\nDetected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
207
207
  # all_ok_community_invite=Chat with us in Slack community:
208
208
  all_ok_community_invite=❓ Support request
209
209
  too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
210
210
  imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
211
+ imbalanced_target=\nWARNING: Target is imbalanced and will be undersampled. Frequency of the rarest class `{}` is {}
211
212
  loss_selection_info=Using loss `{}` for feature selection
212
213
  loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
213
214
 
@@ -24,49 +24,87 @@ def define_task(
24
24
  ) -> ModelTaskType:
25
25
  if logger is None:
26
26
  logger = logging.getLogger()
27
+
28
+ # Replace inf and -inf with NaN to handle extreme values correctly
29
+ y = y.replace([np.inf, -np.inf], np.nan, inplace=False)
30
+
31
+ # Drop NaN values from the target
27
32
  target = y.dropna()
33
+
34
+ # Check if target is numeric and finite
28
35
  if is_numeric_dtype(target):
29
36
  target = target.loc[np.isfinite(target)]
30
37
  else:
38
+ # If not numeric, drop empty strings as well
31
39
  target = target.loc[target != ""]
40
+
41
+ # Raise error if there are no valid values left in the target
32
42
  if len(target) == 0:
33
43
  raise ValidationError(bundle.get("empty_target"))
44
+
45
+ # Count unique values in the target
34
46
  target_items = target.nunique()
47
+
48
+ # Raise error if all target values are the same
35
49
  if target_items == 1:
36
50
  raise ValidationError(bundle.get("dataset_constant_target"))
51
+
52
+ reason = "" # Will store the reason for selecting the task type
53
+
54
+ # Binary classification case: exactly two unique values
37
55
  if target_items == 2:
38
56
  task = ModelTaskType.BINARY
57
+ reason = "only two unique label-values observed"
39
58
  else:
59
+ # Attempt to convert target to numeric
40
60
  try:
41
61
  target = pd.to_numeric(target)
42
62
  is_numeric = True
43
63
  except Exception:
44
64
  is_numeric = False
45
65
 
46
- # If any value is non numeric - multiclass
66
+ # If target cannot be converted to numeric, assume multiclass classification
47
67
  if not is_numeric:
48
68
  task = ModelTaskType.MULTICLASS
69
+ reason = "non-numeric label values observed"
49
70
  else:
71
+ # Calculate the ratio of unique values to total number of values
72
+ unique_ratio = target.nunique() / float(len(target))
73
+
74
+ # Multiclass classification: few unique values and integer encoding
50
75
  if target.nunique() <= 50 and is_int_encoding(target.unique()):
51
76
  task = ModelTaskType.MULTICLASS
77
+ reason = "few unique label-values observed and can be considered as categorical"
78
+ # Regression case: if there are date features, assume regression
52
79
  elif has_date:
53
80
  task = ModelTaskType.REGRESSION
81
+ reason = "date features are present, treating as regression"
54
82
  else:
83
+ # Remove zero values and recalculate unique ratio
55
84
  non_zero_target = target[target != 0]
56
85
  target_items = non_zero_target.nunique()
57
86
  target_ratio = target_items / len(non_zero_target)
87
+
88
+ # Use unique_ratio to determine whether to classify as regression or multiclass
58
89
  if (
59
- (target.dtype.kind == "f" and np.any(target != target.astype(int))) # any non integer
90
+ unique_ratio > 0.1 # Use threshold to differentiate between regression and classification
91
+ or (target.dtype.kind == "f" and np.any(target != target.astype(int))) # Non-integer float values
60
92
  or target_items > 50
61
- or target_ratio > 0.2
93
+ or target_ratio > 0.2 # If non-zero values have high ratio of uniqueness
62
94
  ):
63
95
  task = ModelTaskType.REGRESSION
96
+ reason = "many unique label-values or non-integer floating point values observed"
64
97
  else:
65
98
  task = ModelTaskType.MULTICLASS
99
+ reason = "integer-like values with limited unique values observed"
100
+
101
+ # Log or print the reason for the selected task type
102
+ logger.info(f"Detected task type: {task} (Reason: {reason})")
66
103
 
67
- logger.info(f"Detected task type: {task}")
104
+ # Print task type and reason if silent mode is off
68
105
  if not silent:
69
- print(bundle.get("target_type_detected").format(task))
106
+ print(bundle.get("target_type_detected").format(task, reason))
107
+
70
108
  return task
71
109
 
72
110
 
@@ -81,8 +119,8 @@ def balance_undersample(
81
119
  target_column: str,
82
120
  task_type: ModelTaskType,
83
121
  random_state: int,
84
- imbalance_threshold: int = 0.2,
85
- min_sample_threshold: int = 5000,
122
+ binary_min_sample_threshold: int = 5000,
123
+ multiclass_min_sample_threshold: int = 25000,
86
124
  binary_bootstrap_loops: int = 5,
87
125
  multiclass_bootstrap_loops: int = 2,
88
126
  logger: Optional[logging.Logger] = None,
@@ -96,52 +134,59 @@ def balance_undersample(
96
134
  if SYSTEM_RECORD_ID not in df.columns:
97
135
  raise Exception("System record id must be presented for undersampling")
98
136
 
99
- count = len(df)
137
+ # count = len(df)
100
138
  target = df[target_column].copy()
101
- target_classes_count = target.nunique()
139
+ # target_classes_count = target.nunique()
102
140
 
103
141
  vc = target.value_counts()
104
142
  max_class_value = vc.index[0]
105
143
  min_class_value = vc.index[len(vc) - 1]
106
144
  max_class_count = vc[max_class_value]
107
145
  min_class_count = vc[min_class_value]
146
+ num_classes = len(vc)
108
147
 
109
- min_class_percent = imbalance_threshold / target_classes_count
110
- min_class_threshold = int(min_class_percent * count)
148
+ # min_class_percent = imbalance_threshold / target_classes_count
149
+ # min_class_threshold = int(min_class_percent * count)
111
150
 
112
151
  resampled_data = df
113
152
  df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
114
153
  if task_type == ModelTaskType.MULTICLASS:
115
- # Sort classes by rows count and find 25% quantile class
116
- classes = vc.index
117
- quantile25_idx = int(0.75 * len(classes)) - 1
118
- quantile25_class = classes[quantile25_idx]
119
- quantile25_class_cnt = vc[quantile25_class]
120
-
121
- if max_class_count > (quantile25_class_cnt * multiclass_bootstrap_loops):
122
- msg = bundle.get("imbalance_multiclass").format(quantile25_class, quantile25_class_cnt)
154
+ if len(df) > multiclass_min_sample_threshold and max_class_count > (
155
+ min_class_count * multiclass_bootstrap_loops
156
+ ):
157
+
158
+ # msg = bundle.get("imbalance_multiclass").format(min_class_value, min_class_count)
159
+ msg = bundle.get("imbalanced_target").format(min_class_value, min_class_count)
123
160
  logger.warning(msg)
124
161
  print(msg)
125
162
  if warning_counter:
126
163
  warning_counter.increment()
127
164
 
128
- # 25% and lower classes will stay as is. Higher classes will be downsampled
129
165
  sample_strategy = dict()
130
- for class_idx in range(quantile25_idx):
131
- # compare class count with count_of_quantile25_class * 2
132
- class_value = classes[class_idx]
166
+ for class_value in vc.index:
167
+ if class_value == min_class_value:
168
+ continue
133
169
  class_count = vc[class_value]
134
- sample_strategy[class_value] = min(class_count, quantile25_class_cnt * multiclass_bootstrap_loops)
170
+ sample_size = min(
171
+ class_count,
172
+ multiclass_bootstrap_loops
173
+ * (
174
+ min_class_count
175
+ + max((multiclass_min_sample_threshold - num_classes * min_class_count) / (num_classes - 1), 0)
176
+ ),
177
+ )
178
+ sample_strategy[class_value] = int(sample_size)
135
179
  sampler = RandomUnderSampler(sampling_strategy=sample_strategy, random_state=random_state)
136
180
  X = df[SYSTEM_RECORD_ID]
137
181
  X = X.to_frame(SYSTEM_RECORD_ID)
138
182
  new_x, _ = sampler.fit_resample(X, target) # type: ignore
139
183
 
140
184
  resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
141
- elif len(df) > min_sample_threshold and min_class_count < min_sample_threshold / 2:
142
- msg = bundle.get("dataset_rarest_class_less_threshold").format(
143
- min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
144
- )
185
+ elif len(df) > binary_min_sample_threshold:
186
+ # msg = bundle.get("dataset_rarest_class_less_threshold").format(
187
+ # min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
188
+ # )
189
+ msg = bundle.get("imbalanced_target").format(min_class_value, min_class_count)
145
190
  logger.warning(msg)
146
191
  print(msg)
147
192
  if warning_counter:
@@ -153,7 +198,7 @@ def balance_undersample(
153
198
  # sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
154
199
  sample_size = min(
155
200
  max_class_count,
156
- binary_bootstrap_loops * (min_class_count + max(min_sample_threshold - 2 * min_class_count, 0)),
201
+ binary_bootstrap_loops * (min_class_count + max(binary_min_sample_threshold - 2 * min_class_count, 0)),
157
202
  )
158
203
  sampled_majority_class = majority_class.sample(n=sample_size, random_state=random_state)
159
204
  resampled_data = df[
@@ -1 +0,0 @@
1
- __version__ = "1.2.14a1"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes